diff --git a/build.xml b/build.xml index 37fe3c289..e5ad9daf0 100644 --- a/build.xml +++ b/build.xml @@ -43,16 +43,17 @@ + + + - - + + + - - - @@ -79,6 +80,16 @@ + + + + + + + + + + @@ -160,6 +171,16 @@ + + + + + + + + + + @@ -234,6 +255,7 @@ + @@ -265,7 +287,7 @@ - + @@ -326,7 +348,7 @@ - + @@ -457,6 +479,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + @@ -490,6 +541,7 @@ + @@ -657,54 +709,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -718,20 +722,113 @@ - - - + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -741,10 +838,10 @@ - + - - + - - + + + + - - + + + + + + + + + + + + + + - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + - + - + + + - + - + + + - + - + + + - + - + + + - + + + + + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - + - - - - - + + + + + + + + + + + + - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1044,7 +1105,7 @@ - + diff --git a/ivy.xml b/ivy.xml index 3f3d1c97f..115f4062a 100644 --- a/ivy.xml +++ b/ivy.xml @@ -12,6 +12,9 @@ + + + @@ -30,6 +33,9 @@ + + + diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R new file mode 100644 index 000000000..31916361e --- /dev/null +++ b/public/R/queueJobReport.R @@ -0,0 +1,169 @@ +library(gsalib) +require("ggplot2") +require("gplots") + +# +# Standard command line switch. Can we loaded interactively for development +# or executed with RScript +# +args = commandArgs(TRUE) +onCMDLine = ! 
is.na(args[1]) +if ( onCMDLine ) { + inputFileName = args[1] + outputPDF = args[2] +} else { + #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" + inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" + #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt" + outputPDF = NA +} + +RUNTIME_UNITS = "(sec)" +ORIGINAL_UNITS_TO_SECONDS = 1/1000 + +# +# Helper function to aggregate all of the jobs in the report across all tables +# +allJobsFromReport <- function(report) { + names <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts") + sub <- lapply(report, function(table) table[,names]) + do.call("rbind", sub) +} + +# +# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job +# +plotJobsGantt <- function(gatkReport, sortOverall) { + allJobs = allJobsFromReport(gatkReport) + if ( sortOverall ) { + title = "All jobs, by analysis, by start time" + allJobs = allJobs[order(allJobs$analysisName, allJobs$startTime, decreasing=T), ] + } else { + title = "All jobs, sorted by start time" + allJobs = allJobs[order(allJobs$startTime, decreasing=T), ] + } + allJobs$index = 1:nrow(allJobs) + minTime = min(allJobs$startTime) + allJobs$relStartTime = allJobs$startTime - minTime + allJobs$relDoneTime = allJobs$doneTime - minTime + allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts) + maxRelTime = max(allJobs$relDoneTime) + p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName)) + p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm"))) + p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) + p <- p + xlim(0, maxRelTime * 1.1) + p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS)) + p <- p + ylab("Job") + p <- p + opts(title=title) + print(p) +} + +# +# Plots scheduling efficiency at job 
events +# +plotProgressByTime <- function(gatkReport) { + allJobs = allJobsFromReport(gatkReport) + nJobs = dim(allJobs)[1] + allJobs = allJobs[order(allJobs$startTime, decreasing=F),] + allJobs$index = 1:nrow(allJobs) + + minTime = min(allJobs$startTime) + allJobs$relStartTime = allJobs$startTime - minTime + allJobs$relDoneTime = allJobs$doneTime - minTime + + times = sort(c(allJobs$relStartTime, allJobs$relDoneTime)) + + countJobs <- function(p) { + s = allJobs$relStartTime + e = allJobs$relDoneTime + x = c() # I wish I knew how to make this work with apply + for ( time in times ) + x = c(x, sum(p(s, e, time))) + x + } + + pending = countJobs(function(s, e, t) s > t) + done = countJobs(function(s, e, t) e < t) + running = nJobs - pending - done + + d = data.frame(times=times, pending=pending, running=running, done=done) + + p <- ggplot(data=melt(d, id.vars=c("times")), aes(x=times, y=value, color=variable)) + p <- p + facet_grid(variable ~ ., scales="free") + p <- p + geom_line(size=2) + p <- p + xlab(paste("Time since start of first job", RUNTIME_UNITS)) + p <- p + opts(title = "Job scheduling") + print(p) +} + +# +# Creates tables for each job in this group +# +standardColumns = c("jobName", "startTime", "formattedStartTime", "analysisName", "intermediate", "exechosts", "formattedDoneTime", "doneTime", "runtime") +plotGroup <- function(groupTable) { + name = unique(groupTable$analysisName)[1] + groupAnnotations = setdiff(names(groupTable), standardColumns) + sub = groupTable[,c("jobName", groupAnnotations, "runtime")] + sub = sub[order(sub$iteration, sub$jobName, decreasing=F), ] + + # create a table showing each job and all annotations + textplot(sub, show.rownames=F) + title(paste("Job summary for", name, "full itemization"), cex=3) + + # create the table for each combination of values in the group, listing iterations in the columns + sum = cast(melt(sub, id.vars=groupAnnotations, measure.vars=c("runtime")), ... 
~ iteration, fun.aggregate=mean) + textplot(as.data.frame(sum), show.rownames=F) + title(paste("Job summary for", name, "itemizing each iteration"), cex=3) + + # histogram of job times by groupAnnotations + if ( length(groupAnnotations) == 1 && dim(sub)[1] > 1 ) { + # todo -- how do we group by annotations? + p <- ggplot(data=sub, aes(x=runtime)) + geom_histogram() + p <- p + xlab("runtime in seconds") + ylab("No. of jobs") + p <- p + opts(title=paste("Job runtime histogram for", name)) + print(p) + } + + # as above, but averaging over all iterations + groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration") + if ( dim(sub)[1] > 1 ) { + sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd)) + textplot(as.data.frame(sum), show.rownames=F) + title(paste("Job summary for", name, "averaging over all iterations"), cex=3) + } +} + +# print out some useful basic information +print("Report") +print(paste("Project :", inputFileName)) + +convertUnits <- function(gatkReportData) { + convertGroup <- function(g) { + g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS + g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS + g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS + g + } + lapply(gatkReportData, convertGroup) +} + + +# read the table +gatkReportData <- gsa.read.gatkreport(inputFileName) +gatkReportData <- convertUnits(gatkReportData) +#print(summary(gatkReportData)) + +if ( ! is.na(outputPDF) ) { + pdf(outputPDF, height=8.5, width=11) +} + +plotJobsGantt(gatkReportData, T) +plotJobsGantt(gatkReportData, F) +plotProgressByTime(gatkReportData) +for ( group in gatkReportData ) { + plotGroup(group) +} + +if ( ! 
is.na(outputPDF) ) { + dev.off() +} diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R index 9b3ef1ad1..011b5240d 100644 --- a/public/R/src/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/gsalib/R/gsa.read.gatkreport.R @@ -20,6 +20,20 @@ assign(tableName, d, envir=tableEnv); } +# Read a fixed width line of text into a list. +.gsa.splitFixedWidth <- function(line, columnStarts) { + splitStartStop <- function(x) { + x = substring(x, starts, stops); + x = gsub("^[[:space:]]+|[[:space:]]+$", "", x); + x; + } + + starts = c(1, columnStarts); + stops = c(columnStarts - 1, nchar(line)); + + sapply(line, splitStartStop)[,1]; +} + # Load all GATKReport tables from a file gsa.read.gatkreport <- function(filename) { con = file(filename, "r", blocking = TRUE); @@ -31,9 +45,10 @@ gsa.read.gatkreport <- function(filename) { tableName = NA; tableHeader = c(); tableRows = c(); + version = NA; for (line in lines) { - if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { + if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) { headerFields = unlist(strsplit(line, "[[:space:]]+")); if (!is.na(tableName)) { @@ -43,13 +58,37 @@ gsa.read.gatkreport <- function(filename) { tableName = headerFields[2]; tableHeader = c(); tableRows = c(); + + # For differences in versions see + # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java + if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.1"; + + } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.2"; + columnStarts = c(); + + } + } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) { # do nothing } else if (!is.na(tableName)) { - row = unlist(strsplit(line, "[[:space:]]+")); + + if (version == "v0.1") { + row = unlist(strsplit(line, "[[:space:]]+")); + + } else if (version 
== "v0.2") { + if (length(tableHeader) == 0) { + headerChars = unlist(strsplit(line, "")); + # Find the first position of non space characters, excluding the first character + columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1); + } + + row = .gsa.splitFixedWidth(line, columnStarts); + } if (length(tableHeader) == 0) { - tableHeader = row; + tableHeader = row; } else { tableRows = rbind(tableRows, row); } diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java index 8825c3767..6c8fe1834 100644 --- a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java +++ b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java @@ -25,7 +25,6 @@ package net.sf.picard.reference; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSourceProgressListener; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*; @@ -39,8 +38,8 @@ import org.broadinstitute.sting.utils.exceptions.UserException; * Produces fai file with same output as samtools faidx */ public class FastaSequenceIndexBuilder { - public File fastaFile; - ReferenceDataSourceProgressListener progress; // interface that provides a method for updating user on progress of reading file + final public File fastaFile; + final boolean printProgress; // keep track of location in file long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file; @@ -55,10 +54,10 @@ public class FastaSequenceIndexBuilder { public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT } Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum? 
- public FastaSequenceIndexBuilder(File fastaFile, ReferenceDataSourceProgressListener progress) { - this.progress = progress; + public FastaSequenceIndexBuilder(File fastaFile, boolean printProgress) { this.fastaFile = fastaFile; fileLength = fastaFile.length(); + this.printProgress = printProgress; } /** @@ -252,8 +251,8 @@ public class FastaSequenceIndexBuilder { if (System.currentTimeMillis() - lastTimestamp > 10000) { int percentProgress = (int) (100*bytesRead/fileLength); - if (progress != null) - progress.percentProgress(percentProgress); + if (printProgress) + System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percentProgress)); lastTimestamp = System.currentTimeMillis(); } } diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index b9e380295..7ea515591 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -31,30 +31,85 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import java.io.*; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Map; import java.util.regex.Pattern; /** - * Created by IntelliJ IDEA. 
- * User: rpoplin - * Date: Dec 1, 2009 + * Call R scripts to plot residual error versus the various covariates. + * + *

+ * After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which + * reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically + * show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities). + * In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run + * CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated + * by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual + * error is: CountCovariates, TableRecalibration, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated + * bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration. + * + *

+ * The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into + * each of the quality score estimates. It is defined as follows for N, the number of observations: + * + *

    + *
  • light blue means N < 1,000
  • + *
  • cornflower blue means 1,000 <= N < 10,000
  • + *
  • dark blue means N >= 10,000
  • + *
  • The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically + * meaningless and so aren't included in any of the numerical calculations.
  • + *
+ * + *

+ * NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options + * must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R, + * not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using + * this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball. + * For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout. + * + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration + * + *

Input

+ *

+ * The recalibration table file in CSV format that was generated by the CountCovariates walker. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar AnalyzeCovariates.jar \
+ *   -recalFile /path/to/recal.table.csv  \
+ *   -outputDir /path/to/output_dir/  \
+ *   -resources resources/  \
+ *   -ignoreQ 5
+ * 
* - * Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates. */ +@DocumentedGATKFeature( + groupName = "AnalyzeCovariates", + summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator") public class AnalyzeCovariates extends CommandLineProgram { ///////////////////////////// // Command Line Arguments ///////////////////////////// - + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ @Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false) private String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false) @@ -65,13 +120,22 @@ public class AnalyzeCovariates extends CommandLineProgram { private String PATH_TO_RESOURCES = "public/R/"; @Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false) private int IGNORE_QSCORES_LESS_THAN = 5; - @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false) + @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. 
Default value: -1 (process all read groups)", required = false) private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups + + /** + * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation + * by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later. + */ @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50") private int MAX_QUALITY_SCORE = 50; + + /** + * This argument is useful when comparing before/after plots and you want the axes to match each other. + */ @Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots") private int MAX_HISTOGRAM_VALUE = 0; - @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots") + @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting") private boolean DO_INDEL_QUALITY = false; @@ -261,13 +325,14 @@ public class AnalyzeCovariates extends CommandLineProgram { } private void callRScripts() { + RScriptExecutor.RScriptArgumentCollection argumentCollection = + new RScriptExecutor.RScriptArgumentCollection(PATH_TO_RSCRIPT, Arrays.asList(PATH_TO_RESOURCES)); + RScriptExecutor executor = new RScriptExecutor(argumentCollection, true); int numReadGroups = 0; - + // for each read group for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) { - - Process p; if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) { String readGroup = readGroupKey.toString(); @@ -276,35 +341,19 @@ public class AnalyzeCovariates extends CommandLineProgram { // for each
covariate for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { Covariate cov = requestedCovariates.get(iii); - try { - - if (DO_INDEL_QUALITY) { - p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_indelQuality.R" + " " + - OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " + - cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice - p.waitFor(); - - } else { + final String outputFilename = OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"; + if (DO_INDEL_QUALITY) { + executor.callRScripts("plot_indelQuality.R", outputFilename, + cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice + } else { if( iii == 1 ) { - // Analyze reported quality - p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " + - OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " + - IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored - p.waitFor(); - } else { // Analyze all other covariates - p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " + - OUTPUT_DIR + readGroup + "." 
+ cov.getClass().getSimpleName()+ ".dat" + " " + - cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice - p.waitFor(); - } + // Analyze reported quality + executor.callRScripts("plot_residualError_QualityScoreCovariate.R", outputFilename, + IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored + } else { // Analyze all other covariates + executor.callRScripts("plot_residualError_OtherCovariate.R", outputFilename, + cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice } - } catch (InterruptedException e) { - e.printStackTrace(); - System.exit(-1); - } catch (IOException e) { - System.out.println("Fatal Exception: Perhaps RScript jobs are being spawned too quickly? One work around is to process fewer read groups using the -numRG option."); - e.printStackTrace(); - System.exit(-1); } } } else { // at the maximum number of read groups so break out diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java new file mode 100644 index 000000000..9350e4a66 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java @@ -0,0 +1,4 @@ +/** + * Package to plot residual accuracy versus error covariates for the base quality score recalibrator. 
+ */ +package org.broadinstitute.sting.analyzecovariates; \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/commandline/Advanced.java b/public/java/src/org/broadinstitute/sting/commandline/Advanced.java new file mode 100644 index 000000000..7aeefe261 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/Advanced.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import java.lang.annotation.*; + +/** + * Indicates that a walker argument should be considered an advanced option.
+ * + * @author Mark DePristo + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE,ElementType.FIELD}) +public @interface Advanced { +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java index 9f92df6e0..8e3f753a8 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java @@ -174,7 +174,8 @@ public class ArgumentDefinitions implements Iterable { static DefinitionMatcher VerifiableDefinitionMatcher = new DefinitionMatcher() { public boolean matches( ArgumentDefinition definition, Object key ) { - return definition.validation != null; + // We can perform some sort of validation for anything that isn't a flag. + return !definition.isFlag; } }; } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java index 60ed8c899..351583c07 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java @@ -44,7 +44,7 @@ public class ArgumentMatch implements Iterable { public final String label; /** - * Maps indicies of command line arguments to values paired with that argument. + * Maps indices of command line arguments to values paired with that argument. 
*/ public final SortedMap> indices = new TreeMap>(); diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java index f48ca864a..8ec0d650a 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java @@ -151,6 +151,14 @@ public class ArgumentSource { return field.isAnnotationPresent(Hidden.class) || field.isAnnotationPresent(Deprecated.class); } + /** + * Is the given argument considered an advanced option when displaying on the command-line argument system. + * @return True if so. False otherwise. + */ + public boolean isAdvanced() { + return field.isAnnotationPresent(Advanced.class); + } + /** * Is this command-line argument dependent on some primitive argument types? * @return True if this command-line argument depends on other arguments; false otherwise. @@ -175,13 +183,17 @@ public class ArgumentSource { return typeDescriptor.createsTypeDefault(this); } + public String typeDefaultDocString() { + return typeDescriptor.typeDefaultDocString(this); + } + /** * Generates a default for the given type. * @param parsingEngine the parsing engine used to validate this argument type descriptor. * @return A default value for the given type. 
*/ public Object createTypeDefault(ParsingEngine parsingEngine) { - return typeDescriptor.createTypeDefault(parsingEngine,this,field.getType()); + return typeDescriptor.createTypeDefault(parsingEngine,this,field.getGenericType()); } /** diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 9c33e084d..5fff8f609 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.commandline; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.walkers.Multiplex; import org.broadinstitute.sting.gatk.walkers.Multiplexer; import org.broadinstitute.sting.utils.classloader.JVMUtils; @@ -33,6 +35,7 @@ import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import java.io.File; import java.lang.annotation.Annotation; import java.lang.reflect.*; import java.util.*; @@ -80,14 +83,26 @@ public abstract class ArgumentTypeDescriptor { */ public boolean createsTypeDefault(ArgumentSource source) { return false; } + /** + * Returns a documentation-friendly value for the default of a type descriptor. + * Must be overridden if createsTypeDefault return true. cannot be called otherwise + * @param source Source of the command-line argument. + * @return Friendly string of the default value, for documentation. 
If it doesn't create a default, throws + * an UnsupportedOperationException + */ + public String typeDefaultDocString(ArgumentSource source) { + throw new UnsupportedOperationException(); + } + /** * Generates a default for the given type. + * * @param parsingEngine the parsing engine used to validate this argument type descriptor. * @param source Source of the command-line argument. * @param type Type of value to create, in case the command-line argument system wants influence. * @return A default value for the given type. */ - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source,Class type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } /** * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. @@ -109,7 +124,7 @@ public abstract class ArgumentTypeDescriptor { * @return The parsed object.
*/ public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { - return parse(parsingEngine, source, source.field.getType(), matches); + return parse(parsingEngine, source, source.field.getGenericType(), matches); } /** @@ -131,18 +146,18 @@ public abstract class ArgumentTypeDescriptor { protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) { Annotation argumentAnnotation = getArgumentAnnotation(source); return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation), - source.field.getType(), - ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), - ArgumentDefinition.getShortName(argumentAnnotation), - ArgumentDefinition.getDoc(argumentAnnotation), - source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), - source.isFlag(), - source.isMultiValued(), - source.isHidden(), - getCollectionComponentType(source.field), - ArgumentDefinition.getExclusiveOf(argumentAnnotation), - ArgumentDefinition.getValidationRegex(argumentAnnotation), - getValidOptions(source) ); + source.field.getType(), + ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), + ArgumentDefinition.getShortName(argumentAnnotation), + ArgumentDefinition.getDoc(argumentAnnotation), + source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), + source.isFlag(), + source.isMultiValued(), + source.isHidden(), + makeRawTypeIfNecessary(getCollectionComponentType(source.field)), + ArgumentDefinition.getExclusiveOf(argumentAnnotation), + ArgumentDefinition.getValidationRegex(argumentAnnotation), + getValidOptions(source) ); } /** @@ -151,7 +166,7 @@ public abstract class ArgumentTypeDescriptor { * @return The parameterized component type, or String.class if the parameterized type could not be found. * @throws IllegalArgumentException If more than one parameterized type is found on the field. 
*/ - protected Class getCollectionComponentType( Field field ) { + protected Type getCollectionComponentType( Field field ) { return null; } @@ -162,7 +177,7 @@ public abstract class ArgumentTypeDescriptor { * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection. * @return The individual parsed object matching the argument match with Class type. */ - public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches ); + public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); /** * If the argument source only accepts a small set of options, populate the returned list with @@ -273,6 +288,123 @@ public abstract class ArgumentTypeDescriptor { public static boolean isArgumentHidden(Field field) { return field.isAnnotationPresent(Hidden.class); } + + public Class makeRawTypeIfNecessary(Type t) { + if ( t == null ) + return null; + else if ( t instanceof ParameterizedType ) + return (Class)((ParameterizedType) t).getRawType(); + else if ( t instanceof Class ) { + return (Class)t; + } else { + throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); + } + } +} + +/** + * Parser for RodBinding objects + */ +class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBinding class objects + * @param type The type to check. + * @return true if the provided class is a RodBinding.class + */ + @Override + public boolean supports( Class type ) { + return isRodBinding(type); + } + + public static boolean isRodBinding( Class type ) { + return RodBinding.class.isAssignableFrom(type); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { return ! 
source.isRequired(); } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + return RodBinding.makeUnbound((Class)parameterType); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "none"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + String value = getArgumentValue( defaultDefinition, matches ); + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + + try { + String name = defaultDefinition.fullName; + String tribbleType = null; + Tags tags = getArgumentTags(matches); + // must have one or two tag values here + if ( tags.getPositionalTags().size() > 2 ) { + throw new UserException.CommandLineException( + String.format("Unexpected number of positional tags for argument %s : %s. " + + "Rod bindings only suport -X:type and -X:name,type argument styles", + value, source.field.getName())); + } if ( tags.getPositionalTags().size() == 2 ) { + // -X:name,type style + name = tags.getPositionalTags().get(0); + tribbleType = tags.getPositionalTags().get(1); + } else { + // case with 0 or 1 positional tags + FeatureManager manager = new FeatureManager(); + + // -X:type style is a type when we cannot determine the type dynamically + String tag1 = tags.getPositionalTags().size() == 1 ? 
tags.getPositionalTags().get(0) : null; + if ( tag1 != null ) { + if ( manager.getByName(tag1) != null ) // this a type + tribbleType = tag1; + else + name = tag1; + } + + if ( tribbleType == null ) { + // try to determine the file type dynamically + File file = new File(value); + if ( file.canRead() && file.isFile() ) { + FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); + if ( featureDescriptor != null ) { + tribbleType = featureDescriptor.getName(); + logger.info("Dynamically determined type of " + file + " to be " + tribbleType); + } + } + + if ( tribbleType == null ) + if ( ! file.canRead() | ! file.isFile() ) { + throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); + } + } + } + + Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); + RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); + parsingEngine.addTags(result,tags); + parsingEngine.addRodBinding(result); + return result; + } catch (InvocationTargetException e) { + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s.", + value, source.field.getName())); + } catch (Exception e) { + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. 
Message: %s", + value, source.field.getName(), e.getMessage())); + } + } } /** @@ -282,9 +414,10 @@ public abstract class ArgumentTypeDescriptor { class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public boolean supports( Class type ) { - if( type.isPrimitive() ) return true; - if( type.isEnum() ) return true; - if( primitiveToWrapperMap.containsValue(type) ) return true; + if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) ) return false; + if ( type.isPrimitive() ) return true; + if ( type.isEnum() ) return true; + if ( primitiveToWrapperMap.containsValue(type) ) return true; try { type.getConstructor(String.class); @@ -298,7 +431,8 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { } @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches) { + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); if (source.isFlag()) return true; @@ -339,7 +473,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { throw e; } catch (InvocationTargetException e) { throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)", - value, source.field.getName())); + value, source.field.getName())); } catch (Exception e) { throw new DynamicClassResolutionException(String.class, e); } @@ -351,7 +485,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { return result; } - + /** * A mapping of the primitive types to their associated wrapper classes. 
Is there really no way to infer @@ -382,10 +516,10 @@ class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override @SuppressWarnings("unchecked") - public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Class type, ArgumentMatches matches) { - Class componentType; + public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + Type componentType; Object result; - Tags tags; if( Collection.class.isAssignableFrom(type) ) { @@ -399,7 +533,7 @@ class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { } componentType = getCollectionComponentType( source.field ); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); Collection collection; try { @@ -428,7 +562,7 @@ class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { } else if( type.isArray() ) { componentType = type.getComponentType(); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); // Assemble a collection of individual values used in this computation. 
Collection values = new ArrayList(); @@ -436,7 +570,7 @@ class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { for( ArgumentMatch value: match ) values.add(value); - result = Array.newInstance(componentType,values.size()); + result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size()); int i = 0; for( ArgumentMatch value: values ) { @@ -459,16 +593,16 @@ class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { * @throws IllegalArgumentException If more than one parameterized type is found on the field. */ @Override - protected Class getCollectionComponentType( Field field ) { - // If this is a parameterized collection, find the contained type. If blow up if more than one type exists. - if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length > 1 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return (Class)parameterizedType.getActualTypeArguments()[0]; - } - else - return String.class; + protected Type getCollectionComponentType( Field field ) { + // If this is a parameterized collection, find the contained type. Blow up if more than one type exists.
+ if( field.getGenericType() instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); + if( parameterizedType.getActualTypeArguments().length > 1 ) + throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); + return parameterizedType.getActualTypeArguments()[0]; + } + else + return String.class; } } @@ -510,12 +644,12 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { } @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source,Class type) { + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { if(multiplexer == null || multiplexedIds == null) throw new ReviewedStingException("No multiplexed ids available"); Map multiplexedMapping = new HashMap(); - Class componentType = getCollectionComponentType(source.field); + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); for(Object id: multiplexedIds) { @@ -527,15 +661,19 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { return multiplexedMapping; } + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "None"; + } @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches) { + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { if(multiplexedIds == null) throw new ReviewedStingException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); Map multiplexedMapping = new HashMap(); - Class componentType = getCollectionComponentType(source.field); + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); for(Object id: 
multiplexedIds) { @@ -606,7 +744,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { * @throws IllegalArgumentException If more than one parameterized type is found on the field. */ @Override - protected Class getCollectionComponentType( Field field ) { + protected Type getCollectionComponentType( Field field ) { // Multiplex arguments must resolve to maps from which the clp should extract the second type. if( field.getGenericType() instanceof ParameterizedType) { ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index aba4fc109..d88e7030e 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -43,7 +43,7 @@ import java.util.Locale; public abstract class CommandLineProgram { /** The command-line program and the arguments it returned. */ - protected ParsingEngine parser = null; + public ParsingEngine parser = null; /** the default log level */ @Argument(fullName = "logging_level", @@ -144,6 +144,11 @@ public abstract class CommandLineProgram { public static int result = -1; + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args) throws Exception { + start(clp, args, false); + } + /** * This function is called to start processing the command line, and kick * off the execute message of the program. 
@@ -153,7 +158,7 @@ public abstract class CommandLineProgram { * @throws Exception when an exception occurs */ @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args) throws Exception { + public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { try { // setup our log layout @@ -180,8 +185,9 @@ public abstract class CommandLineProgram { // - InvalidArgument in case these arguments are specified by plugins. // - MissingRequiredArgument in case the user requested help. Handle that later, once we've // determined the full complement of arguments. - parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, - ParsingEngine.ValidationType.InvalidArgument)); + if ( ! dryRun ) + parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, + ParsingEngine.ValidationType.InvalidArgument)); parser.loadArgumentsIntoObject(clp); // Initialize the logger using the loaded command line. @@ -195,36 +201,40 @@ public abstract class CommandLineProgram { if (isHelpPresent(parser)) printHelpAndExit(clp, parser); - parser.validate(); + if ( ! dryRun ) parser.validate(); } else { parser.parse(args); - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); + if ( ! dryRun ) { + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); - parser.validate(); + parser.validate(); + } parser.loadArgumentsIntoObject(clp); // Initialize the logger using the loaded command line. clp.setupLoggerLevel(layout); } - // if they specify a log location, output our data there - if (clp.toFile != null) { - FileAppender appender; - try { - appender = new FileAppender(layout, clp.toFile, false); - logger.addAppender(appender); - } catch (IOException e) { - throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + if ( ! 
dryRun ) { + // if they specify a log location, output our data there + if (clp.toFile != null) { + FileAppender appender; + try { + appender = new FileAppender(layout, clp.toFile, false); + logger.addAppender(appender); + } catch (IOException e) { + throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + } } + + // regardless of what happens next, generate the header information + HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args); + + // call the execute + CommandLineProgram.result = clp.execute(); } - - // regardless of what happens next, generate the header information - HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args); - - // call the execute - CommandLineProgram.result = clp.execute(); } catch (ArgumentException e) { clp.parser.printHelp(clp.getApplicationDetails()); diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java index 22565dbf5..f8aef0355 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Output.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java @@ -55,7 +55,7 @@ public @interface Output { * --help argument is specified. * @return Doc string associated with this command-line argument. */ - String doc() default "An output file presented to the walker. Will overwrite contents if file exists."; + String doc() default "An output file created by the walker. Will overwrite contents if file exists"; /** * Is this argument required. 
If true, the command-line argument system will diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java index 8423bb2f2..fbf8c6516 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.commandline; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.JVMUtils; @@ -41,11 +42,16 @@ import java.util.*; * A parser for Sting command-line arguments. */ public class ParsingEngine { + /** + * The loaded argument sources along with their back definitions. + */ + private Map argumentSourcesByDefinition = new HashMap(); + /** * A list of defined arguments against which command lines are matched. * Package protected for testing access. */ - ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); + public ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); /** * A list of matches from defined arguments to command-line text. @@ -59,11 +65,17 @@ public class ParsingEngine { */ private List parsingMethods = new ArrayList(); + /** + * All of the RodBinding objects we've seen while parsing + */ + private List rodBindings = new ArrayList(); + /** * Class reference to the different types of descriptors that the create method can create. * The type of set used must be ordered (but not necessarily sorted). 
*/ private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), + new RodBindingArgumentTypeDescriptor(), new CompoundArgumentTypeDescriptor(), new MultiplexArgumentTypeDescriptor()) ); @@ -80,6 +92,7 @@ public class ParsingEngine { protected static Logger logger = Logger.getLogger(ParsingEngine.class); public ParsingEngine( CommandLineProgram clp ) { + RodBinding.resetNameCounter(); parsingMethods.add( ParsingMethod.FullNameParsingMethod ); parsingMethods.add( ParsingMethod.ShortNameParsingMethod ); @@ -107,8 +120,13 @@ public class ParsingEngine { */ public void addArgumentSource( String sourceName, Class sourceClass ) { List argumentsFromSource = new ArrayList(); - for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) - argumentsFromSource.addAll( argumentSource.createArgumentDefinitions() ); + for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) { + List argumentDefinitions = argumentSource.createArgumentDefinitions(); + for(ArgumentDefinition argumentDefinition: argumentDefinitions) { + argumentSourcesByDefinition.put(argumentDefinition,argumentSource); + argumentsFromSource.add( argumentDefinition ); + } + } argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) ); } @@ -199,16 +217,25 @@ public class ParsingEngine { throw new InvalidArgumentException( invalidArguments ); } - // Find invalid argument values (arguments that fail the regexp test. + // Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression. 
if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) { Collection verifiableArguments = argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher ); Collection> invalidValues = new ArrayList>(); for( ArgumentDefinition verifiableArgument: verifiableArguments ) { ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument ); + // Check to see whether an argument value was specified. Argument values must be provided + // when the argument name is specified and the argument is not a flag type. + for(ArgumentMatch verifiableMatch: verifiableMatches) { + ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument); + if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault()) + invalidValues.add(new Pair(verifiableArgument,null)); + } + + // Ensure that the field contents meet the validation criteria specified by the regular expression. for( ArgumentMatch verifiableMatch: verifiableMatches ) { for( String value: verifiableMatch.values() ) { - if( !value.matches(verifiableArgument.validation) ) + if( verifiableArgument.validation != null && !value.matches(verifiableArgument.validation) ) invalidValues.add( new Pair(verifiableArgument, value) ); } } @@ -304,7 +331,17 @@ public class ParsingEngine { if(!tags.containsKey(key)) return new Tags(); return tags.get(key); - } + } + + /** + * Add a RodBinding type argument to this parser. Called during parsing to allow + * us to track all of the RodBindings discovered in the command line. + * @param rodBinding the rodbinding to add. Must not be added twice + */ + @Requires("rodBinding != null") + public void addRodBinding(final RodBinding rodBinding) { + rodBindings.add(rodBinding); + } /** * Notify the user that a deprecated command-line argument has been used. 
@@ -327,7 +364,7 @@ public class ParsingEngine { */ private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches ) { // Nothing to load - if( argumentMatches.size() == 0 && !(source.createsTypeDefault() && source.isRequired())) + if( argumentMatches.size() == 0 && ! source.createsTypeDefault() ) return; // Target instance into which to inject the value. @@ -344,6 +381,10 @@ public class ParsingEngine { } } + public Collection getRodBindings() { + return Collections.unmodifiableCollection(rodBindings); + } + /** * Gets a collection of the container instances of the given type stored within the given target. * @param source Argument source. @@ -390,7 +431,6 @@ public class ParsingEngine { return ArgumentTypeDescriptor.selectBest(argumentTypeDescriptors,type); } - private List extractArgumentSources(Class sourceClass, Field[] parentFields) { // now simply call into the truly general routine extract argument bindings but with a null // object so bindings aren't computed @@ -515,10 +555,14 @@ class InvalidArgumentValueException extends ArgumentException { private static String formatArguments( Collection> invalidArgumentValues ) { StringBuilder sb = new StringBuilder(); for( Pair invalidValue: invalidArgumentValues ) { - sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", - invalidValue.first.fullName, - invalidValue.second, - invalidValue.first.validation) ); + if(invalidValue.getSecond() == null) + sb.append( String.format("%nArgument '--%s' requires a value but none was provided", + invalidValue.first.fullName) ); + else + sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", + invalidValue.first.fullName, + invalidValue.second, + invalidValue.first.validation) ); } return sb.toString(); } diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java 
b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java new file mode 100644 index 000000000..e0b1154c4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broad.tribble.Feature; + +import java.util.*; + +/** + * A RodBinding representing a walker argument that gets bound to a ROD track. + * + * The RodBinding is a formal GATK argument that bridges between a walker and + * the RefMetaDataTracker to obtain data about this rod track at runtime. The RodBinding + * is explicitly typed with type of the Tribble.Feature expected to be produced by this + * argument. 
The GATK Engine takes care of initializing the binding and connecting it + * to the RMD system. + * + * It is recommended that optional RodBindings be initialized to the value returned + * by the static method makeUnbound(). + * + * Note that this class is immutable. + */ +public final class RodBinding { + protected final static String UNBOUND_VARIABLE_NAME = ""; + protected final static String UNBOUND_SOURCE = "UNBOUND"; + protected final static String UNBOUND_TRIBBLE_TYPE = ""; + + /** + * Create an unbound RodBinding of the given type. This is the correct programming + * style for an optional RodBinding + * + * At Input() + * RodBinding x = RodBinding.makeUnbound(T.class) + * + * The unbound binding is guaranteed to never match any binding. It uniquely + * returns false to isBound(). + * + * @param type the Class type produced by this unbound object + * @param <T> any class extending Tribble Feature + * @return the UNBOUND RodBinding producing objects of type T + */ + @Requires("type != null") + protected final static RodBinding makeUnbound(Class type) { + return new RodBinding(type); + } + + /** The name of this binding. Often the name of the field itself, but can be overridden on cmdline */ + final private String name; + /** where the data for this ROD is coming from. A file or special value if coming from stdin */ + final private String source; + /** the string name of the tribble type, such as vcf, bed, etc. */ + final private String tribbleType; + /** The command line tags associated with this RodBinding */ + final private Tags tags; + /** The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble */ + final private Class type; + /** True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments */ + final private boolean bound; + + /** + * The name counter. This is how we create unique names for collections of RodBindings + * on the command line.
If you provide the GATK with -X file1 and -X file2 for a + * RodBinding argument declared as a List of RodBindings, then each binding will automatically + * receive the names X and X2. + */ + final private static Map nameCounter = new HashMap(); + + /** Clears the name counter. For UnitTests; also invoked by the ParsingEngine constructor */ + final public static void resetNameCounter() { + nameCounter.clear(); + } + + @Requires("rawName != null") + @Ensures("result != null") + final private static synchronized String countedVariableName(final String rawName) { + Integer count = nameCounter.get(rawName); + if ( count == null ) { + nameCounter.put(rawName, 1); + return rawName; + } else { + nameCounter.put(rawName, count + 1); + return rawName + (count + 1); + } + } + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public RodBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + this.type = type; + this.name = countedVariableName(rawName); + this.source = source; + this.tribbleType = tribbleType; + this.tags = tags; + this.bound = true; + } + + /** + * Make an unbound RodBinding. Private: only reachable through makeUnbound(), which creates the special UNBOUND objects + * @param type class this unbound RodBinding creates + */ + @Requires({"type != null"}) + private RodBinding(Class type) { + this.type = type; + this.name = UNBOUND_VARIABLE_NAME; // special value can never be found in RefMetaDataTracker + this.source = UNBOUND_SOURCE; + this.tribbleType = UNBOUND_TRIBBLE_TYPE; + this.tags = new Tags(); + this.bound = false; + } + + + /** + * @return True for all RodBindings except the special UNBOUND binding, which is the default for optional arguments + */ + final public boolean isBound() { + return bound; + } + + /** + * @return The name of this binding.
Often the name of the field itself, but can be overridden on cmdline + */ + @Ensures({"result != null"}) + final public String getName() { + return name; + } + + /** + * @return The Java class expected for this RodBinding. Must correspond to the type emitted by Tribble + */ + @Ensures({"result != null"}) + final public Class getType() { + return type; + } + + /** + * @return where the data for this ROD is coming from. A file or special value if coming from stdin + */ + @Ensures({"result != null"}) + final public String getSource() { + return source; + } + + /** + * @return The command line tags associated with this RodBinding. Will include the tags used to + * determine the name and type of this RodBinding + */ + @Ensures({"result != null"}) + final public Tags getTags() { + return tags; + } + + /** + * @return the string name of the tribble type, such as vcf, bed, etc. + */ + @Ensures({"result != null"}) + final public String getTribbleType() { + return tribbleType; + } + + @Override + public String toString() { + return String.format("(RodBinding name=%s source=%s)", getName(), getSource()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index a080ab439..32002e093 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -25,21 +25,20 @@ package org.broadinstitute.sting.gatk; -import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor; -import org.broadinstitute.sting.commandline.CommandLineProgram; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; -import
org.broadinstitute.sting.gatk.io.stubs.SAMFileReaderArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.text.ListFileUtils; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; +import java.util.*; /** * @author aaron @@ -64,6 +63,8 @@ public abstract class CommandLineExecutable extends CommandLineProgram { */ private final Collection argumentSources = new ArrayList(); + protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + /** * this is the function that the inheriting class can expect to have called * when the command line system has initialized. @@ -81,7 +82,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram { // File lists can require a bit of additional expansion. Set these explicitly by the engine. 
engine.setSAMFileIDs(ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser)); - engine.setReferenceMetaDataFiles(ListFileUtils.unpackRODBindings(getArgumentCollection().RODBindings,getArgumentCollection().DBSNPFile,parser)); engine.setWalker(walker); walker.setToolkit(engine); @@ -96,6 +96,24 @@ public abstract class CommandLineExecutable extends CommandLineProgram { loadArgumentsIntoObject(walker); argumentSources.add(walker); + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + + // todo: remove me when the old style system is removed + if ( getArgumentCollection().RODBindings.size() > 0 ) { + logger.warn("################################################################################"); + logger.warn("################################################################################"); + logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2."); + logger.warn("Please use arguments defined by each specific walker instead."); + for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) { + logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags()); + } + logger.warn("################################################################################"); + logger.warn("################################################################################"); + System.exit(1); + } + + engine.setReferenceMetaDataFiles(rodBindings); + for (ReadFilter filter: filters) { loadArgumentsIntoObject(filter); argumentSources.add(filter); @@ -112,6 +130,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram { return 0; } + /** * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. 
* This report will be written to either STDOUT or to the run repository, depending on the options @@ -142,7 +161,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram { */ protected Collection getArgumentTypeDescriptors() { return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), - new SAMFileReaderArgumentTypeDescriptor(engine), new SAMFileWriterArgumentTypeDescriptor(engine,System.out), new OutputStreamArgumentTypeDescriptor(engine,System.out) ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index da2be74bf..b8488dc9a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -30,25 +30,27 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.walkers.Attribution; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.*; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; /** - * @author aaron - * @version 1.0 - * @date May 8, 2009 - *

- * Class CommandLineGATK - *

+ * The GATK engine itself. Manages map/reduce data access and runs walkers. + * * We run command line GATK programs using this class. It gets the command line args, parses them, and hands the * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here, * the gatk engine should deal with any data related information. */ +@DocumentedGATKFeature( + groupName = "GATK Engine", + summary = "Features and arguments for the GATK engine itself, available to all walkers.", + extraDocs = { UserException.class }) public class CommandLineGATK extends CommandLineExecutable { @Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run") private String analysisName = null; @@ -173,12 +175,12 @@ public class CommandLineGATK extends CommandLineExecutable { StringBuilder additionalHelp = new StringBuilder(); Formatter formatter = new Formatter(additionalHelp); - formatter.format("Description:%n"); + formatter.format("Available Reference Ordered Data types:%n"); + formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); + formatter.format("%n"); - WalkerManager walkerManager = engine.getWalkerManager(); - String walkerHelpText = walkerManager.getWalkerDescriptionText(walkerType); - - printDescriptorLine(formatter,WALKER_INDENT,"",WALKER_INDENT,FIELD_SEPARATOR,walkerHelpText,TextFormattingUtils.DEFAULT_LINE_WIDTH); + formatter.format("For a full description of this walker, see its GATKdocs at:%n"); + formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); return additionalHelp.toString(); } @@ -192,8 +194,6 @@ public class CommandLineGATK extends CommandLineExecutable { StringBuilder additionalHelp = new StringBuilder(); Formatter formatter = new Formatter(additionalHelp); - formatter.format("Available analyses:%n"); - // Get the list of walker names from the walker manager. 
WalkerManager walkerManager = engine.getWalkerManager(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 918bc1251..5b9ebd99b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -43,7 +43,7 @@ import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.*; @@ -370,33 +370,6 @@ public class GenomeAnalysisEngine { throw new ArgumentException("Walker does not allow a reference but one was provided."); } - /** - * Verifies that all required reference-ordered data has been supplied, and any reference-ordered data that was not - * 'allowed' is still present. - * - * @param rods Reference-ordered data to load. - */ - protected void validateSuppliedReferenceOrderedData(List rods) { - // Check to make sure that all required metadata is present. - List allRequired = WalkerManager.getRequiredMetaData(walker); - for (RMD required : allRequired) { - boolean found = false; - for (ReferenceOrderedDataSource rod : rods) { - if (rod.matchesNameAndRecordType(required.name(), required.type())) - found = true; - } - if (!found) - throw new ArgumentException(String.format("Walker requires reference metadata to be supplied named '%s' of type '%s', but this metadata was not provided. 
" + - "Please supply the specified metadata file.", required.name(), required.type().getSimpleName())); - } - - // Check to see that no forbidden rods are present. - for (ReferenceOrderedDataSource rod : rods) { - if (!WalkerManager.isAllowed(walker, rod)) - throw new ArgumentException(String.format("Walker of type %s does not allow access to metadata: %s", walker.getClass(), rod.getName())); - } - } - protected void validateSuppliedIntervals() { // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. if(!(walker instanceof ReadWalker)) { @@ -716,8 +689,6 @@ public class GenomeAnalysisEngine { validateSuppliedReads(); readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - for (ReadFilter filter : filters) filter.initialize(this); @@ -926,9 +897,6 @@ public class GenomeAnalysisEngine { GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) { RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser,validationExclusionType); - // try and make the tracks given their requests - // create of live instances of the tracks - List tracks = new ArrayList(); List dataSources = new ArrayList(); for (RMDTriplet fileDescriptor : referenceMetaDataFiles) @@ -939,7 +907,6 @@ public class GenomeAnalysisEngine { flashbackData())); // validation: check to make sure everything the walker needs is present, and that all sequence dictionaries match. - validateSuppliedReferenceOrderedData(dataSources); validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), dataSources, builder); return dataSources; @@ -994,7 +961,7 @@ public class GenomeAnalysisEngine { /** * Get the list of intervals passed to the engine. - * @return List of intervals. 
+ * @return List of intervals, or null if no intervals are in use */ public GenomeLocSortedSet getIntervals() { return this.intervals; diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index cf190835e..f053c299c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -33,9 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.DescriptionTaglet; -import org.broadinstitute.sting.utils.help.DisplayNameTaglet; -import org.broadinstitute.sting.utils.help.SummaryTaglet; +import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; @@ -82,19 +80,10 @@ public class WalkerManager extends PluginManager { * @return A suitable display name for the package. */ public String getPackageDisplayName(String packageName) { - // Try to find an override for the display name of this package. - String displayNameKey = String.format("%s.%s",packageName,DisplayNameTaglet.NAME); - String displayName; - if(helpText.containsKey(displayNameKey)) { - displayName = helpText.getString(displayNameKey); - } - else { - // If no override exists... - // ...try to compute the override from the text of the package name, while accounting for - // unpackaged walkers. - displayName = packageName.substring(packageName.lastIndexOf('.')+1); - if(displayName.trim().equals("")) displayName = ""; - } + // ...try to compute the override from the text of the package name, while accounting for + // unpackaged walkers. 
+ String displayName = packageName.substring(packageName.lastIndexOf('.')+1); + if (displayName.trim().equals("")) displayName = ""; return displayName; } @@ -104,7 +93,7 @@ public class WalkerManager extends PluginManager { * @return Package help text, or "" if none exists. */ public String getPackageSummaryText(String packageName) { - String key = String.format("%s.%s",packageName,SummaryTaglet.NAME); + String key = String.format("%s.%s",packageName, ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); if(!helpText.containsKey(key)) return ""; return helpText.getString(key); @@ -116,7 +105,7 @@ public class WalkerManager extends PluginManager { * @return Walker summary description, or "" if none exists. */ public String getWalkerSummaryText(Class walkerType) { - String walkerSummary = String.format("%s.%s",walkerType.getName(), SummaryTaglet.NAME); + String walkerSummary = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); if(!helpText.containsKey(walkerSummary)) return ""; return helpText.getString(walkerSummary); @@ -137,7 +126,7 @@ public class WalkerManager extends PluginManager { * @return Walker full description, or "" if none exists. */ public String getWalkerDescriptionText(Class walkerType) { - String walkerDescription = String.format("%s.%s",walkerType.getName(), DescriptionTaglet.NAME); + String walkerDescription = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.DESCRIPTION_TAGLET_NAME); if(!helpText.containsKey(walkerDescription)) return ""; return helpText.getString(walkerDescription); @@ -188,19 +177,7 @@ public class WalkerManager extends PluginManager { * @return The list of allowed reference meta data. 
*/ public static List getAllowsMetaData(Class walkerClass) { - Allows allowsDataSource = getWalkerAllowed(walkerClass); - if (allowsDataSource == null) - return Collections.emptyList(); - return Arrays.asList(allowsDataSource.referenceMetaData()); - } - - /** - * Get a list of RODs allowed by the walker. - * @param walker Walker to query. - * @return The list of allowed reference meta data. - */ - public static List getAllowsMetaData(Walker walker) { - return getAllowsMetaData(walker.getClass()); + return Collections.emptyList(); } /** @@ -237,24 +214,7 @@ public class WalkerManager extends PluginManager { * @return True if the walker forbids this data type. False otherwise. */ public static boolean isAllowed(Class walkerClass, ReferenceOrderedDataSource rod) { - Allows allowsDataSource = getWalkerAllowed(walkerClass); - - // Allows is less restrictive than requires. If an allows - // clause is not specified, any kind of data is allowed. - if( allowsDataSource == null ) - return true; - - // The difference between unspecified RMD and the empty set of metadata can't be detected. - // Treat an empty 'allows' as 'allow everything'. Maybe we can have a special RMD flag to account for this - // case in the future. - if( allowsDataSource.referenceMetaData().length == 0 ) - return true; - - for( RMD allowed: allowsDataSource.referenceMetaData() ) { - if( rod.matchesNameAndRecordType(allowed.name(),allowed.type()) ) - return true; - } - return false; + return true; } /** @@ -294,8 +254,7 @@ public class WalkerManager extends PluginManager { * @return The list of required reference meta data. 
*/ public static List getRequiredMetaData(Class walkerClass) { - Requires requiresDataSource = getWalkerRequirements(walkerClass); - return Arrays.asList(requiresDataSource.referenceMetaData()); + return Collections.emptyList(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java similarity index 68% rename from public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java rename to public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java index 8dace8fe4..2f4dd06e2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSourceProgressListener.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java @@ -23,8 +23,26 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.datasources.reference; +package org.broadinstitute.sting.gatk.arguments; + + +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.simpleframework.xml.*; + +/** + * @author ebanks + * @version 1.0 + */ +@Root +public class DbsnpArgumentCollection { + + /** + * A dbSNP VCF file. 
+ */ + @Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false) + public RodBinding dbsnp; -public interface ReferenceDataSourceProgressListener { - public void percentProgress(int percent); } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index ee2e85025..fd39d46b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -101,6 +101,8 @@ public class GATKArgumentCollection { @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; + @Deprecated + @Hidden @ElementList(required = false) @Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false) public ArrayList RODBindings = new ArrayList(); @@ -117,11 +119,6 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; - - @Element(required = false) - @Input(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false) - public String DBSNPFile = null; - /** * The override mechanism in the GATK, by default, populates the command-line arguments, then * the defaults from the walker annotations. 
Unfortunately, walker annotations should be trumped @@ -345,14 +342,6 @@ public class GATKArgumentCollection { return false; } } - if (other.RODBindings.size() != RODBindings.size()) { - return false; - } - for (int x = 0; x < RODBindings.size(); x++) { - if (!RODBindings.get(x).equals(other.RODBindings.get(x))) { - return false; - } - } if (!other.samFiles.equals(this.samFiles)) { return false; } @@ -380,9 +369,6 @@ public class GATKArgumentCollection { if (!other.excludeIntervals.equals(this.excludeIntervals)) { return false; } - if (!other.DBSNPFile.equals(this.DBSNPFile)) { - return false; - } if (!other.unsafe.equals(this.unsafe)) { return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java new file mode 100644 index 000000000..654770fe7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.arguments; + + +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.simpleframework.xml.Root; + +/** + * @author ebanks + * @version 1.0 + */ +@Root +public class StandardVariantContextInputArgumentCollection { + + /** + * Variants from this VCF file are used by this tool as input. + * The file must at least contain the standard VCF header lines, but + * can be empty (i.e., no variants are contained in the file). + */ + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public RodBinding variants; + +} + diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index 223659a46..d065635c8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import java.util.ArrayList; @@ -49,11 
+51,14 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { * @param loc Locus at which to track. * @return A tracker containing information about this locus. */ - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { - RefMetaDataTracker tracks = new RefMetaDataTracker(states.size()); + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { + List bindings = states.isEmpty() ? Collections.emptyList() : new ArrayList(states.size()); + for ( ReferenceOrderedDataState state: states ) - tracks.bind( state.dataSource.getName(), state.iterator.seekForward(loc) ); - return tracks; + // todo -- warning, I removed the reference to the name from states + bindings.add( state.iterator.seekForward(loc) ); + + return new RefMetaDataTracker(bindings, referenceContext); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java index 2d46a85ac..939cbfe35 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedView.java @@ -1,8 +1,9 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; public interface ReferenceOrderedView extends View { - RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ); + RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext refContext ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index 
39c632539..c38b09334 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; @@ -45,7 +46,8 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { */ private RODMergingIterator rodQueue = null; - RefMetaDataTracker tracker = null; + Collection allTracksHere; + GenomeLoc lastLoc = null; RODRecordList interval = null; @@ -94,12 +96,12 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { } rodQueue = new RODMergingIterator(iterators); - - //throw new StingException("RodLocusView currently disabled"); } - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { - return tracker; + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { + // special case the interval again -- add it into the ROD + if ( interval != null ) { allTracksHere.add(interval); } + return new RefMetaDataTracker(allTracksHere, referenceContext); } public boolean hasNext() { @@ -122,10 +124,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); - // Update the tracker here for use - Collection allTracksHere = getSpanningTracks(datum); - tracker = createTracker(allTracksHere); - + allTracksHere = getSpanningTracks(datum); GenomeLoc rodSite = datum.getLocation(); GenomeLoc site = 
genomeLocParser.createGenomeLoc( rodSite.getContig(), rodSite.getStart(), rodSite.getStart()); @@ -137,19 +136,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { return new AlignmentContext(site, new ReadBackedPileupImpl(site), skippedBases); } - private RefMetaDataTracker createTracker( Collection allTracksHere ) { - RefMetaDataTracker t = new RefMetaDataTracker(allTracksHere.size()); - for ( RODRecordList track : allTracksHere ) { - if ( ! t.hasROD(track.getName()) ) - t.bind(track.getName(), track); - } - - // special case the interval again -- add it into the ROD - if ( interval != null ) { t.bind(interval.getName(), interval); } - - return t; - } - private Collection getSpanningTracks(RODRecordList marker) { return rodQueue.allElementsLTE(marker); } @@ -197,10 +183,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { return getSkippedBases(getLocOneBeyondShard()); } - public RefMetaDataTracker getTracker() { - return tracker; - } - /** * Closes the current view. 
*/ @@ -209,6 +191,6 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { state.dataSource.close( state.iterator ); rodQueue = null; - tracker = null; + allTracksHere = null; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index 198f7d7d3..ba6321121 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -59,8 +59,8 @@ public class LowMemoryIntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - //while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) - // current = current.combine(parser,wrappedIterator.next()); + while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + current = current.combine(parser,wrappedIterator.next()); return current; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 6064806f3..572970349 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -893,6 +893,7 @@ public class SAMDataSource { * Custom representation of interval bounds. * Makes it simpler to track current position. */ + private int[] intervalContigIndices; private int[] intervalStarts; private int[] intervalEnds; @@ -917,12 +918,14 @@ public class SAMDataSource { if(foundMappedIntervals) { if(keepOnlyUnmappedReads) throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. 
Please apply this filter to only mapped or only unmapped reads"); + this.intervalContigIndices = new int[intervals.size()]; this.intervalStarts = new int[intervals.size()]; this.intervalEnds = new int[intervals.size()]; int i = 0; for(GenomeLoc interval: intervals) { - intervalStarts[i] = (int)interval.getStart(); - intervalEnds[i] = (int)interval.getStop(); + intervalContigIndices[i] = interval.getContigIndex(); + intervalStarts[i] = interval.getStart(); + intervalEnds[i] = interval.getStop(); i++; } } @@ -961,11 +964,10 @@ public class SAMDataSource { while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { if(!keepOnlyUnmappedReads) { // Mapped read filter; check against GenomeLoc-derived bounds. - if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] || - (candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) { - // This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval. + if(readEndsOnOrAfterStartingBound(candidateRead)) { + // This read ends after the current interval begins. // Promising, but this read must be checked against the ending bound. - if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) { + if(readStartsOnOrBeforeEndingBound(candidateRead)) { // Yes, this read is within both bounds. This must be our next read. nextRead = candidateRead; break; @@ -993,6 +995,37 @@ public class SAMDataSource { candidateRead = iterator.next(); } } + + /** + * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its + * end will be distorted, so rely only on the alignment start. + * @param read The read to position-check. + * @return True if the read starts after the current bounds. False otherwise. + */ + private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { + return + // Read ends on a later contig, or... 
+ read.getReferenceIndex() > intervalContigIndices[currentBound] || + // Read ends on this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or... + (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + } + + /** + * Check whether the read lies before the end of the current bound. + * @param read The read to position-check. + * @return True if the read starts on or before the end of the current bound. False otherwise. + */ + private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { + return + // Read starts on a prior contig, or... + read.getReferenceIndex() < intervalContigIndices[currentBound] || + // Read starts on this contig and the alignment start is registered on or before this end point. + (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index ef69a8e5f..c8c79bb14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -41,7 +41,7 @@ import java.io.File; * Loads reference data from fasta file * Looks for fai and dict files, and tries to create them if they don't exist */ -public class ReferenceDataSource implements ReferenceDataSourceProgressListener { +public class ReferenceDataSource { private IndexedFastaSequenceFile index; /** our log, which we want to capture anything from this class */ @@ -75,7 +75,7 @@ public class ReferenceDataSource implements
ReferenceDataSourceProgressListener // get exclusive lock if (!indexLock.exclusiveLock()) throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile); - FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this); + FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, true); FastaSequenceIndex sequenceIndex = faiBuilder.createIndex(); FastaSequenceIndexBuilder.saveAsFaiFile(sequenceIndex, indexFile); } @@ -194,13 +194,4 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener public IndexedFastaSequenceFile getReference() { return this.index; } - - /** - * Notify user of progress in creating fai file - * @param percent Percent of fasta file read as a percent - */ - public void percentProgress(int percent) { - System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percent)); - } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java index abd5929eb..9d5a54f58 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPool.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.rmd; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.FlashBackIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java index 60b68bda5..18679dd77 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java @@ -29,7 +29,7 @@ import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.utils.GenomeLoc; @@ -110,11 +110,11 @@ public class ReferenceOrderedDataSource { } public Class getType() { - return builder.getAvailableTrackNamesAndTypes().get(fileDescriptor.getType().toUpperCase()); + return builder.getFeatureManager().getByTriplet(fileDescriptor).getCodecClass(); } public Class getRecordType() { - return builder.createCodec(getType(),getName()).getFeatureType(); + return builder.getFeatureManager().getByTriplet(fileDescriptor).getFeatureClass(); } public File getFile() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java new file mode 100644 index 000000000..4541a0537 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKDocsExample.java @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and 
associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.examples; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; + +/** + * [Short one sentence description of this walker] + * + *

+ * [Functionality of this walker] + *

+ * + *

Input

+ *

+ * [Input description] + *

+ * + *

Output

+ *

+ * [Output description] + *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public class GATKDocsExample extends RodWalker { + /** + * Put detailed documentation about the argument here. No need to duplicate the summary information + * in doc annotation field, as that will be added before this text in the documentation page. + * + * Notes: + *
    + *
  • This field can contain HTML as a normal javadoc
  • + *
  • Don't include information about the default value, as gatkdocs adds this automatically
  • + *
  • Try your best to describe in detail the behavior of the argument, as ultimately confusing + * docs here will just result in user posts on the forum
  • + *
+ */ + @Argument(fullName="full", shortName="short", doc="Brief summary of argument [~ 80 characters of text]", required=false) + private boolean myWalkerArgument = false; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; } + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + public void onTraversalDone(Integer result) { } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 59fb4aa9e..3b9e35311 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -97,7 +97,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); - traversalEngine.startTimers(); ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 9466fdf75..09ab4bd44 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -44,15 +44,16 @@ public class LinearMicroScheduler extends MicroScheduler { * @param shardStrategy A strategy for sharding the data. 
*/ public Object execute(Walker walker, ShardStrategy shardStrategy) { - traversalEngine.startTimers(); walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); + boolean done = walker.isDone(); int counter = 0; - for (Shard shard : processingTracker.onlyOwned(shardStrategy, engine.getName())) { - if ( shard == null ) // we ran out of shards that aren't owned + for (Shard shard : shardStrategy ) { + if ( done || shard == null ) // we ran out of shards that aren't owned break; + traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); @@ -61,6 +62,7 @@ public class LinearMicroScheduler extends MicroScheduler { Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); accumulator.accumulate(dataProvider,result); dataProvider.close(); + if ( walker.isDone() ) break; } windowMaker.close(); } @@ -70,6 +72,8 @@ public class LinearMicroScheduler extends MicroScheduler { accumulator.accumulate(dataProvider,result); dataProvider.close(); } + + done = walker.isDone(); } Object result = accumulator.finishTraversal(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 23e5769f1..e731b9864 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -39,14 +39,10 @@ import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.threading.*; import 
javax.management.JMException; import javax.management.MBeanServer; import javax.management.ObjectName; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; import java.lang.management.ManagementFactory; import java.util.Collection; @@ -83,8 +79,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { private final MBeanServer mBeanServer; private final ObjectName mBeanName; - protected GenomeLocProcessingTracker processingTracker; - /** * MicroScheduler factory function. Create a microscheduler appropriate for reducing the * selected walker. @@ -98,11 +92,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) { - if (engine.getArguments().processingTrackerFile != null) { - if ( walker instanceof ReadWalker ) - throw new UserException.BadArgumentValue("C", String.format("Distributed GATK processing not enabled for read walkers")); - } - if (walker instanceof TreeReducible && nThreadsToUse > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); @@ -157,33 +146,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { catch (JMException ex) { throw new ReviewedStingException("Unable to register microscheduler with JMX", ex); } - - // - // create the processing tracker - // - if ( engine.getArguments().processingTrackerFile != null ) { - logger.warn("Distributed GATK is an experimental engine feature, and is likely to not work correctly or reliably."); - if ( engine.getArguments().restartProcessingTracker && engine.getArguments().processingTrackerFile.exists() ) { - engine.getArguments().processingTrackerFile.delete(); - logger.info("Deleting ProcessingTracker file " + engine.getArguments().processingTrackerFile); - } - - PrintStream statusStream = null; - if ( engine.getArguments().processingTrackerStatusFile != null ) { - try { - statusStream = new PrintStream(new FileOutputStream(engine.getArguments().processingTrackerStatusFile)); - } catch ( FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(engine.getArguments().processingTrackerStatusFile, e); - } - } - - ClosableReentrantLock lock = new SharedFileThreadSafeLock(engine.getArguments().processingTrackerFile, engine.getArguments().processTrackerID); - processingTracker = new FileBackedGenomeLocProcessingTracker(engine.getArguments().processingTrackerFile, engine.getGenomeLocParser(), lock, statusStream) ; - logger.info("Creating ProcessingTracker using shared file " + engine.getArguments().processingTrackerFile + " process.id = " + engine.getName() + " CID = " + engine.getArguments().processTrackerID); - } else { - // create a NoOp version that doesn't do anything but say "yes" - processingTracker = new NoOpGenomeLocProcessingTracker(); - } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 
6136bd68d..2b6488ada 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -57,6 +57,7 @@ public class ShardTraverser implements Callable { public Object call() { try { + traversalEngine.startTimersIfNecessary(); long startTime = System.currentTimeMillis(); Object accumulator = walker.reduceInit(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java index cd77a9e7e..4ec451567 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java @@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord; * Filter out FailsVendorQualityCheck reads. 
*/ -public class FailsVendorQualityCheckReadFilter extends ReadFilter { +public class FailsVendorQualityCheckFilter extends ReadFilter { public boolean filterOut( final SAMRecord read ) { return read.getReadFailsVendorQualityCheckFlag(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java index 75369b306..ed9c37dca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java @@ -35,7 +35,7 @@ import org.broadinstitute.sting.commandline.Argument; * @version 0.1 */ -public class MappingQualityReadFilter extends ReadFilter { +public class MappingQualityFilter extends ReadFilter { @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) public int MIN_MAPPING_QUALTY_SCORE = 10; diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java index 1afec36d1..ccdb40d31 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.QualityUtils; * @version 0.1 */ 
-public class MappingQualityUnavailableReadFilter extends ReadFilter { +public class MappingQualityUnavailableFilter extends ReadFilter { public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java index e49d4117c..57db8419c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java @@ -33,7 +33,7 @@ import net.sf.samtools.SAMRecord; * @version 0.1 */ -public class MappingQualityZeroReadFilter extends ReadFilter { +public class MappingQualityZeroFilter extends ReadFilter { public boolean filterOut(SAMRecord rec) { return (rec.getMappingQuality() == 0); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java rename to public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java index 31c2144ce..50cd30f71 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java @@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord; * Filter out duplicate reads. 
*/ -public class NotPrimaryAlignmentReadFilter extends ReadFilter { +public class NotPrimaryAlignmentFilter extends ReadFilter { public boolean filterOut( final SAMRecord read ) { return read.getNotPrimaryAlignmentFlag(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java index 30b2f828d..8e241bb2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * @version 0.1 */ public class PlatformFilter extends ReadFilter { - @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false) + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) protected String[] PLFilterNames; public boolean filterOut(SAMRecord rec) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java index 227637761..bf3ce352a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java @@ -2,10 +2,14 @@ package org.broadinstitute.sting.gatk.filters; import net.sf.picard.filter.SamRecordFilter; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; /** * A SamRecordFilter that also depends on the header. */ +@DocumentedGATKFeature( + groupName = "Read filters", + summary = "GATK Engine arguments that filter or transfer incoming SAM/BAM data files" ) public abstract class ReadFilter implements SamRecordFilter { /** * Sets the header for use by this filter. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java new file mode 100644 index 000000000..50a1384fa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; + +/** + * A read filter (transformer) that sets all reads mapping quality to a given value. + * + *

+ * If a BAM file contains erroneous or missing mapping qualities, this 'filter' will set + * the mapping quality of every read to a given value. The default is 60. + *

+ * + * + *

Input

+ *

+ * BAM file(s) + *

+ * + * + *

Output

+ *

+ * BAM file(s) with all reads' mapping qualities reassigned + *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -rf ReassignMappingQuality
+ *      -DMQ 35
+ *  
+ * + * @author carneiro + * @since 8/8/11 + */ + +public class ReassignMappingQualityFilter extends ReadFilter { + + @Argument(fullName = "default_mapping_quality", shortName = "DMQ", doc = "Default read mapping quality to assign to all reads", required = false) + public int defaultMappingQuality = 60; + + public boolean filterOut(SAMRecord rec) { + rec.setMappingQuality(defaultMappingQuality); + return false; + } +} + diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index 1da03e9c2..ebb4cbe66 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -87,8 +87,8 @@ public class VCFWriterStorage implements Storage, VCFWriter { writer.writeHeader(stub.getVCFHeader()); } - public void add(VariantContext vc, byte ref) { - writer.add(vc, ref); + public void add(VariantContext vc) { + writer.add(vc); } /** @@ -117,7 +117,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { BasicFeatureSource source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec(), false); for ( VariantContext vc : source.iterator() ) { - target.writer.add(vc, vc.getReferenceBaseForIndel()); + target.writer.add(vc); } source.close(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java index 8bc97c886..da4eb3955 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; import java.io.OutputStream; import 
java.lang.reflect.Constructor; +import java.lang.reflect.Type; /** * Insert an OutputStreamStub instead of a full-fledged concrete OutputStream implementations. @@ -69,16 +70,21 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor { } @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source,Class type) { + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { if(!source.isRequired()) throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default."); OutputStreamStub stub = new OutputStreamStub(defaultOutputStream); engine.addOutput(stub); - return createInstanceOfClass(type,stub); + return createInstanceOfClass((Class)type,stub); } @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches ) { + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { ArgumentDefinition definition = createDefaultArgumentDefinition(source); String fileName = getArgumentValue( definition, matches ); @@ -91,7 +97,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor { engine.addOutput(stub); - Object result = createInstanceOfClass(type,stub); + Object result = createInstanceOfClass(makeRawTypeIfNecessary(type),stub); // WARNING: Side effects required by engine! 
parsingEngine.addTags(result,getArgumentTags(matches)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java index f124c2302..83d1b7eb2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.SAMFileReaderBuilder; import java.io.File; +import java.lang.reflect.Type; /** * Describe how to parse SAMFileReaders. @@ -52,14 +53,13 @@ public class SAMFileReaderArgumentTypeDescriptor extends ArgumentTypeDescriptor this.engine = engine; } - @Override public boolean supports( Class type ) { return SAMFileReader.class.isAssignableFrom(type); } @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches ) { + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { SAMFileReaderBuilder builder = new SAMFileReaderBuilder(); String readerFileName = getArgumentValue( createDefaultArgumentDefinition(source), matches ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java index 38640eda0..43ec934ed 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; import java.io.OutputStream; import java.lang.annotation.Annotation; +import 
java.lang.reflect.Type; import java.util.Arrays; import java.util.List; @@ -93,7 +94,12 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor } @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source,Class type) { + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { if(!source.isRequired()) throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default."); SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream); @@ -102,7 +108,7 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor } @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches ) { + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { // Extract all possible parameters that could be passed to a BAM file writer? 
ArgumentDefinition bamArgumentDefinition = createBAMArgumentDefinition(source); String writerFileName = getArgumentValue( bamArgumentDefinition, matches ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java index 615841f02..98026554b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; import java.io.OutputStream; +import java.lang.reflect.Type; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; @@ -108,7 +109,12 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { } @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source,Class type) { + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { if(!source.isRequired()) throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default."); VCFWriterStub stub = new VCFWriterStub(engine, defaultOutputStream, false, argumentSources, false, false); @@ -124,7 +130,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { * @return Transform from the matches into the associated argument. 
*/ @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Class type, ArgumentMatches matches ) { + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { ArgumentDefinition defaultArgumentDefinition = createDefaultArgumentDefinition(source); // Get the filename for the genotype file, if it exists. If not, we'll need to send output to out. String writerFileName = getArgumentValue(defaultArgumentDefinition,matches); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index bb84f9457..936243f9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.io.stubs; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.OutputTracker; @@ -177,14 +178,23 @@ public class VCFWriterStub implements Stub, VCFWriter { vcfHeader = header; // Check for the command-line argument header line. If not present, add it in. 
- VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); - boolean foundCommandLineHeaderLine = false; - for(VCFHeaderLine line: vcfHeader.getMetaData()) { - if(line.getKey().equals(commandLineArgHeaderLine.getKey())) - foundCommandLineHeaderLine = true; + if ( !skipWritingHeader ) { + VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine(); + boolean foundCommandLineHeaderLine = false; + for (VCFHeaderLine line: vcfHeader.getMetaData()) { + if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) ) + foundCommandLineHeaderLine = true; + } + if ( !foundCommandLineHeaderLine ) + vcfHeader.addMetaDataLine(commandLineArgHeaderLine); + + // also put in the reference contig header lines + String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName()); + for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() ) + vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly)); + + vcfHeader.addMetaDataLine(new VCFHeaderLine("reference", "file://" + engine.getArguments().referenceFile.getAbsolutePath())); } - if(!foundCommandLineHeaderLine && !skipWritingHeader) - vcfHeader.addMetaDataLine(commandLineArgHeaderLine); outputTracker.getStorage(this).writeHeader(vcfHeader); } @@ -192,8 +202,8 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * @{inheritDoc} */ - public void add(VariantContext vc, byte ref) { - outputTracker.getStorage(this).add(vc,ref); + public void add(VariantContext vc) { + outputTracker.getStorage(this).add(vc); } /** @@ -220,4 +230,27 @@ public class VCFWriterStub implements Stub, VCFWriter { CommandLineExecutable executable = JVMUtils.getObjectOfType(argumentSources,CommandLineExecutable.class); return new VCFHeaderLine(executable.getAnalysisName(), "\"" + engine.createApproximateCommandLineArgumentString(argumentSources.toArray()) + "\""); } + + private VCFHeaderLine getContigHeaderLine(SAMSequenceRecord 
contig, String assembly) { + String val; + if ( assembly != null ) + val = String.format("", contig.getSequenceName(), contig.getSequenceLength(), assembly); + else + val = String.format("", contig.getSequenceName(), contig.getSequenceLength()); + return new VCFHeaderLine("contig", val); + } + + private String getReferenceAssembly(String refPath) { + // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot + String assembly = null; + if ( refPath.indexOf("b37") != -1 || refPath.indexOf("v37") != -1 ) + assembly = "b37"; + else if ( refPath.indexOf("b36") != -1 ) + assembly = "b36"; + else if ( refPath.indexOf("hg18") != -1 ) + assembly = "hg18"; + else if ( refPath.indexOf("hg19") != -1 ) + assembly = "hg19"; + return assembly; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 69c0b3e0a..4d94130a8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -46,7 +46,6 @@ import org.simpleframework.xml.stream.Format; import org.simpleframework.xml.stream.HyphenStyle; import java.io.*; -import java.net.InetAddress; import java.security.NoSuchAlgorithmException; import java.text.DateFormat; import java.text.SimpleDateFormat; @@ -154,9 +153,13 @@ public class GATKRunReport { private long nReads; public enum PhoneHomeOption { + /** Disable phone home */ NO_ET, + /** Standard option. Writes to local repository if it can be found, or S3 otherwise */ STANDARD, + /** Force output to STDOUT. For debugging only */ STDOUT, + /** Force output to S3. 
For debugging only */ AWS_S3 // todo -- remove me -- really just for testing purposes } @@ -226,22 +229,6 @@ public class GATKRunReport { } - /** - * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, - * unresolvable gets returned instead. - * - * @return - */ - private String resolveHostname() { - try { - return InetAddress.getLocalHost().getCanonicalHostName(); - } - catch (java.net.UnknownHostException uhe) { // [beware typo in code sample -dmw] - return "unresolvable"; - // handle exception - } - } - public void postReport(PhoneHomeOption type) { logger.debug("Posting report of type " + type); switch (type) { @@ -321,7 +308,7 @@ public class GATKRunReport { private void postReportToAWSS3() { // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html - this.hostName = resolveHostname(); // we want to fill in the host name + this.hostName = Utils.resolveHostname(); // we want to fill in the host name File localFile = postReportToLocalDisk(new File("./")); logger.debug("Generating GATK report to AWS S3 based on local file " + localFile); if ( localFile != null ) { // we succeeded in creating the local file diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java deleted file mode 100644 index ce924fd87..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordIterator.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished 
to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.lang.reflect.Constructor; -import java.util.Iterator; -import java.util.regex.Pattern; - -/** - * This is a low-level iterator designed to provide system-wide generic support for reading record-oriented data - * files. The only assumption made is that every line in the file provides a complete and separate data record. The records - * can be associated with coordinates or coordinate intervals, there can be one or more records associated with a given - * position/interval, or intervals can overlap. The records must be comprised of delimited fields, but the format is - * otherwise free. For any specific line-based data format, an appropriate implementation of ReferenceOrderedDatum must be - * provided that is capable of parsing itself from a single line of data. 
This implementation will be used, - * through reflection mechanism, as a callback to do all the work. - * - * The model is, hence, as follows: - * - * String dataRecord <---> RodImplementation ( ::parseLine(dataRecord.split(delimiter)) is aware of the format and fills - * an instance of RodImplementation with data values from dataRecord line). - * - * - * instantiation of RODRecordIterator(dataFile, trackName, RodImplementation.class) will immediately provide an iterator - * that walks along the dataFile line by line, and on each call to next() returns a new RodImplementation object - * representing a single line (record) of data. The returned object will be initialized with "track name" trackName - - * track names (as returned by ROD.getName()) are often used in other parts of the code to distinguish between - * multiple streams of (possibly heterogeneous) annotation data bound to an application. - * - * This generic iterator skips and ignores a) empty lines, b) lines starting with '#' (comments): they are never sent back - * to the ROD implementation class for processing. - * - * This iterator does not actually check if the ROD records (lines) in the file are indeed ordedered by coordinate, - * and it does not depend on such an order as it still implements a low-level line-based traversal of the data. Higher-level - * iterators/wrappers will perform all the necessary checks. - * - * Note: some data formats/ROD implementations may require a header line in the file. 
In this case the current (ugly) - * mechanism is as follows: - * 1) rod implementation's ::initialize(file) method should be able to open the file, find and read the header line - * and return the header object (to be kept by the iterator) - * 2) rod implementation's ::parseLine(header,line) method should be capable of making use of that saved header object now served to it - * and - * 3) ::parseLine(header,line) should be able to recognize the original header line in the file and skip it (after ROD's initialize() - * method is called, the iterator will re-open the file and start reading it from the very beginning; there is no - * other way, except for "smart" ::parseLine(), to avoid reading in the header line as "data"). - * - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 10, 2009 - * Time: 1:22:23 PM - * To change this template use File | Settings | File Templates. - */ -public class RODRecordIterator implements Iterator { - - private PushbackIterator reader; - - // stores name of the track this iterator reads (will be also returned by getName() of ROD objects - // generated by this iterator) - private String name; - - // we keep the file object, only to use file name in error reports - private File file; - - // rod type; this is what we will instantiate for RODs at runtime - private Class type; - - private Object header = null; // Some RODs may use header - - // field delimiter in the file. Should it be the job of the iterator to split the lines though? RODs can do that! - private String fieldDelimiter; - - // constructor for the ROD objects we are going to return. Constructor that takes the track name as its single arg is required. - private Constructor named_constructor; - - // keep track of the lines we are reading. used for error messages only. 
- private long linenum = 0; - - private boolean allow_empty = true; - private boolean allow_comments = true; - public static Pattern EMPTYLINE_PATTERN = Pattern.compile("^\\s*$"); - - public RODRecordIterator(File file, String name, Class type) { - try { - reader = new PushbackIterator(new XReadLines(file)); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - this.file = file; - this.name = name; - this.type = type; - try { - named_constructor = type.getConstructor(String.class); - } - catch (java.lang.NoSuchMethodException e) { - throw new ReviewedStingException("ROD class "+type.getName()+" does not have constructor that accepts a single String argument (track name)"); - } - ROD rod = instantiateROD(name); - fieldDelimiter = rod.delimiterRegex(); // get delimiter from the ROD itself - try { - header = rod.initialize(file); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(file, "ROD "+type.getName() + " failed to initialize properly from file "+file); - } - - } - - - /** - * Returns true if the iteration has more elements. (In other - * words, returns true if next would return an element - * rather than throwing an exception.) - * - * @return true if the iterator has more elements. - */ - public boolean hasNext() { - if ( allow_empty || allow_comments ) { - while ( reader.hasNext() ) { - String line = reader.next(); - if ( allow_empty && EMPTYLINE_PATTERN.matcher(line).matches() ) continue; // skip empty line - if ( allow_comments && line.charAt(0) == '#' ) continue; // skip comment lines - // the line is not empty and not a comment line, so we have next after all - reader.pushback(line); - return true; - } - return false; // oops, we end up here if there's nothing left - } else { - return reader.hasNext(); - } - } - - /** - * Returns the next valid ROD record in the file, skipping empty and comment lines. - * - * @return the next element in the iteration. 
- * @throws java.util.NoSuchElementException - * iteration has no more elements. - */ - public ROD next() { - ROD n = null; - boolean parsed_ok = false; - String line ; - - while ( ! parsed_ok && reader.hasNext() ) { - line = reader.next(); - linenum++; - while ( allow_empty && EMPTYLINE_PATTERN.matcher(line).matches() || - allow_comments && line.charAt(0) == '#' ) { - if ( reader.hasNext() ) { - line = reader.next(); - linenum++; - } else { - line = null; - break; - } - } - - if ( line == null ) break; // if we ran out of lines while skipping empty lines/comments, then we are done - - String parts[] = line.split(fieldDelimiter); - - try { - n = instantiateROD(name); - parsed_ok = n.parseLine(header,parts) ; - } - catch ( Exception e ) { - throw new UserException.MalformedFile(file, "Failed to parse ROD data ("+type.getName()+") from file "+ file + " at line #"+linenum+ - "\nOffending line: "+line+ - "\nReason ("+e.getClass().getName()+")", e); - } - } - - - return n; - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. - */ - public void remove() { - throw new UnsupportedOperationException("remove() operation is not supported by RODRecordIterator"); - } - - /** Instantiates appropriate implementation of the ROD used by this iteratot. The 'name' argument is the name - * of the ROD track. 
- * @param name - * @return - */ - private ROD instantiateROD(final String name) { - try { - return (ROD) named_constructor.newInstance(name); - } catch (Exception e) { - throw new DynamicClassResolutionException(named_constructor.getDeclaringClass(), e); - } - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index d03b122e2..286e22369 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -1,13 +1,15 @@ package org.broadinstitute.sting.gatk.refdata; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; @@ -18,348 +20,402 @@ import java.util.*; * The standard interaction model is: * * Traversal system arrives at a site, which has a bunch of RMDs covering it -Genotype * Traversal calls tracker.bind(name, RMD) for each RMDs in RMDs - * Traversal passes tracker to the walker - * walker calls lookup(name, default) to obtain the RMDs values at this site, or default if none was - * bound at this site. + * Traversal passes creates a tracker and passes it to the walker + * walker calls get(rodBinding) to obtain the RMDs values at this site for the track + * associated with rodBinding. 
+ * + * Note that this is an immutable class. Once created the underlying data structures + * cannot be modified * * User: mdepristo * Date: Apr 3, 2009 * Time: 3:05:23 PM */ public class RefMetaDataTracker { + // TODO: this should be a list, not a map, actually + private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); + final Map map; - protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); + final ReferenceContext ref; + final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); - public RefMetaDataTracker(int nBindings) { - if ( nBindings == 0 ) + // ------------------------------------------------------------------------------------------ + // + // + // Special ENGINE interaction functions + // + // + // ------------------------------------------------------------------------------------------ + + public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { + this.ref = ref; + + // set up the map + if ( allBindings.isEmpty() ) map = Collections.emptyMap(); - else - map = new HashMap(nBindings); + else { + Map tmap = new HashMap(allBindings.size()); + for ( RODRecordList rod : allBindings ) { + if ( rod != null && ! rod.isEmpty() ) + tmap.put(canonicalName(rod.getName()), rod); + } + + // ensure that no one modifies the map itself + map = Collections.unmodifiableMap(tmap); + } + } + + // ------------------------------------------------------------------------------------------ + // + // + // Generic accessors + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Gets all of the Tribble features spanning this locus, returning them as a list of specific + * type T extending Feature. This function looks across all tracks to find the Features, so + * if you have two tracks A and B each containing 1 Feature, then getValues will return + * a list containing both features. 
+ * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. If you want + * to get all Features without any danger of such an exception use the root Tribble + * interface Feature. + * + * @param type The type of the underlying objects bound here + * @param as above + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"type != null"}) + @Ensures("result != null") + public List getValues(final Class type) { + return addValues(map.keySet(), type, new ArrayList(), null, false, false); } /** - * get all the reference meta data associated with a track name. - * @param name the name of the track we're looking for - * @return a list of objects, representing the underlying objects that the tracks produce. I.e. for a - * dbSNP RMD this will be a RodDbSNP, etc. + * Provides the same functionality as @link #getValues(Class) but will only include + * Features that start as the GenomeLoc provide onlyAtThisLoc. * - * Important: The list returned by this function is guaranteed not to be null, but may be empty! + * @param type The type of the underlying objects bound here + * @param onlyAtThisLoc + * @param as above + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. */ - public List getReferenceMetaData(final String name) { - RODRecordList list = getTrackDataByName(name, true); - List objects = new ArrayList(); - if (list == null) return objects; - for (GATKFeature feature : list) - objects.add(feature.getUnderlyingObject()); - return objects; + @Requires({"type != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { + return addValues(map.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); } /** - * get all the reference meta data associated with a track name. 
- * @param name the name of the track we're looking for - * @param requireExactMatch do we require an exact match for the name (true) or do we require only that the name starts with - * the passed in parameter (false). - * @return a list of objects, representing the underlying objects that the tracks produce. I.e. for a - * dbSNP rod this will be a RodDbSNP, etc. + * Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting + * elements of the list to return. That is, if there would be two elements in the result of + * @link #getValues(Class), one of these two is selected, and which one it will be isn't + * specified. Consequently, this method is only really safe if (1) you absolutely know + * that only one binding will meet the constraints of @link #getValues(Class) or (2) + * you truly don't care which of the multiple bindings available you are going to examine. * - * Important: The list returned by this function is guaranteed not to be null, but may be empty! + * If there are no bindings here, getFirstValue() return null + * + * @param type The type of the underlying objects bound here + * @param as above + * @return A random single element the RODs bound here, or null if none are bound. */ - public List getReferenceMetaData(final String name, boolean requireExactMatch) { - RODRecordList list = getTrackDataByName(name, requireExactMatch); - List objects = new ArrayList(); - if (list == null) return objects; - for (GATKFeature feature : list) - objects.add(feature.getUnderlyingObject()); - return objects; + @Requires({"type != null"}) + public T getFirstValue(final Class type) { + return safeGetFirst(getValues(type)); } /** - * get all the GATK features associated with a specific track name - * @param name the name of the track we're looking for - * @param requireExactMatch do we require an exact match for the name (true) or do we require only that the name starts with - * the passed in parameter (false). 
- * @return a list of GATKFeatures for the target rmd + * Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list + * of eligible Features and @link #getFirstValue(Class) to select a single + * element from the interval list. * - * Important: The list returned by this function is guaranteed not to be null, but may be empty! + * @param type The type of the underlying objects bound here + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound. */ - public List getGATKFeatureMetaData(final String name, boolean requireExactMatch) { - List feat = getTrackDataByName(name,requireExactMatch); - return (feat == null) ? new ArrayList() : feat; // to satisfy the above requirement that we don't return null + @Requires({"type != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(getValues(type, onlyAtThisLoc)); + } /** - * get a singleton record, given the name and a type. This function will return the first record at the current position seen, - * and emit a logger warning if there were more than one option. + * Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as + * a list of specific type T extending Feature. * - * WARNING: this method is deprecated, since we now suppport more than one RMD at a single position for all tracks. If there are - * are multiple RMD objects at this location, there is no contract for which object this method will pick, and which object gets - * picked may change from time to time! BE WARNED! - * - * @param name the name of the track - * @param clazz the underlying type to return - * @param the type to parameterize on, matching the clazz argument - * @return a record of type T, or null if no record is present. 
+ * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. */ - @Deprecated - public T lookup(final String name, Class clazz) { - RODRecordList objects = getTrackDataByName(name, true); + @Requires({"rodBinding != null"}) + @Ensures("result != null") + public List getValues(final RodBinding rodBinding) { + return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false); + } - // if emtpy or null return null; - if (objects == null || objects.size() < 1) return null; + /** + * Gets all of the Tribble features bound to any RodBinding in rodBindings, + * spanning this locus, returning them as a list of specific type T extending Feature. + * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null"}) + @Ensures("result != null") + public List getValues(final Collection> rodBindings) { + List results = new ArrayList(1); + for ( RodBinding rodBinding : rodBindings ) + results.addAll(getValues(rodBinding)); + return results; + } - if (objects.size() > 1) - logger.info("lookup is choosing the first record from " + (objects.size() - 1) + " options"); + /** + * The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { + return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false); + } - Object obj = objects.get(0).getUnderlyingObject(); - if (!(clazz.isAssignableFrom(obj.getClass()))) - throw new UserException.CommandLineException("Unable to case track named " + name + " to type of " + clazz.toString() - + " it's of type " + obj.getClass()); + /** + * The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { + List results = new ArrayList(1); + for ( RodBinding rodBinding : rodBindings ) + results.addAll(getValues(rodBinding, onlyAtThisLoc)); + return results; + } - return (T)obj; + /** + * Uses the same logic as @getValues(RodBinding) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param as above + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBinding != null"}) + public T getFirstValue(final RodBinding rodBinding) { + return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true)); + } + + /** + * Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true)); + } + + /** + * Uses the same logic as @getValues(List) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. 
+ * + * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched + * @param as above + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBindings != null"}) + public T getFirstValue(final Collection> rodBindings) { + for ( RodBinding rodBinding : rodBindings ) { + T val = getFirstValue(rodBinding); + if ( val != null ) + return val; + } + return null; + } + + /** + * Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { + for ( RodBinding rodBinding : rodBindings ) { + T val = getFirstValue(rodBinding, onlyAtThisLoc); + if ( val != null ) + return val; + } + return null; } /** * Is there a binding at this site to a ROD/track with the specified name? * - * @param name the name of the rod - * @return true if it has the rod + * @param rodBinding the rod binding we want to know about + * @return true if any Features are bound in this tracker to rodBinding */ - public boolean hasROD(final String name) { - return map.containsKey(canonicalName(name)); - } - - - /** - * Get all of the RMDs at the current site. The collection is "flattened": for any track that has multiple records - * at the current site, they all will be added to the list as separate elements. 
- * - * @return collection of all rods - */ - public Collection getAllRods() { - List l = new ArrayList(); - for ( RODRecordList rl : map.values() ) { - if ( rl == null ) continue; // how do we get null value stored for a track? shouldn't the track be missing from the map alltogether? - l.addAll(rl); - } - return l; - + @Requires({"rodBinding != null"}) + public boolean hasValues(final RodBinding rodBinding) { + return map.containsKey(canonicalName(rodBinding.getName())); } /** * Get all of the RMD tracks at the current site. Each track is returned as a single compound * object (RODRecordList) that may contain multiple RMD records associated with the current site. * - * @return collection of all tracks + * @return List of all tracks */ - public Collection getBoundRodTracks() { - LinkedList bound = new LinkedList(); - - for ( RODRecordList value : map.values() ) { - if ( value != null && value.size() != 0 ) bound.add(value); - } - - return bound; + public List getBoundRodTracks() { + return new ArrayList(map.values()); } /** - * @return the number of ROD bindings (name -> value) where value is not empty in this tracker + * The number of tracks with at least one value bound here + * @return the number of tracks with at least one bound Feature */ - public int getNBoundRodTracks() { - return getNBoundRodTracks(null); + public int getNTracksWithBoundFeatures() { + return map.size(); } - public int getNBoundRodTracks(final String excludeIn ) { - final String exclude = excludeIn == null ? null : canonicalName(excludeIn); + // ------------------------------------------------------------------------------------------ + // + // + // old style accessors + // + // TODO -- DELETE ME + // + // + // ------------------------------------------------------------------------------------------ - int n = 0; - for ( RODRecordList value : map.values() ) { - if ( value != null && ! value.isEmpty() ) { - if ( exclude == null || ! 
value.getName().equals(exclude) ) - n++; - } - } - - return n; + @Deprecated + public boolean hasValues(final String name) { + return map.containsKey(canonicalName(name)); } + @Deprecated + public List getValues(final Class type, final String name) { + return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); + } + @Deprecated + public List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); + } + @Deprecated + public T getFirstValue(final Class type, final String name) { + return safeGetFirst(getValues(type, name)); + } + @Deprecated + public T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(getValues(type, name, onlyAtThisLoc)); + } + + // ------------------------------------------------------------------------------------------ + // + // + // Private utility functions + // + // + // ------------------------------------------------------------------------------------------ /** - * Binds the list of reference ordered data records (RMDs) to track name at this site. Should be used only by the traversal - * system to provide access to RMDs in a structured way to the walkers. + * Helper function for getFirst() operations that takes a list of and + * returns the first element, or null if no such element exists. * - * @param name the name of the track - * @param rod the collection of RMD data - */ - public void bind(final String name, RODRecordList rod) { - //logger.debug(String.format("Binding %s to %s", name, rod)); - map.put(canonicalName(name), rod); - } - - - /** - * Converts all possible ROD tracks to VariantContexts objects, of all types, allowing any start and any number - * of entries per ROD. - * The name of each VariantContext corresponds to the ROD name. 
- * - * @param ref reference context - * @return variant context - */ - public Collection getAllVariantContexts(ReferenceContext ref) { - return getAllVariantContexts(ref, null, null, false, false); - } - - /** - * Returns all of the variant contexts that start at the current location - * @param ref - * @param curLocation + * @param l + * @param * @return */ - public Collection getAllVariantContexts(ReferenceContext ref, GenomeLoc curLocation) { - return getAllVariantContexts(ref, null, curLocation, true, false); + @Requires({"l != null"}) + final private T safeGetFirst(final List l) { + return l.isEmpty() ? null : l.get(0); } - /** - * Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only - * VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation - * must not be null, and only records whose start position is == to curLocation.getStart() will be returned. - * If takeFirstOnly is true, then only a single VariantContext will be converted from any individual ROD. Of course, - * this single object must pass the allowed types and start here options if provided. Note that the result - * may return multiple VariantContexts with the same name if that particular track contained multiple RODs spanning - * the current location. - * - * The name of each VariantContext corresponds to the ROD name. - * - * @param ref reference context - * @param allowedTypes allowed types - * @param curLocation location - * @param requireStartHere do we require the rod to start at this location? - * @param takeFirstOnly do we take the first rod only? 
- * @return variant context - */ - public Collection getAllVariantContexts(ReferenceContext ref, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { - List contexts = new ArrayList(); - - for ( RODRecordList rodList : getBoundRodTracks() ) { - addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly); - } - - return contexts; - } - - /** - * Gets the variant contexts associated with track name name - * - * see getVariantContexts for more information. - * - * @param ref ReferenceContext to enable conversion to variant context - * @param name name - * @param curLocation location - * @param allowedTypes allowed types - * @param requireStartHere do we require the rod to start at this location? - * @param takeFirstOnly do we take the first rod only? - * @return variant context - */ -// public Collection getVariantContexts(String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { -// return getVariantContexts(null, Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly); -// } - - public Collection getVariantContexts(ReferenceContext ref, String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { - return getVariantContexts(ref, Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly); - } - -// public Collection getVariantContexts(Collection names, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { -// return getVariantContexts(null, names, allowedTypes, curLocation, requireStartHere, takeFirstOnly); -// } - - public Collection getVariantContexts(ReferenceContext ref, Collection names, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { - Collection contexts = new ArrayList(); - + private List addValues(final Collection names, + final Class 
type, + List values, + final GenomeLoc curLocation, + final boolean requireStartHere, + final boolean takeFirstOnly ) { for ( String name : names ) { - RODRecordList rodList = getTrackDataByName(name,true); // require that the name is an exact match - - if ( rodList != null ) - addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly ); + RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match + values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly ); + if ( takeFirstOnly && ! values.isEmpty() ) + break; } - return contexts; - } - - public Collection getVariantContextsByPrefix(ReferenceContext ref, Collection names, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { - Collection contexts = new ArrayList(); - - for ( String name : names ) { - RODRecordList rodList = getTrackDataByName(name,false); // require that the name is an exact match - - if ( rodList != null ) - addVariantContexts(contexts, rodList, ref, allowedTypes, curLocation, requireStartHere, takeFirstOnly ); - } - - return contexts; - } - - /** - * Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not. - * see getVariantContexts for more information. - * - * @param name name - * @param curLocation location - * @param allowedTypes allowed types - * @param requireStartHere do we require the rod to start at this location? 
- * @return variant context - */ - public VariantContext getVariantContext(ReferenceContext ref, String name, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) { - Collection contexts = getVariantContexts(ref, name, allowedTypes, curLocation, requireStartHere, false ); - - if ( contexts.size() > 1 ) - throw new ReviewedStingException("Requested a single VariantContext object for track " + name + " but multiple variants were present at position " + curLocation); - else if ( contexts.size() == 0 ) - return null; - else - return contexts.iterator().next(); - } - - /** - * Very simple accessor that gets the first (and only!) VC associated with name at the current location, or - * null if there's no binding here. - * - * @param ref - * @param name - * @param curLocation - * @return - */ - public VariantContext getVariantContext(ReferenceContext ref, String name, GenomeLoc curLocation) { - return getVariantContext(ref, name, null, curLocation, true); + return values; } - private void addVariantContexts(Collection contexts, RODRecordList rodList, ReferenceContext ref, EnumSet allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) { + + private List addValues(final String name, + final Class type, + List values, + final RODRecordList rodList, + final GenomeLoc curLocation, + final boolean requireStartHere, + final boolean takeFirstOnly ) { for ( GATKFeature rec : rodList ) { - if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec.getUnderlyingObject()) ) { - // ok, we might actually be able to turn this record in a variant context - VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec.getUnderlyingObject(), ref); + if ( ! 
requireStartHere || rec.getLocation().getStart() == curLocation.getStart() ) { // ok, we are going to keep this thing + Object obj = rec.getUnderlyingObject(); + if (!(type.isAssignableFrom(obj.getClass()))) + throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString() + + " it's of type " + obj.getClass()); - if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted - continue; + T objT = (T)obj; + if ( takeFirstOnly ) { + if ( values == null ) + values = Arrays.asList(objT); + else + values.add(objT); - // now, let's decide if we want to keep it - boolean goodType = allowedTypes == null || allowedTypes.contains(vc.getType()); - boolean goodPos = ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart(); - - if ( goodType && goodPos ) { // ok, we are going to keep this thing - contexts.add(vc); - - if ( takeFirstOnly ) - // we only want the first passing instance, so break the loop over records in rodList - break; + break; + } else { + if ( values == null ) + values = new ArrayList(); + values.add(objT); } } } + + return values == null ? Collections.emptyList() : values; } /** * Finds the reference metadata track named 'name' and returns all ROD records from that track associated - * with the current site as a RODRecordList collection object. If no data track with specified name is available, + * with the current site as a RODRecordList List object. If no data track with specified name is available, * returns defaultValue wrapped as RODRecordList object. 
NOTE: if defaultValue is null, it will be wrapped up * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and * location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution, @@ -367,29 +423,16 @@ public class RefMetaDataTracker { * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, * regardless of the presence of "extended" RODs overlapping with that location). * @param name track name - * @param requireExactMatch do we require an exact match of the rod name? * @return track data for the given rod */ - private RODRecordList getTrackDataByName(final String name, boolean requireExactMatch) { - //logger.debug(String.format("Lookup %s%n", name)); - + private RODRecordList getTrackDataByName(final String name) { final String luName = canonicalName(name); - RODRecordList trackData = null; + RODRecordList l = map.get(luName); + return l == null ? EMPTY_ROD_RECORD_LIST : l; + } - if ( requireExactMatch ) { - if ( map.containsKey(luName) ) - trackData = map.get(luName); - } else { - for ( Map.Entry datum : map.entrySet() ) { - final String rodName = datum.getKey(); - if ( datum.getValue() != null && rodName.startsWith(luName) ) { - if ( trackData == null ) trackData = new RODRecordListImpl(name); - //System.out.printf("Adding bindings from %s to %s at %s%n", rodName, name, datum.getValue().getLocation()); - ((RODRecordListImpl)trackData).add(datum.getValue(), true); - } - } - } - return trackData; + private RODRecordList getTrackDataByName(final RodBinding binding) { + return getTrackDataByName(binding.getName()); } /** @@ -398,6 +441,7 @@ public class RefMetaDataTracker { * @return canonical name of the rod */ private final String canonicalName(final String name) { + // todo -- remove me after switch to RodBinding syntax return name.toLowerCase(); } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java deleted file mode 100644 index 5cdb6e9f7..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ /dev/null @@ -1,130 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.*; -import java.lang.reflect.Method; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -/** - * Class for representing arbitrary reference ordered data sets - *

- * User: mdepristo - * Date: Feb 27, 2009 - * Time: 10:47:14 AM - * To change this template use File | Settings | File Templates. - */ -public class ReferenceOrderedData implements Iterable { - private String name; - private File file = null; -// private String fieldDelimiter; - - /** Header object returned from the datum */ -// private Object header = null; - - private Class type = null; // runtime type information for object construction - - /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(ReferenceOrderedData.class); - - /** - * given an existing file, open it and append all the valid triplet lines to an existing list - * - * @param rodTripletList the list of existing triplets - * @param filename the file to attempt to extract ROD triplets from - */ - protected static void extractRodsFromFile(List rodTripletList, String filename) { - BufferedReader str; - try { - str = new BufferedReader(new FileReader(new File(filename))); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(new File(filename), "Unable to load the ROD input file", e); - } - String line = "NO LINES READ IN"; - try { - while ((line = str.readLine()) != null) { - if (line.matches(".+,.+,.+")) rodTripletList.add(line.trim()); - else logger.warn("the following file line didn't parsing into a triplet -> " + line); - } - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(new File(filename), "Failed reading the input rod file; last line read was " + line, e); - } - } - - - // ---------------------------------------------------------------------- - // - // Constructors - // - // ---------------------------------------------------------------------- - public ReferenceOrderedData(final String name, File file, Class type ) { - this.name = name; - this.file = file; - this.type = type; -// this.header = initializeROD(name, file, type); -// this.fieldDelimiter = newROD(name, 
type).delimiterRegex(); - } - - public String getName() { return name; } - - public File getFile() { return file; } - - public Class getType() { return type; } - - /** - * Special equals override to see if this ROD is compatible with the given - * name and type. 'Compatible' means that this ROD has the name that's passed - * in and its data can fit into the container specified by type. - * - * @param name Name to check. - * @param type Type to check. - * - * @return True if these parameters imply this rod. False otherwise. - */ - public boolean matches(String name, Class type) { - return this.name.equals(name) && type.isAssignableFrom(this.type); - } - - public Iterator iterator() { - Iterator it; - try { - Method m = type.getDeclaredMethod("createIterator", String.class, java.io.File.class); - it = (Iterator) m.invoke(null, name, file); - } catch (java.lang.NoSuchMethodException e) { - it = new RODRecordIterator(file,name,type); - } catch (java.lang.NullPointerException e) { - throw new RuntimeException(e); - } catch (java.lang.SecurityException e) { - throw new RuntimeException(e); - } catch (java.lang.IllegalAccessException e) { - throw new RuntimeException(e); - } catch (java.lang.IllegalArgumentException e) { - throw new RuntimeException(e); - } catch (java.lang.reflect.InvocationTargetException e) { - throw new RuntimeException(e); - } - // return new RODIterator(it); - return it; - } - - // ---------------------------------------------------------------------- - // - // Manipulations of all of the data - // - // ---------------------------------------------------------------------- - - public static void write(ArrayList data, File output) throws IOException { - final FileWriter out = new FileWriter(output); - - for (ReferenceOrderedDatum rec : data) { - out.write(rec.repl() + "\n"); - } - - out.close(); - } - - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/SelfScopingFeatureCodec.java 
b/public/java/src/org/broadinstitute/sting/gatk/refdata/SelfScopingFeatureCodec.java new file mode 100644 index 000000000..de781b839 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/SelfScopingFeatureCodec.java @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata; + +import java.io.File; + +/** + * An interface marking that a given Tribble codec can look at the file and determine whether the + * codec specifically parsing the contents of the file. + */ +public interface SelfScopingFeatureCodec { + /** + * This function returns true iff the File potentialInput can be parsed by this + * codec. + * + * The GATK assumes that there's never a situation where two SelfScopingFeaetureCodecs + * return true for the same file. If this occurs the GATK splits out an error. 
+ * + * Note this function must never throw an error. All errors should be trapped + * and false returned. + * + * @param potentialInput the file to test for parsiability with this codec + * @return true if potentialInput can be parsed, false otherwise + */ + public boolean canDecode(final File potentialInput); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index ba9a10d8b..7bf518fd5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -1,12 +1,13 @@ package org.broadinstitute.sting.gatk.refdata; +import net.sf.samtools.util.SequenceUtil; import org.broad.tribble.Feature; -import org.broad.tribble.dbsnp.DbSNPFeature; +import org.broad.tribble.annotation.Strand; +import org.broad.tribble.dbsnp.OldDbSNPFeature; import org.broad.tribble.gelitext.GeliTextFeature; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.codecs.hapmap.HapMapFeature; +import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.variantcontext.*; @@ -92,28 +93,89 @@ public class VariantContextAdaptors { // -------------------------------------------------------------------------------------------------------------- private static class DBSnpAdaptor implements VCAdaptor { + private static boolean isSNP(OldDbSNPFeature feature) { + return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); + } + + private static boolean isMNP(OldDbSNPFeature 
feature) { + return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); + } + + private static boolean isInsertion(OldDbSNPFeature feature) { + return feature.getVariantType().contains("insertion"); + } + + private static boolean isDeletion(OldDbSNPFeature feature) { + return feature.getVariantType().contains("deletion"); + } + + private static boolean isIndel(OldDbSNPFeature feature) { + return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); + } + + public static boolean isComplexIndel(OldDbSNPFeature feature) { + return feature.getVariantType().contains("in-del"); + } + + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guaranteed ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). + * + * @return an alternate allele list + */ + public static List getAlternateAlleleList(OldDbSNPFeature feature) { + List ret = new ArrayList(); + for (String allele : getAlleleList(feature)) + if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); + return ret; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles.
+ * + * @return the allele list, with the reference allele first when it is present + */ + public static List getAlleleList(OldDbSNPFeature feature) { + List alleleList = new ArrayList(); + // add ref first + if ( feature.getStrand() == Strand.POSITIVE ) + alleleList = Arrays.asList(feature.getObserved()); + else + for (String str : feature.getObserved()) + alleleList.add(SequenceUtil.reverseComplement(str)); + if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) + && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) + Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); + + return alleleList; + } + /** * Converts non-VCF formatted dbSNP records to VariantContext. - * @return DbSNPFeature. + * @return OldDbSNPFeature. */ @Override - public Class getAdaptableFeatureType() { return DbSNPFeature.class; } + public Class getAdaptableFeatureType() { return OldDbSNPFeature.class; } @Override public VariantContext convert(String name, Object input, ReferenceContext ref) { - DbSNPFeature dbsnp = (DbSNPFeature)input; - if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) ) + OldDbSNPFeature dbsnp = (OldDbSNPFeature)input; + if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) return null; - Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true); + Allele refAllele = Allele.create(dbsnp.getNCBIRefBase(), true); - if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) { + if ( isSNP(dbsnp) || isIndel(dbsnp) || isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) { // add the reference allele List alleles = new ArrayList(); alleles.add(refAllele); // add all of the alt alleles - boolean sawNullAllele = false; - for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) { + boolean sawNullAllele = refAllele.isNull(); + for ( String alt : getAlternateAlleleList(dbsnp) ) { if ( !
Allele.acceptableAlleleBases(alt) ) { //System.out.printf("Excluding dbsnp record %s%n", dbsnp); return null; @@ -127,14 +189,13 @@ public class VariantContextAdaptors { Map attributes = new HashMap(); attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); - if ( sawNullAllele ) { - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(ref.getBases()[index])); - } - Collection genotypes = null; - VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0),dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes); + int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; + if ( index < 0 ) + return null; // we weren't given enough reference context to create the VariantContext + Byte refBaseForIndel = new Byte(ref.getBases()[index]); + + Map genotypes = null; + VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd() - (refAllele.isNull() ? 1 : 0), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel); return vc; } else return null; // can't handle anything else @@ -164,16 +225,6 @@ public class VariantContextAdaptors { @Override public Class getAdaptableFeatureType() { return GeliTextFeature.class; } - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @return a VariantContext object - */ -// VariantContext convert(String name, Object input) { -// return convert(name, input, null); -// } - /** * convert to a Variant Context, given: * @param name the name of the ROD @@ -237,17 +288,7 @@ public class VariantContextAdaptors { * @return HapMapFeature. 
*/ @Override - public Class getAdaptableFeatureType() { return HapMapFeature.class; } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @return a VariantContext object - */ -// VariantContext convert(String name, Object input) { -// return convert(name, input, null); -// } + public Class getAdaptableFeatureType() { return RawHapMapFeature.class; } /** * convert to a Variant Context, given: @@ -261,7 +302,12 @@ public class VariantContextAdaptors { if ( ref == null ) throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires a reference context"); - HapMapFeature hapmap = (HapMapFeature)input; + RawHapMapFeature hapmap = (RawHapMapFeature)input; + + int index = hapmap.getStart() - ref.getWindow().getStart(); + if ( index < 0 ) + return null; // we weren't given enough reference context to create the VariantContext + Byte refBaseForIndel = new Byte(ref.getBases()[index]); HashSet alleles = new HashSet(); Allele refSNPAllele = Allele.create(ref.getBase(), true); @@ -271,7 +317,7 @@ public class VariantContextAdaptors { // use the actual alleles, if available if ( alleleMap != null ) { alleles.addAll(alleleMap.values()); - Allele deletionAllele = alleleMap.get(HapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases) + Allele deletionAllele = alleleMap.get(RawHapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases) if ( deletionAllele != null && deletionAllele.isReference() ) deletionLength = deletionAllele.length(); } else { @@ -321,7 +367,7 @@ public class VariantContextAdaptors { long end = hapmap.getEnd(); if ( deletionLength > 0 ) end += deletionLength; - VariantContext vc = new VariantContext(name, hapmap.getChr(), hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs); + VariantContext vc = new VariantContext(name, hapmap.getChr(), 
hapmap.getStart(), end, alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attrs, refBaseForIndel); return vc; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java deleted file mode 100755 index 6bba754be..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.refdata.features.annotator; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.StringTokenizer; - -public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec { - - private static Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class); - - public static final String DELIMITER = "\t"; - - private ArrayList header; - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Parses the header. - * - * @param reader - * - * @return The # of header lines for this file. 
- */ - public Object readHeader(LineReader reader) - { - int[] lineCounter = new int[1]; - try { - header = readHeader(reader, lineCounter); - } catch(IOException e) { - throw new IllegalArgumentException("Unable to read from file.", e); - } - return header; - } - - public Class getFeatureType() { - return AnnotatorInputTableFeature.class; - } - - @Override - public Feature decodeLoc(String line) { - StringTokenizer st = new StringTokenizer(line, DELIMITER); - if ( st.countTokens() < 1 ) - throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); - - GenomeLoc loc; - String chr = st.nextToken(); - if ( chr.indexOf(":") != -1 ) { - loc = genomeLocParser.parseGenomeLoc(chr); - } else { - if ( st.countTokens() < 3 ) - throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); - loc = genomeLocParser.createGenomeLoc(chr, Integer.valueOf(st.nextToken()), Integer.valueOf(st.nextToken())); - } - return new AnnotatorInputTableFeature(loc.getContig(), loc.getStart(), loc.getStop()); - } - - - /** - * Parses the line into an AnnotatorInputTableFeature object. 
- * - * @param line - */ - public AnnotatorInputTableFeature decode(String line) { - final ArrayList header = this.header; //optimization - final ArrayList values = Utils.split(line, DELIMITER, header.size()); - - if ( values.size() != header.size()) { - throw new CodecLineParsingException(String.format("Encountered a line that has %d columns while the header has %d columns.\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); - } - - final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header); - for ( int i = 0; i < header.size(); i++ ) { - feature.putColumnValue(header.get(i), values.get(i)); - } - - GenomeLoc loc; - if ( values.get(0).indexOf(":") != -1 ) - loc = genomeLocParser.parseGenomeLoc(values.get(0)); - else - loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2))); - - //parse the location - feature.setChr(loc.getContig()); - feature.setStart((int)loc.getStart()); - feature.setEnd((int)loc.getStop()); - - return feature; - } - - /** - * Returns the header. - * @param source - * @return - * @throws IOException - */ - public static ArrayList readHeader(final File source) throws IOException { - FileInputStream is = new FileInputStream(source); - try { - return readHeader(new AsciiLineReader(is), null); - } finally { - is.close(); - } - } - - - /** - * Returns the header, and also sets the 2nd arg to the number of lines in the header. - * @param source - * @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines in the header. - * @return The header fields. 
- * @throws IOException - */ - private static ArrayList readHeader(final LineReader source, int[] lineCounter) throws IOException { - - ArrayList header = null; - int numLines = 0; - - //find the 1st line that's non-empty and not a comment - String line = null; - while( (line = source.readLine()) != null ) { - numLines++; - if ( line.trim().isEmpty() || line.startsWith("#") ) { - continue; - } - - //parse the header - header = Utils.split(line, DELIMITER); - break; - } - - // check that we found the header - if ( header == null ) { - throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty."); - } - - if(lineCounter != null) { - lineCounter[0] = numLines; - } - - logger.debug(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header))); - - return header; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java deleted file mode 100755 index d12badd28..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.features.annotator; - -import org.broad.tribble.Feature; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * This class represents a single record in an AnnotatorInputTable. - */ -public class AnnotatorInputTableFeature implements Feature { - - private ArrayList columnNames; - private HashMap columnValues; //maps colum names to column values - - private String chr; - private int start; - private int end; - private String strRep = null; - - /** - * Constructor. - * @param chr The chromosome name. - * @param start The start position - * @param end The end position - */ - public AnnotatorInputTableFeature(String chr, int start, int end) { - this.chr = chr; - this.start = start; - this.end = end; - } - - - /** - * Constructor. - * @param columnNames The column names as parsed out of the file header. - */ - public AnnotatorInputTableFeature(ArrayList columnNames) { - this.columnNames = columnNames; - this.columnValues = new HashMap(); - } - - - - /** - * @return the list of column names from the file header. - */ - public ArrayList getHeader() { - return columnNames; - } - - - /** - * Returns the value of the given column. - * - * @param columnName The column name as it appears in the file header. 
- * @return The value - */ - public String getColumnValue(final String columnName) { - return columnValues.get(columnName); - } - - - public boolean containsColumnName(final String columnName) { - return columnValues.containsKey(columnName); - } - - - /** - * Sets the value for the given column. - * - * @param columnName The column name as it appears in the file header. - * @param value The value - * @return The existing value associated with the columnName, if there is one. - */ - protected String putColumnValue(final String columnName, final String value) { - return columnValues.put(columnName, value); - } - - /** - * @return all values in this line, hashed by their column names. - */ - public Map getColumnValues() { - return Collections.unmodifiableMap(columnValues); - } - - - public String getChr() { - return chr; - } - - public int getStart() { - return start; - } - - public int getEnd() { - return end; - } - - protected void setChr(String chr) { - this.chr = chr; - } - - protected void setStart(int start) { - this.start = start; - } - - protected void setEnd(int end) { - this.end = end; - } - - @Override - public String toString() { - if ( strRep == null ) { - StringBuilder sb = new StringBuilder(); - - for(String columnName : columnNames ) { - if ( sb.length() == 0 ) - sb.append("["); - else - sb.append(", "); - sb.append(columnName + "=" + columnValues.get(columnName)); - } - sb.append("]"); - - strRep = sb.toString(); - } - - return strRep; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java index 085d6b5b3..029800aea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java @@ -12,14 +12,13 @@ import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.Input; import 
org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileOutputStream; -import java.util.Map; /** * a utility class that can create an index, written to a target location. This is useful when you're unable to write to the directory @@ -83,14 +82,14 @@ public class RMDIndexer extends CommandLineProgram { RMDTrackBuilder builder = new RMDTrackBuilder(ref.getSequenceDictionary(),genomeLocParser, ValidationExclusion.TYPE.ALL); // find the types available to the track builders - Map typeMapping = builder.getAvailableTrackNamesAndTypes(); + FeatureManager.FeatureDescriptor descriptor = builder.getFeatureManager().getByName(inputFileType); // check that the type is valid - if (!typeMapping.containsKey(inputFileType)) - throw new IllegalArgumentException("The type specified " + inputFileType + " is not a valid type. Valid type list: " + Utils.join(",",typeMapping.keySet())); + if (descriptor == null) + throw new IllegalArgumentException("The type specified " + inputFileType + " is not a valid type. 
Valid type list: " + builder.getFeatureManager().userFriendlyListOfAvailableFeatures()); // create the codec - FeatureCodec codec = builder.createByType(typeMapping.get(inputFileType)); + FeatureCodec codec = builder.getFeatureManager().createCodec(descriptor, "foo", genomeLocParser); // check if it's a reference dependent feature codec if (codec instanceof ReferenceDependentFeatureCodec) diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java new file mode 100644 index 000000000..c99aea254 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.NameAwareCodec; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.help.GATKDocUtils; +import org.broadinstitute.sting.utils.help.HelpUtils; + +import javax.mail.Header; +import java.io.File; +import java.util.*; + + +/** + * Class for managing Tribble Feature readers available to the GATK. The features + * are dynamically determined via a PluginManager. This class provides convenient + * getter methods for obtaining FeatureDescriptor objects that collect all of the + * useful information about the Tribble Codec, Feature, and name in one place. 
+ * + * @author depristo + */ +public class FeatureManager { + public static class FeatureDescriptor implements Comparable { + final String name; + final FeatureCodec codec; + + public FeatureDescriptor(final String name, final FeatureCodec codec) { + this.name = name; + this.codec = codec; + } + + public String getName() { + return name; + } + public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); } + public FeatureCodec getCodec() { + return codec; + } + public Class getCodecClass() { return codec.getClass(); } + public Class getFeatureClass() { return codec.getFeatureType(); } + + @Override + public String toString() { + return String.format("FeatureDescriptor name=%s codec=%s feature=%s", + getName(), getCodecClass().getName(), getFeatureClass().getName()); + } + + @Override + public int compareTo(FeatureDescriptor o) { + return getName().compareTo(o.getName()); + } + } + + private final PluginManager pluginManager; + private final Collection featureDescriptors = new TreeSet(); + + /** + * Construct a FeatureManager + */ + public FeatureManager() { + pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); + + for (final String rawName: pluginManager.getPluginsByName().keySet()) { + FeatureCodec codec = pluginManager.createByName(rawName); + String name = rawName.toUpperCase(); + FeatureDescriptor featureDescriptor = new FeatureDescriptor(name, codec); + featureDescriptors.add(featureDescriptor); + } + } + + /** + * Return the FeatureDescriptor whose getCodecClass().equals(codecClass). 
+ * + * @param codecClass + * @return A FeatureDescriptor or null if none is found + */ + @Requires("codecClass != null") + public FeatureDescriptor getByCodec(Class codecClass) { + for ( FeatureDescriptor descriptor : featureDescriptors ) + if ( descriptor.getCodecClass().equals(codecClass) ) + return descriptor; + return null; + } + + /** + * Returns a collection of FeatureDescriptors that emit records of type featureClass + * + * @param featureClass + * @return A collection of FeatureDescriptors, empty if none is found + */ + @Requires("featureClass != null") + public Collection getByFeature(Class featureClass) { + Set consistentDescriptors = new TreeSet(); + + if (featureClass == null) + throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); + + for ( FeatureDescriptor descriptor : featureDescriptors ) { + if ( featureClass.isAssignableFrom(descriptor.getFeatureClass())) + consistentDescriptors.add(descriptor); + } + return consistentDescriptors; + } + + /** + * Return the FeatureDescriptor with getName().equalsIgnoreCase(name) + * + * @param name + * @return A FeatureDescriptor or null if none is found + */ + @Requires("name != null") + public FeatureDescriptor getByName(String name) { + for ( FeatureDescriptor descriptor : featureDescriptors ) + if ( descriptor.getName().equalsIgnoreCase(name) ) + return descriptor; + return null; + } + + /** + * Returns the FeatureDescriptor that can read the contents of File file, if one can be determined + * + * @param file + * @return A FeatureDescriptor or null if none is found + */ + @Requires({"file != null", "file.isFile()", "file.canRead()"}) + public FeatureDescriptor getByFiletype(File file) { + List canParse = new ArrayList(); + for ( FeatureDescriptor descriptor : featureDescriptors ) + if ( descriptor.getCodec() instanceof SelfScopingFeatureCodec ) { + if ( ((SelfScopingFeatureCodec) descriptor.getCodec()).canDecode(file) ) { + canParse.add(descriptor); + } + } + + if (
canParse.size() == 0 ) + return null; + else if ( canParse.size() > 1 ) + throw new ReviewedStingException("BUG: multiple feature descriptors can read file " + file + ": " + canParse); + else + return canParse.get(0); + } + + /** + * Returns the FeatureDescriptor associated with the type described by triplet, or null if none is found + * @param triplet + * @return + */ + @Requires("triplet != null") + public FeatureDescriptor getByTriplet(RMDTriplet triplet) { + return getByName(triplet.getType()); + } + + /** + * @return all of the FeatureDescriptors available to the GATK. Never null + */ + @Ensures("result != null") + public Collection getFeatureDescriptors() { + return Collections.unmodifiableCollection(featureDescriptors); + } + + + /** + * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load + * @return + */ + @Ensures("result != null") + public String userFriendlyListOfAvailableFeatures() { + return userFriendlyListOfAvailableFeatures(Feature.class); + } + + /** + * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load + * restricted to only Codecs producing Features consistent with the requiredFeatureType + * @return + */ + @Ensures("result != null") + public String userFriendlyListOfAvailableFeatures(Class requiredFeatureType) { + final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation"; + + int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length(); + for ( final FeatureDescriptor descriptor : featureDescriptors ) { + if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { + maxNameLen = Math.max(maxNameLen, descriptor.getName().length()); + maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length()); + } + } + + StringBuilder docs = new StringBuilder(); + String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n"; + docs.append(String.format(format, nameHeader, featureHeader,
docHeader)); + for ( final FeatureDescriptor descriptor : featureDescriptors ) { + if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { + String oneDoc = String.format(format, + descriptor.getName(), + descriptor.getSimpleFeatureName(), + GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass())); + docs.append(oneDoc); + } + } + + return docs.toString(); + } + + /** + * Create a new FeatureCodec of the type described in descriptor, assigning it the + * name (if possible) and providing it the genomeLocParser (where necessary) + * + * @param descriptor FeatureDescriptor of the Tribble FeatureCodec we want to create + * @param name the name to assign this codec + * @return the feature codec itself + */ + @Requires({"descriptor != null", "name != null", "genomeLocParser != null"}) + @Ensures("result != null") + public FeatureCodec createCodec(FeatureDescriptor descriptor, String name, GenomeLocParser genomeLocParser) { + FeatureCodec codex = pluginManager.createByType(descriptor.getCodecClass()); + if ( codex instanceof NameAwareCodec ) + ((NameAwareCodec)codex).setName(name); + if ( codex instanceof ReferenceDependentFeatureCodec ) + ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); + return codex; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java deleted file mode 100644 index 731df997d..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/QueryableTrack.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2010. 
The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.io.IOException; - -/** - * @author aaron - *

- * Interface QueryableTrack - *

- * a decorator interface for tracks that are queryable - */ -public interface QueryableTrack { - public CloseableIterator query(final GenomeLoc interval) throws IOException; - public CloseableIterator query(final GenomeLoc interval, final boolean contained) throws IOException; - public CloseableIterator query(final String contig, final int start, final int stop) throws IOException; - public CloseableIterator query(final String contig, final int start, final int stop, final boolean contained) throws IOException; - public void close(); -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java index ba1ca674e..7aa112961 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java @@ -25,8 +25,12 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.util.CloseableIterator; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureSource; +import org.broad.tribble.iterators.CloseableTribbleIterator; +import org.broad.tribble.source.PerformanceLoggingFeatureSource; import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.utils.GenomeLoc; @@ -45,10 +49,10 @@ import java.io.IOException; * the basics of what a reference metadata track must contain. 
*/ public class RMDTrack { + private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); // the basics of a track: private final Class type; // our type - private final Class recordType; // the underlying records that are produced by this track private final String name; // the name private final File file; // the associated file we create the reader from @@ -90,7 +94,6 @@ public class RMDTrack { */ public RMDTrack(Class type, String name, File file, FeatureSource reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { this.type = type; - this.recordType = codec.getFeatureType(); this.name = name; this.file = file; this.reader = reader; @@ -112,19 +115,10 @@ public class RMDTrack { } public CloseableIterator query(GenomeLoc interval) throws IOException { - return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(interval.getContig(),interval.getStart(),interval.getStop()),this.getName()); - } - - public CloseableIterator query(GenomeLoc interval, boolean contained) throws IOException { - return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(interval.getContig(),interval.getStart(),interval.getStop()),this.getName()); - } - - public CloseableIterator query(String contig, int start, int stop) throws IOException { - return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(contig,start,stop),this.getName()); - } - - public CloseableIterator query(String contig, int start, int stop, boolean contained) throws IOException { - return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(contig,start,stop),this.getName()); + CloseableTribbleIterator iter = reader.query(interval.getContig(),interval.getStart(),interval.getStop()); + if ( RMDTrackBuilder.MEASURE_TRIBBLE_QUERY_PERFORMANCE ) + logger.warn("Query " + getName() + ":" + ((PerformanceLoggingFeatureSource)reader).getPerformanceLog()); + return new FeatureToGATKFeatureIterator(genomeLocParser, iter, this.getName()); } 
public void close() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java similarity index 79% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java rename to public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 19c91be1b..06d05912a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2011, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -12,37 +12,36 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.refdata.tracks.builders; +package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; -import org.broad.tribble.*; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.FeatureSource; +import org.broad.tribble.Tribble; +import org.broad.tribble.TribbleException; import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.source.BasicFeatureSource; +import org.broad.tribble.source.PerformanceLoggingFeatureSource; import org.broad.tribble.util.LittleEndianOutputStream; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SequenceDictionaryUtils; -import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -53,7 +52,10 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.util.*; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; @@ -67,17 +69,16 @@ import java.util.*; * that gets iterators from the FeatureReader using Tribble. 
* */ -public class RMDTrackBuilder extends PluginManager { +public class RMDTrackBuilder { // extends PluginManager { /** * our log, which we use to capture anything from this class */ private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false; // a constant we use for marking sequence dictionary entries in the Tribble index property list public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - private Map classes = null; - // private sequence dictionary we use to set our tracks with private SAMSequenceDictionary dict = null; @@ -91,6 +92,8 @@ public class RMDTrackBuilder extends PluginManager { */ private ValidationExclusion.TYPE validationExclusionType; + FeatureManager featureManager; + /** * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally * used when walkers want to directly manage the ROD system for whatever reason. 
Before using this constructor, @@ -102,29 +105,14 @@ public class RMDTrackBuilder extends PluginManager { public RMDTrackBuilder(SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) { - super(FeatureCodec.class, "Codecs", "Codec"); this.dict = dict; - this.genomeLocParser = genomeLocParser; this.validationExclusionType = validationExclusionType; - - classes = new HashMap(); - for (String name: this.getPluginsByName().keySet()) { - classes.put(name.toUpperCase(), getPluginsByName().get(name)); - } } - - /** @return a list of all available track types we currently have access to create */ - public Map getAvailableTrackNamesAndTypes() { - return Collections.unmodifiableMap(classes); + this.genomeLocParser = genomeLocParser; + featureManager = new FeatureManager(); } - /** @return a list of all available track record types we currently have access to create */ - public Map getAvailableTrackNamesAndRecordTypes() { - HashMap classToRecord = new HashMap(); - for (String name: this.getPluginsByName().keySet()) { - FeatureCodec codec = this.createByName(name); - classToRecord.put(name, codec.getFeatureType()); - } - return classToRecord; + public FeatureManager getFeatureManager() { + return featureManager; } /** @@ -133,45 +121,38 @@ public class RMDTrackBuilder extends PluginManager { * @param fileDescriptor a description of the type of track to build. 
* * @return an instance of the track - * @throws RMDTrackCreationException - * if we don't know of the target class or we couldn't create it */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) throws RMDTrackCreationException { + public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { String name = fileDescriptor.getName(); File inputFile = new File(fileDescriptor.getFile()); - Class featureCodecClass = getAvailableTrackNamesAndTypes().get(fileDescriptor.getType().toUpperCase()); - if (featureCodecClass == null) + FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); + if (descriptor == null) throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); // return a feature reader track Pair pair; if (inputFile.getAbsolutePath().endsWith(".gz")) - pair = createTabixIndexedFeatureSource(featureCodecClass, name, inputFile); + pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); else - pair = getFeatureSource(featureCodecClass, name, inputFile, fileDescriptor.getStorageType()); + pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); - return new RMDTrack(featureCodecClass, name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(featureCodecClass,name)); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name)); } /** * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. - * @param targetClass Type of Tribble class to build. + * @param codecClass Type of Tribble codec class to build. * @param inputFile Input file type to use. * @return An RMDTrack, suitable for accessing reference metadata. 
*/ - public RMDTrack createInstanceOfTrack(Class targetClass, File inputFile) { - // TODO: Update RMDTriplet to contain an actual class object rather than a name to avoid these gymnastics. - String typeName = null; - for(Map.Entry trackType: getAvailableTrackNamesAndTypes().entrySet()) { - if(trackType.getValue().equals(targetClass)) - typeName = trackType.getKey(); - } + public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { + final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); - if(typeName == null) - throw new ReviewedStingException("Unable to find type name for class " + targetClass.getName()); + if (descriptor == null) + throw new ReviewedStingException("Unable to find type name for codex class " + codecClass.getName()); - return createInstanceOfTrack(new RMDTriplet("anonymous",typeName,inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); + return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); } /** @@ -179,16 +160,16 @@ public class RMDTrackBuilder extends PluginManager { * reader of the appropriate type will figure out what the right index type is, and determine if it * exists. 
* - * @param targetClass the codec class type + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create * @param name the name of the track * @param inputFile the file to load * @return a feature reader implementation */ - private Pair createTabixIndexedFeatureSource(Class targetClass, String name, File inputFile) { + private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { // we might not know the index type, try loading with the default reader constructor logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); try { - return new Pair(BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null); + return new Pair(BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(), createCodec(descriptor, name)),null); } catch (TribbleException e) { throw new UserException(e.getMessage(), e); } @@ -196,28 +177,26 @@ public class RMDTrackBuilder extends PluginManager { /** * add a name to the codec, if it takes one - * @param targetClass the class to create a codec for + * @param descriptor the class to create a codec for * @param name the name to assign this codec * @return the feature codec itself */ - public FeatureCodec createCodec(Class targetClass, String name) { - FeatureCodec codex = this.createByType(targetClass); - if ( codex instanceof NameAwareCodec ) - ((NameAwareCodec)codex).setName(name); - if(codex instanceof ReferenceDependentFeatureCodec) - ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); - return codex; + private FeatureCodec createCodec(FeatureManager.FeatureDescriptor descriptor, String name) { + return featureManager.createCodec(descriptor, name, genomeLocParser); } /** * create a feature source object given: - * @param targetClass the target class + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create * @param name the 
name of the codec * @param inputFile the tribble file to parse * @param storageType How the RMD is streamed into the input file. * @return the input file as a FeatureReader */ - private Pair getFeatureSource(Class targetClass, String name, File inputFile, RMDStorageType storageType) { + private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, + String name, + File inputFile, + RMDStorageType storageType) { // Feature source and sequence dictionary to use as the ultimate reference FeatureSource featureSource = null; SAMSequenceDictionary sequenceDictionary = null; @@ -227,7 +206,7 @@ public class RMDTrackBuilder extends PluginManager { if(canBeIndexed) { try { - Index index = loadIndex(inputFile, createCodec(targetClass, name)); + Index index = loadIndex(inputFile, createCodec(descriptor, name)); try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } catch (ReviewedStingException e) { } @@ -240,7 +219,10 @@ public class RMDTrackBuilder extends PluginManager { sequenceDictionary = getSequenceDictionaryFromProperties(index); } - featureSource = new BasicFeatureSource(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)); + if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE ) + featureSource = new PerformanceLoggingFeatureSource(inputFile.getAbsolutePath(), index, createCodec(descriptor, name)); + else + featureSource = new BasicFeatureSource(inputFile.getAbsolutePath(), index, createCodec(descriptor, name)); } catch (TribbleException e) { throw new UserException(e.getMessage()); @@ -250,7 +232,7 @@ public class RMDTrackBuilder extends PluginManager { } } else { - featureSource = BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(),createCodec(targetClass, name),false); + featureSource = BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(),createCodec(descriptor, name),false); } return new Pair(featureSource,sequenceDictionary); @@ -385,22 +367,6 @@ public class 
RMDTrackBuilder extends PluginManager { return idx; } - /** - * Returns a collection of track names that match the record type. - * @param trackRecordType the record type specified in the @RMD annotation - * @return a collection of available track record type names that match the record type - */ - public Collection getTrackRecordTypeNames(Class trackRecordType) { - Set names = new TreeSet(); - if (trackRecordType == null) - throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); - - for (Map.Entry availableTrackRecordType: getAvailableTrackNamesAndRecordTypes().entrySet()) { - if (availableTrackRecordType.getValue() != null && trackRecordType.isAssignableFrom(availableTrackRecordType.getValue())) - names.add(availableTrackRecordType.getKey()); - } - return names; - } // --------------------------------------------------------------------------------------------------------- // static functions to work with the sequence dictionaries of indexes diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java deleted file mode 100644 index 29aefacc6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackCreationException.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2010. 
The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -/** - * - * @author aaron - * - * Class RMDTrackCreationException - * - * if we fail for some reason to make a track, throw this exception - */ -public class RMDTrackCreationException extends ReviewedStingException { - public RMDTrackCreationException(String msg) { - super(msg); - } - - public RMDTrackCreationException(String message, Throwable throwable) { - super(message, throwable); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java index 59e8471a3..6f8c9680f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java @@ -57,6 +57,7 @@ public abstract class GATKFeature implements Feature, HasGenomeLocation { public abstract GenomeLoc getLocation(); + // TODO: this should be a Feature public abstract Object getUnderlyingObject(); /** @@ -98,48 +99,9 @@ public abstract class GATKFeature implements Feature, HasGenomeLocation { return feature.getEnd(); } + // TODO: this should be a Feature, actually public Object getUnderlyingObject() { return feature; } } - - /** - * wrapping a old style rod into the new GATK feature style - */ - public static class RODGATKFeature extends GATKFeature { - - // our data - private ReferenceOrderedDatum datum; - - public RODGATKFeature(ReferenceOrderedDatum datum) { - super(datum.getName()); - this.datum = datum; - } - - @Override - public GenomeLoc getLocation() { - return datum.getLocation(); - } - - @Override - public Object getUnderlyingObject() { - return datum; - } - - @Override - public String getChr() { - return datum.getLocation().getContig(); - } - - @Override - public int getStart() { - return (int)datum.getLocation().getStart(); - } - - @Override - 
public int getEnd() { - return (int)datum.getLocation().getStop(); - } - } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java deleted file mode 100644 index 17c9fa718..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeatureIterator.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.utils; - -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; - -import java.util.Iterator; - - -/** - * - * @author aaron - * - * Class GATKFeatureIterator - * - * Takes a RODatum iterator and makes it an iterator of GATKFeatures. Shazam! 
- */ -public class GATKFeatureIterator implements CloseableIterator { - private final Iterator iter; - public GATKFeatureIterator(Iterator iter) { - this.iter = iter; - } - - @Override - public boolean hasNext() { - return iter.hasNext(); - } - - @Override - public GATKFeature next() { - return new GATKFeature.RODGATKFeature(iter.next()); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - - @Override - public void close() { - // do nothing, our underlying iterator doesn't support this - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java deleted file mode 100644 index 35b0f73c6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java +++ /dev/null @@ -1,190 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata.utils.helpers; - -import net.sf.samtools.util.SequenceUtil; -import org.broad.tribble.annotation.Strand; -import org.broad.tribble.dbsnp.DbSNPFeature; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -/** - * this class contains static helper methods for DbSNP - */ -public class DbSNPHelper { - public static final String STANDARD_DBSNP_TRACK_NAME = "dbsnp"; - - private DbSNPHelper() {} // don't make a DbSNPHelper - - public static DbSNPFeature getFirstRealSNP(List dbsnpList) { - if (dbsnpList == null) - return null; - - DbSNPFeature dbsnp = null; - for (Object d : dbsnpList) { - if (d instanceof DbSNPFeature && DbSNPHelper.isSNP((DbSNPFeature)d)) { - dbsnp = (DbSNPFeature) d; - break; - } - } - - return dbsnp; - } - - public static String rsIDOfFirstRealSNP(List featureList) { - if (featureList == null) - return null; - - String rsID = 
null; - for ( Object d : featureList ) { - if ( d instanceof DbSNPFeature ) { - if ( DbSNPHelper.isSNP((DbSNPFeature)d) ) { - rsID = ((DbSNPFeature)d).getRsID(); - break; - } - } else if ( d instanceof VariantContext) { - if ( ((VariantContext)d).isSNP() ) { - rsID = ((VariantContext)d).getID(); - break; - } - } - } - - return rsID; - } - - public static String rsIDOfFirstRealIndel(List featureList) { - if (featureList == null) - return null; - - String rsID = null; - for ( Object d : featureList ) { - if ( d instanceof DbSNPFeature ) { - if ( DbSNPHelper.isIndel((DbSNPFeature)d) ) { - rsID = ((DbSNPFeature)d).getRsID(); - break; - } - } else if ( d instanceof VariantContext) { - if ( ((VariantContext)d).isIndel() ) { - rsID = ((VariantContext)d).getID(); - break; - } - } - } - - return rsID; - } - - /** - * get the -1 * (log 10 of the error value) - * - * @return the log based error estimate - */ - public static double getNegLog10PError(DbSNPFeature feature) { - return 4; // -log10(0.0001) - } - - // - // What kind of variant are we? 
- // - // ---------------------------------------------------------------------- - public static boolean isSNP(DbSNPFeature feature) { - return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); - } - - public static boolean isMNP(DbSNPFeature feature) { - return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); - } - - public static String toMediumString(DbSNPFeature feature) { - String s = String.format("%s:%d:%s:%s", feature.getChr(), feature.getStart(), feature.getRsID(), Utils.join("",feature.getObserved())); - if (isSNP(feature)) s += ":SNP"; - if (isIndel(feature)) s += ":Indel"; - if (isHapmap(feature)) s += ":Hapmap"; - if (is2Hit2Allele(feature)) s += ":2Hit"; - return s; - } - - public static boolean isInsertion(DbSNPFeature feature) { - return feature.getVariantType().contains("insertion"); - } - - public static boolean isDeletion(DbSNPFeature feature) { - return feature.getVariantType().contains("deletion"); - } - - public static boolean isIndel(DbSNPFeature feature) { - return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature); - } - - public static boolean isComplexIndel(DbSNPFeature feature) { - return feature.getVariantType().contains("in-del"); - } - - public static boolean isHapmap(DbSNPFeature feature) { - return feature.getValidationStatus().contains("by-hapmap"); - } - - public static boolean is2Hit2Allele(DbSNPFeature feature) { - return feature.getValidationStatus().contains("by-2hit-2allele"); - } - - public static boolean is1000genomes(DbSNPFeature feature) { - return feature.getValidationStatus().contains("by-1000genomes"); - } - - public static boolean isMQ1(DbSNPFeature feature) { - return feature.getWeight() == 1; - } - - /** - * gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference base. 
This is returned as a string list with no guarantee ordering - * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest - * frequency). - * - * @return an alternate allele list - */ - public static List getAlternateAlleleList(DbSNPFeature feature) { - List ret = new ArrayList(); - for (String allele : getAlleleList(feature)) - if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); - return ret; - } - - public static boolean onFwdStrand(DbSNPFeature feature) { - return feature.getStrand() == Strand.POSITIVE; - } - - public static String getReference(DbSNPFeature feature) { - return feature.getNCBIRefBase(); - } - - public static String toSimpleString(DbSNPFeature feature) { - return String.format("%s:%s:%s", feature.getRsID(), feature.getObserved(), (feature.getStrand() == Strand.POSITIVE) ? "+" : "-"); - } - - /** - * gets the alleles. This method should return all the alleles present at the location, - * including the reference base. The first allele should always be the reference allele, followed - * by an unordered list of alternate alleles. 
- * - * @return an alternate allele list - */ - public static List getAlleleList(DbSNPFeature feature) { - List alleleList = new ArrayList(); - // add ref first - if ( onFwdStrand(feature) ) - alleleList = Arrays.asList(feature.getObserved()); - else - for (String str : feature.getObserved()) - alleleList.add(SequenceUtil.reverseComplement(str)); - if ( alleleList.size() > 0 && alleleList.contains(getReference(feature)) && !alleleList.get(0).equals(getReference(feature)) ) - Collections.swap(alleleList, alleleList.indexOf(getReference(feature)), 0); - - return alleleList; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 59d496828..608b5d1d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -1,21 +1,25 @@ package org.broadinstitute.sting.gatk.report; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.io.*; +import java.util.Collection; +import java.util.List; import java.util.TreeMap; /** * Container class for GATK report tables */ public class GATKReport { - private TreeMap tables; + public static final String GATKREPORT_HEADER_PREFIX = "##:GATKReport.v"; + private TreeMap tables = new TreeMap(); /** * Create a new, empty GATKReport. 
*/ public GATKReport() { - tables = new TreeMap(); } /** @@ -23,7 +27,7 @@ public class GATKReport { * @param filename the path to the file to load */ public GATKReport(String filename) { - loadReport(new File(filename)); + this(new File(filename)); } /** @@ -31,7 +35,6 @@ public class GATKReport { * @param file the file to load */ public GATKReport(File file) { - tables = new TreeMap(); loadReport(file); } @@ -46,11 +49,17 @@ public class GATKReport { GATKReportTable table = null; String[] header = null; int id = 0; + GATKReportVersion version = null; + List columnStarts = null; String line; while ( (line = reader.readLine()) != null ) { - if (line.startsWith("##:GATKReport.v0.1 ")) { - line = line.replaceFirst("##:GATKReport.v0.1 ", ""); + + if (line.startsWith(GATKREPORT_HEADER_PREFIX)) { + + version = GATKReportVersion.fromHeader(line); + + line = line.replaceFirst("##:GATKReport." + version.versionString + " ", ""); String[] pieces = line.split(" : "); String tableName = pieces[0]; @@ -58,14 +67,35 @@ public class GATKReport { addTable(tableName, tableDesc); table = getTable(tableName); + table.setVersion(version); header = null; - } else if ( line.isEmpty() ) { + columnStarts = null; + } else if ( line.trim().isEmpty() ) { // do nothing } else { if (table != null) { + + String[] splitLine; + + switch (version) { + case V0_1: + splitLine = TextFormattingUtils.splitWhiteSpace(line); + break; + + case V0_2: + if (header == null) { + columnStarts = TextFormattingUtils.getWordStarts(line); + } + splitLine = TextFormattingUtils.splitFixedWidth(line, columnStarts); + break; + + default: + throw new ReviewedStingException("GATK report version parsing not implemented for: " + line); + } + if (header == null) { - header = line.split("\\s+"); + header = splitLine; table.addPrimaryKey("id", false); @@ -75,10 +105,8 @@ public class GATKReport { id = 0; } else { - String[] entries = line.split("\\s+"); - for (int columnIndex = 0; columnIndex < header.length; columnIndex++) 
{ - table.set(id, header[columnIndex], entries[columnIndex]); + table.set(id, header[columnIndex], splitLine[columnIndex]); } id++; @@ -125,7 +153,10 @@ public class GATKReport { * @return the table object */ public GATKReportTable getTable(String tableName) { - return tables.get(tableName); + GATKReportTable table = tables.get(tableName); + if (table == null) + throw new ReviewedStingException("Table is not in GATKReport: " + tableName); + return table; } /** @@ -140,4 +171,8 @@ public class GATKReport { } } } + + public Collection getTables() { + return tables.values(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 440597754..347e870c8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -37,10 +37,10 @@ public class GATKReportColumn extends TreeMap { * tables, as the table gets written properly without having to waste storage for the unset elements (usually the zero * values) in the table. * - * @param primaryKey the primary key position in the column that should be set + * @param primaryKey the primary key position in the column that should be retrieved * @return the value at the specified position in the column, or the default value if the element is not set */ - public Object getWithoutSideEffects(Object primaryKey) { + private Object getWithoutSideEffects(Object primaryKey) { if (!this.containsKey(primaryKey)) { return defaultValue; } @@ -48,6 +48,16 @@ public class GATKReportColumn extends TreeMap { return this.get(primaryKey); } + /** + * Return an object from the column, but if it doesn't exist, return the default value. 
+ * + * @param primaryKey the primary key position in the column that should be retrieved + * @return the string value at the specified position in the column, or the default value if the element is not set + */ + public String getStringValue(Object primaryKey) { + return toString(getWithoutSideEffects(primaryKey)); + } + /** * Return the displayable property of the column. If true, the column will be displayed in the final output. * If not, printing will be suppressed for the contents of the table. @@ -67,7 +77,7 @@ public class GATKReportColumn extends TreeMap { for (Object obj : this.values()) { if (obj != null) { - int width = obj.toString().length(); + int width = toString(obj).length(); if (width > maxWidth) { maxWidth = width; @@ -77,4 +87,27 @@ public class GATKReportColumn extends TreeMap { return maxWidth; } + + /** + * Returns a string version of the values. + * @param obj The object to convert to a string + * @return The string representation of the column + */ + private static String toString(Object obj) { + String value; + if (obj == null) { + value = "null"; + } else if (obj instanceof Float) { + value = String.format("%.8f", (Float) obj); + } else if (obj instanceof Double) { + value = String.format("%.8f", (Double) obj); + } else { + value = obj.toString(); + } + return value; + } + + public String getColumnName() { + return columnName; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java new file mode 100644 index 000000000..a33631c85 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation 
the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import java.util.*; + +/** + * Tracks a linked list of GATKReportColumn in order by name. 
+ */ +public class GATKReportColumns extends LinkedHashMap { + private List columnNames = new ArrayList(); + + /** + * Returns the column by index + * @param i the index + * @return The column + */ + public GATKReportColumn getByIndex(int i) { + return get(columnNames.get(i)); + } + + @Override + public GATKReportColumn remove(Object key) { + columnNames.remove(key); + return super.remove(key); + } + + @Override + public GATKReportColumn put(String key, GATKReportColumn value) { + columnNames.add(key); + return super.put(key, value); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java deleted file mode 100644 index 6915d5cb2..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.report; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -public class GATKReportParser { - private List tables = new ArrayList(); - - public void parse(File file) throws IOException { - InputStream stream = FileUtils.openInputStream(file); - try { - parse(stream); - } finally { - IOUtils.closeQuietly(stream); - } - } - - public void parse(InputStream input) throws IOException { - GATKReportTableParser table = null; - - for (String line: new XReadLines(input)) { - if (line.startsWith("##:GATKReport.v0.1 ")) { - table = newTableParser(line); - tables.add(table); - table.parse(line); - } else if (table != null) { - if (line.trim().length() == 0) - table = null; - else - table.parse(line); - } - } - } - - public String getValue(String tableName, String[] key, String column) { - for (GATKReportTableParser table: tables) - if (table.getTableName().equals(tableName)) - return table.getValue(key, column); - return null; - } - - public String getValue(String tableName, String key, String column) { - for (GATKReportTableParser table: tables) - if (table.getTableName().equals(tableName)) - return table.getValue(key, column); - return null; - } - - private GATKReportTableParser newTableParser(String header) { - return new GATKReportTableParser(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index f7ea25696..3e3aa29a7 
100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.report; +import org.apache.commons.lang.ObjectUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.PrintStream; @@ -88,17 +89,22 @@ import java.util.regex.Pattern; * but at least the prototype contained herein works. * * @author Kiran Garimella + * @author Khalid Shakir */ public class GATKReportTable { + /** REGEX that matches any table with an invalid name */ + public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]"; + private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2; private String tableName; private String tableDescription; + private GATKReportVersion version = LATEST_REPORT_VERSION; private String primaryKeyName; private Collection primaryKeyColumn; private boolean primaryKeyDisplay; - boolean sortByPrimaryKey = true; + private boolean sortByPrimaryKey = true; - private LinkedHashMap columns; + private GATKReportColumns columns; /** * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed @@ -107,12 +113,25 @@ public class GATKReportTable { * @return true if the name is valid, false if otherwise */ private boolean isValidName(String name) { - Pattern p = Pattern.compile("[^a-zA-Z0-9_\\-\\.]"); + Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX); Matcher m = p.matcher(name); return !m.find(); } + /** + * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed + * + * @param description the name of the table or column + * @return true if the name is valid, false if otherwise + */ + private boolean isValidDescription(String description) { + Pattern p = Pattern.compile("\\r|\\n"); + Matcher m = p.matcher(description); + + return 
!m.find(); + } + /** * Construct a new GATK report table with the specified name and description * @@ -128,11 +147,23 @@ public class GATKReportTable { throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); } + if (!isValidDescription(tableDescription)) { + throw new ReviewedStingException("Attempted to set a GATKReportTable description of '" + tableDescription + "'. GATKReportTable descriptions must not contain newlines."); + } + this.tableName = tableName; this.tableDescription = tableDescription; this.sortByPrimaryKey = sortByPrimaryKey; - columns = new LinkedHashMap(); + columns = new GATKReportColumns(); + } + + public GATKReportVersion getVersion() { + return version; + } + + protected void setVersion(GATKReportVersion version) { + this.version = version; } /** @@ -161,6 +192,57 @@ public class GATKReportTable { primaryKeyDisplay = display; } + /** + * Returns the first primary key matching the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return The first primary key matching the column values or throws an exception. + */ + public Object getPrimaryKey(String dottedColumnValues) { + Object key = findPrimaryKey(dottedColumnValues); + if (key == null) + throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues); + return key; + } + + /** + * Returns true if there is at least on row with the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return true if there is at least one row matching the columns. + */ + public boolean containsPrimaryKey(String dottedColumnValues) { + return findPrimaryKey(dottedColumnValues) != null; + } + + /** + * Returns the first primary key matching the dotted column values. 
+ * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return The first primary key matching the column values or null. + */ + private Object findPrimaryKey(String dottedColumnValues) { + return findPrimaryKey(dottedColumnValues.split("\\.")); + } + + /** + * Returns the first primary key matching the column values. + * Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" } + * @param columnValues column values. + * @return The first primary key matching the column values. + */ + private Object findPrimaryKey(Object[] columnValues) { + for (Object primaryKey : primaryKeyColumn) { + boolean matching = true; + for (int i = 0; matching && i < columnValues.length; i++) { + matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1)); + } + if (matching) + return primaryKey; + } + return null; + } + /** * Add a column to the report and specify the default value that should be supplied if a given position in the table is never explicitly set. * @@ -230,6 +312,17 @@ public class GATKReportTable { return columns.get(columnName).get(primaryKey); } + /** + * Get a value from the given position in the table + * + * @param primaryKey the primary key value + * @param columnIndex the index of the column + * @return the value stored at the specified position in the table + */ + private Object get(Object primaryKey, int columnIndex) { + return columns.getByIndex(columnIndex).get(primaryKey); + } + /** * Increment an element in the table. This implementation is awful - a functor would probably be better. 
* @@ -515,7 +608,7 @@ public class GATKReportTable { String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; // Emit the table definition - out.printf("##:GATKReport.v0.1 %s : %s%n", tableName, tableDescription); + out.printf("##:GATKReport.%s %s : %s%n", LATEST_REPORT_VERSION.versionString, tableName, tableDescription); // Emit the table header, taking into account the padding requirement if the primary key is a hidden column boolean needsPadding = false; @@ -545,22 +638,8 @@ public class GATKReportTable { for (String columnName : columns.keySet()) { if (columns.get(columnName).isDisplayable()) { - Object obj = columns.get(columnName).getWithoutSideEffects(primaryKey); - if (needsPadding) { out.printf(" "); } - - String value = "null"; - if (obj != null) { - if (obj instanceof Float) { - value = String.format("%.8f", (Float) obj); - } else if (obj instanceof Double) { - value = String.format("%.8f", (Double) obj); - } else { - value = obj.toString(); - } - } - - //out.printf(columnWidths.get(columnName), obj == null ? 
"null" : obj.toString()); + String value = columns.get(columnName).getStringValue(primaryKey); out.printf(columnWidths.get(columnName), value); needsPadding = true; @@ -577,4 +656,16 @@ public class GATKReportTable { public int getNumRows() { return primaryKeyColumn.size(); } + + public String getTableName() { + return tableName; + } + + public String getTableDescription() { + return tableDescription; + } + + public GATKReportColumns getColumns() { + return columns; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java deleted file mode 100644 index 6fd9f9627..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.report; - -import org.apache.commons.lang.StringUtils; - -import java.util.*; - -public class GATKReportTableParser { - private int lineNum = 0; - private String[] descriptions; - private Map headers = new HashMap(); - private List values = new ArrayList(); - - public void parse(String line) { - lineNum++; - switch (lineNum) { - case 1: - descriptions = parseLine(line); - case 2: - String[] columnHeaders = parseLine(line); - for (int i = 0; i < columnHeaders.length; i++) - headers.put(columnHeaders[i], i); - default: - values.add(parseLine(line)); - } - } - - public String getTableName() { - return descriptions[1]; - } - - public String getValue(String[] key, String column) { - if (!headers.containsKey(column)) - return null; - for (String[] row: values) - if (Arrays.equals(key, Arrays.copyOfRange(row, 1, key.length + 1))) - return row[headers.get(column)]; - return null; - } - - public String getValue(String key, String column) { - return getValue(key.split("\\."), column); - } - - private String generateKey(String[] row, int i) { - return StringUtils.join(row, ".", 0, i); - } - - private String[] parseLine(String line) { - return line.split(" +"); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java new file mode 100644 index 000000000..5f1159a43 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * 
Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +public enum GATKReportVersion { + /** + * Differences between other versions: + * - Does not allow spaces in cells. + * - Mostly fixed width but has a bug where the string width of floating point + * values was not measured correctly leading to columns that aren't aligned + */ + V0_1("v0.1"), + + /** + * Differences between other versions: + * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". + * - Fixed width fixed for floating point values + */ + V0_2("v0.2"); + + public final String versionString; + + private GATKReportVersion(String versionString) { + this.versionString = versionString; + } + + @Override + public String toString() { + return versionString; + } + + /** + * Returns the GATK Report Version from the file header. + * @param header Header from the file starting with ##:GATKReport.v[version] + * @return The version as an enum. 
+ */ + public static GATKReportVersion fromHeader(String header) { + if (header.startsWith("##:GATKReport.v0.1 ")) + return GATKReportVersion.V0_1; + + if (header.startsWith("##:GATKReport.v0.2 ")) + return GATKReportVersion.V0_2; + + throw new ReviewedStingException("Unknown GATK report version in header: " + header); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 89a179d0e..27fd173cb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -115,12 +115,13 @@ public abstract class TraversalEngine,Provide LinkedList history = new LinkedList(); /** We use the SimpleTimer to time our run */ - private SimpleTimer timer = new SimpleTimer("Traversal"); + private SimpleTimer timer = null; // How long can we go without printing some progress info? private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? + private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 120 * 1000; // in milliseconds private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; @@ -209,11 +210,16 @@ public abstract class TraversalEngine,Provide } } /** - * Should be called to indicate that we're going to process records and the timer should start ticking + * Should be called to indicate that we're going to process records and the timer should start ticking. This + * function should be called right before any traversal work is done, to avoid counting setup costs in the + * processing costs and inflating the estimated runtime. 
*/ - public void startTimers() { - timer.start(); - lastProgressPrintTime = timer.currentTime(); + public void startTimersIfNecessary() { + if ( timer == null ) { + timer = new SimpleTimer("Traversal"); + timer.start(); + lastProgressPrintTime = timer.currentTime(); + } } /** @@ -224,7 +230,8 @@ public abstract class TraversalEngine,Provide * @return true if the maximum interval (in millisecs) has passed since the last printing */ private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { - return (curTime - lastPrintTime) > printFreq; + long elapsed = curTime - lastPrintTime; + return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 1ba48ca5f..046003154 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -173,7 +173,9 @@ public class TraverseDuplicates extends TraversalEngine those with the same mate pair position, for paired reads * -> those flagged as unpaired and duplicated but having the same start and end */ + boolean done = walker.isDone(); for (SAMRecord read : iter) { + if ( done ) break; // get the genome loc from the read GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); @@ -194,6 +196,7 @@ public class TraverseDuplicates extends TraversalEngine extends TraversalEngine,Locu logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); LocusView locusView = getLocusView( walker, dataProvider ); + boolean done = false; if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all @@ -46,7 +47,7 @@ public class TraverseLoci extends TraversalEngine,Locu LocusReferenceView 
referenceView = new LocusReferenceView( walker, dataProvider ); // We keep processing while the next reference location is within the interval - while( locusView.hasNext() ) { + while( locusView.hasNext() && ! done ) { AlignmentContext locus = locusView.next(); GenomeLoc location = locus.getLocation(); @@ -65,26 +66,28 @@ public class TraverseLoci extends TraversalEngine,Locu referenceView.expandBoundsToAccomodateLoc(location); } - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); - // create reference context. Note that if we have a pileup of "extended events", the context will // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). ReferenceContext refContext = referenceView.getReferenceContext(location); + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + final boolean keepMeP = walker.filter(tracker, refContext, locus); if (keepMeP) { M x = walker.map(tracker, refContext, locus); sum = walker.reduce(x, sum); + done = walker.isDone(); } printProgress(dataProvider.getShard(),locus.getLocation()); } } - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) { + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! 
RodLocusView rodLocusView = (RodLocusView)locusView; long nSkipped = rodLocusView.getLastSkippedBases(); if ( nSkipped > 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index 196d54036..dd4402d82 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -50,7 +50,9 @@ public class TraverseReadPairs extends TraversalEngine pairs = new ArrayList(); + boolean done = walker.isDone(); for(SAMRecord read: reads) { + if ( done ) break; dataProvider.getShard().getReadMetrics().incrementNumReadsSeen(); if(pairs.size() == 0 || pairs.get(0).getReadName().equals(read.getReadName())) { @@ -65,6 +67,8 @@ public class TraverseReadPairs extends TraversalEngine extends TraversalEngine,Read // get the reference ordered data ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + boolean done = walker.isDone(); // while we still have more reads for (SAMRecord read : reads) { + if ( done ) break; // ReferenceContext -- the reference bases covered by the read ReferenceContext refContext = null; @@ -106,6 +108,7 @@ public class TraverseReads extends TraversalEngine,Read GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? 
null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); printProgress(dataProvider.getShard(),locus); + done = walker.isDone(); } return sum; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java index 2541921e9..80cb30598 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Allows.java @@ -23,5 +23,4 @@ import java.lang.annotation.*; @Target(ElementType.TYPE) public @interface Allows { DataSource[] value(); - RMD[] referenceMetaData() default {}; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index ca4e3f5e3..bb65d9b09 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; @@ -50,44 +52,158 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads - * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles. 
+ * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences. + * + * + *

+ * It allows the user to clip bases in reads with poor quality scores, that match particular + * sequences, or that were generated by particular machine cycles. + * + *

+ *
Quality score based clipping
+ *
+ * Clip bases from the read in clipper from + *
argmax_x{ \sum_{i = x + 1}^{l} (qTrimmingThreshold - qual_i) }
+ * to the end of the read. This is blatantly stolen from BWA. + * + * Walk through the read from the end (in machine cycle order) to the beginning, calculating the + * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this + * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the + * clipping index in the read (from the end). + *
+ *
Cycle based clipping
+ *
Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. + * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). + * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12. + *
+ *
Sequence matching
+ *
Clips bases from the read that exactly match one of a number of base sequences. This employs an exact match algorithm, + * filtering only bases whose sequence exactly matches SEQ.
+ *
+ * + *

+ * + *

Input

+ *

+ * Any number of BAM files. + *

+ * + *

Output

+ *

+ * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping + * operation applied to each read. + *

+ *

+ *

Summary output

+ *
+ *     Number of examined reads              13
+ *     Number of clipped reads               13
+ *     Percent of clipped reads              100.00
+ *     Number of examined bases              988
+ *     Number of clipped bases               126
+ *     Percent of clipped bases              12.75
+ *     Number of quality-score clipped bases 126
+ *     Number of range clipped bases         0
+ *     Number of sequence clipped bases      0
+ *     
+ *

+ * + *

+ *

Example clipping

+ * Suppose we are given this read: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get: + * + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * Whereas with -CR WRITE_Q0S: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * Or -CR SOFTCLIP_BASES: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ *

+ * + *

Examples

+ *
+ *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
+ *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
+ * 
+ + * @author Mark DePristo + * @since 2010 */ @Requires({DataSource.READS}) -public class ClipReadsWalker extends ReadWalker { - @Output - PrintStream out; +public class ClipReadsWalker extends ReadWalker { + /** + * If provided, ClipReads will write summary statistics about the clipping operations applied + * to the reads to this file. + */ + @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false) + PrintStream out = null; /** - * an optional argument to dump the reads out to a BAM file + * The output SAM/BAM file will be written here */ - @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false) - StingSAMFileWriter outputBam = null; + @Output(doc = "Write BAM output here", required = true) + StingSAMFileWriter outputBam; - @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false) + /** + * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this + * quality score threshold. + */ + @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false) int qTrimmingThreshold = -1; - @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false) + /** + * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. + * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based + * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, + * and 12. 
+ */ + @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false) String cyclesToClipArg = null; - @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false) + /** + * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the + * sequences in the file. + */ + @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false) String clipSequenceFile = null; + /** + * Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line + */ @Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false) String[] clipSequencesArgs = null; - @Argument(fullName="read", doc="", required=false) - String onlyDoRead = null; - - //@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false) - //boolean keepCompletelyClippedReads = false; - -// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false) -// boolean onlyClipFirstSeqMatch = false; - + /** + * The different values for this argument determines how ClipReads applies clips to the reads. This can range + * from writing Ns over the clipped bases to hard clipping away the bases from the BAM. 
+ */ @Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false) ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS; + @Hidden + @Advanced + @Argument(fullName="read", doc="", required=false) + String onlyDoRead = null; /** * List of sequence that should be clipped from the reads @@ -180,12 +296,12 @@ public class ClipReadsWalker extends ReadWalker p : cyclesToClip) { // iterate over each cycle range int cycleStart = p.first; @@ -270,10 +391,13 @@ public class ClipReadsWalker extends ReadWalker clipSeqs) { + super(read); + data = new ClippingData(clipSeqs); + } + + public ClippingData getData() { + return data; + } + + public void setData(ClippingData data) { + this.data = data; + } + + public void addData(ClippingData data) { + this.data.addData(data); + } + } + + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index 4bfedb672..e2db1dc52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; @@ -17,7 +17,7 @@ import java.util.Set; * To change this template use File | Settings | File Templates. 
*/ @Requires({DataSource.READS,DataSource.REFERENCE}) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class}) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) public abstract class DuplicateWalker extends Walker { // Do we actually want to operate on the context? public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index b0b2687f4..8152f74c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -3,8 +3,8 @@ package org.broadinstitute.sting.gatk.walkers; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckReadFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter; +import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -18,7 +18,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @By(DataSource.READS) @Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.INTERVAL) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class}) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) public abstract class LocusWalker extends Walker { // Do we 
actually want to operate on the context? public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index 508d1f6ee..4d8be4800 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -25,15 +25,14 @@ package org.broadinstitute.sting.gatk.walkers; -import org.broad.tribble.dbsnp.DbSNPFeature; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; @@ -41,6 +40,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.io.PrintStream; import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** @@ -68,6 +68,9 @@ public class PileupWalker extends LocusWalker implements TreeR @Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events") public boolean SHOW_INDEL_PILEUPS = false; + @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", 
required=false) + public List> rods = Collections.emptyList(); + public void initialize() { } @@ -112,18 +115,11 @@ public class PileupWalker extends LocusWalker implements TreeR */ private String getReferenceOrderedData( RefMetaDataTracker tracker ) { ArrayList rodStrings = new ArrayList(); - for ( GATKFeature datum : tracker.getAllRods() ) { - if ( datum != null && datum.getUnderlyingObject() instanceof ReferenceOrderedDatum ) { - rodStrings.add(((ReferenceOrderedDatum)datum.getUnderlyingObject()).toSimpleString()); // TODO: Aaron: this line still survives, try to remove it - } + for ( Feature datum : tracker.getValues(rods) ) { + rodStrings.add(datum.toString()); } String rodString = Utils.join(", ", rodStrings); - DbSNPFeature dbsnp = tracker.lookup(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, DbSNPFeature.class); - - if ( dbsnp != null) - rodString += DbSNPHelper.toMediumString(dbsnp); - if ( !rodString.equals("") ) rodString = "[ROD: " + rodString + "]"; @@ -132,8 +128,6 @@ public class PileupWalker extends LocusWalker implements TreeR @Override public void onTraversalDone(Integer result) { - // Double check traversal result to make count is the same. - // TODO: Is this check necessary? 
out.println("[REDUCE RESULT] Traversal result is: " + result); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java index 158992a22..7960f5c35 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintRODsWalker.java @@ -25,21 +25,24 @@ package org.broadinstitute.sting.gatk.walkers; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import java.io.PrintStream; -import java.util.Iterator; /** * Prints out all of the RODs in the input data set. Data is rendered using the toString() method * of the given ROD. 
*/ public class PrintRODsWalker extends RodWalker { + @Input(fullName="input", shortName = "input", doc="The input ROD which should be printed out.", required=true) + public RodBinding input; + @Output PrintStream out; @@ -61,11 +64,8 @@ public class PrintRODsWalker extends RodWalker { if ( tracker == null ) return 0; - Iterator rods = tracker.getAllRods().iterator(); - while ( rods.hasNext() ) { - Object rod = rods.next().getUnderlyingObject(); - if (VariantContextAdaptors.canBeConvertedToVariantContext(rod) ) - out.println(rod.toString()); + for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) { + out.println(feature.toString()); } return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 7e1dcd707..4f072e88c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -40,26 +40,72 @@ import java.util.TreeSet; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; + /** - * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear - * in the input file. It can dynamically merge the contents of multiple input BAM files, resulting - * in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter - * command line argument. + * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. + * + *

+ * PrintReads can dynamically merge the contents of multiple input BAM files, resulting + * in merged output sorted in coordinate order. Can also optionally filter reads based on the + * --read_filter command line argument. + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * A single processed BAM file. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PrintReads \
+ *   -o output.bam \
+ *   -I input1.bam \
+ *   -I input2.bam \
+ *   --read_filter MappingQualityZero
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PrintReads \
+ *   -o output.bam \
+ *   -I input.bam \
+ *   -n 2000
+ * 
+ * */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReadsWalker extends ReadWalker { - /** an optional argument to dump the reads out to a BAM file */ + @Output(doc="Write output to this BAM filename instead of STDOUT") SAMFileWriter out; + @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) String readGroup = null; + + /** + * For example, --platform ILLUMINA or --platform 454. + */ @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false) - String platform = null; // E.g. ILLUMINA, 454 + String platform = null; + @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) int nReadsToPrint = -1; + + /** + * Only reads from samples listed in the provided file(s) will be included in the output. + */ @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) public Set sampleFile = new TreeSet(); + + /** + * Only reads from the sample(s) will be included in the output. + */ @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. 
Can be specified multiple times.", required=false) public Set sampleNames = new TreeSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java index ff3b6d82f..5f11686a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.picard.filter.SamRecordFilter; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.lang.annotation.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 384742302..c88c7c3c4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -26,11 +26,14 @@ package org.broadinstitute.sting.gatk.walkers; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.GenericDocumentationHandler; import java.util.List; @@ -44,6 +47,10 @@ import java.util.List; @ReadFilters(MalformedReadFilter.class) @PartitionBy(PartitionType.NONE) @BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@DocumentedGATKFeature( + groupName = "GATK walkers", + summary = "General tools available for running on the command line as part of the GATK package", + extraDocs = {CommandLineGATK.class}) public abstract class 
Walker { final protected static Logger logger = Logger.getLogger(Walker.class); private GenomeAnalysisEngine toolkit; @@ -119,6 +126,17 @@ public abstract class Walker { public void initialize() { } + /** + * A function for overloading in subclasses providing a mechanism to abort early from a walker. + * + * If this ever returns true, then the Traversal engine will stop executing map calls + * and start the process of shutting down the walker in an orderly fashion. + * @return + */ + public boolean isDone() { + return false; + } + /** * Provide an initial value for reduce computations. * @return Initial value of reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 3144098a8..cf68a9121 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -42,9 +43,9 @@ import java.util.List; import java.util.Map; -public class AlleleBalance implements InfoFieldAnnotation { +public class AlleleBalance extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map 
stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; @@ -89,7 +90,7 @@ public class AlleleBalance implements InfoFieldAnnotation { } // todo -- actually care about indel length from the pileup (agnostic at the moment) int refCount = indelPileup.size(); - int altCount = vc.isInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); + int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); if ( refCount + altCount == 0 ) { continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index a99f87a70..ddb7ab828 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.utils.MathUtils; @@ -15,9 +16,9 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class AlleleBalanceBySample implements GenotypeAnnotation, ExperimentalAnnotation { +public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, 
Genotype g) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java index 6c14e7445..dc41dbc81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java @@ -8,7 +8,7 @@ import java.util.Map; -public abstract class AnnotationByDepth implements InfoFieldAnnotation { +public abstract class AnnotationByDepth extends InfoFieldAnnotation { protected int annotationByVariantDepth(final Map genotypes, Map stratifiedContexts) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java index 66416ce11..ecfd9b707 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java @@ -34,6 +34,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -46,9 +47,9 @@ import java.util.List; import java.util.Map; -public class BaseCounts implements 
InfoFieldAnnotation { +public class BaseCounts extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 74f7f9d80..ad06dcf52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -43,14 +44,14 @@ import java.util.List; import java.util.Map; -public class ChromosomeCounts implements InfoFieldAnnotation, StandardAnnotation { +public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), new 
VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed"), new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes") }; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( ! vc.hasGenotypes() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index c384e0d09..a4d8db5bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -16,9 +17,9 @@ import java.util.List; import java.util.Map; -public class DepthOfCoverage implements InfoFieldAnnotation, StandardAnnotation { +public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map 
annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index e3e8bc258..1652c8de7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; @@ -22,13 +23,13 @@ import java.util.List; import java.util.Map; -public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnotation { +public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { private static String REF_ALLELE = "REF"; private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { if ( g == null || !g.isCalled() ) return null; 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 97ed221e7..0cfca48fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -28,6 +28,7 @@ import cern.jet.math.Arithmetic; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; @@ -42,11 +43,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class FisherStrand implements InfoFieldAnnotation, StandardAnnotation { +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( ! 
vc.isVariant() || vc.isFiltered() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 48677bbe5..a46473f60 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.BaseUtils; @@ -16,9 +17,9 @@ import java.util.List; import java.util.Map; -public class GCContent implements InfoFieldAnnotation, ExperimentalAnnotation { +public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { double content = computeGCContent(ref); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", content)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java index cca0ad4bc..5295d6d21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; @@ -23,11 +24,11 @@ import java.util.Map; */ // A set of annotations calculated directly from the GLs -public class GLstats implements InfoFieldAnnotation, StandardAnnotation { +public class GLstats extends InfoFieldAnnotation implements StandardAnnotation { private static final int MIN_SAMPLES = 10; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index b175579f1..9af3b8e8e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -29,6 +29,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; @@ -48,13 +49,13 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { +public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation { private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if (stratifiedContexts.size() == 0 ) // size 0 means that call was made by someone else and we have no data here return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index d86728d5e..045505698 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -4,6 +4,7 @@ import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; import org.broadinstitute.sting.utils.QualityUtils; @@ -18,13 +19,13 @@ import java.util.List; import java.util.Map; -public class HardyWeinberg implements InfoFieldAnnotation, WorkInProgressAnnotation { +public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation { private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; private static final int MIN_NEG_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { final Map genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index 02efd854c..463f7a645 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.GenomeLoc; @@ -16,11 +17,11 @@ import java.util.List; import java.util.Map; -public class HomopolymerRun implements InfoFieldAnnotation, StandardAnnotation { +public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnotation { private boolean ANNOTATE_INDELS = true; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( !vc.isBiallelic() ) return null; @@ -78,7 +79,7 @@ public class HomopolymerRun implements InfoFieldAnnotation, StandardAnnotation { GenomeLoc locus = ref.getLocus(); GenomeLoc window = ref.getWindow(); int refBasePos = (int) (locus.getStart() - window.getStart())+1; - if ( vc.isDeletion() ) { + if ( vc.isSimpleDeletion() ) { // check that deleted bases are the same byte dBase = bases[refBasePos]; for ( int i = 0; i < vc.getReference().length(); i ++ ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index 2fd62ddf3..bfede40d2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.IndelUtils; @@ -19,9 +20,9 @@ import java.util.*; * Time: 11:47:33 AM * To change this template use File | Settings | File Templates. */ -public class IndelType implements InfoFieldAnnotation, ExperimentalAnnotation { +public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { int run; if (vc.isMixed()) { @@ -35,9 +36,9 @@ public class IndelType implements InfoFieldAnnotation, ExperimentalAnnotation { if (!vc.isBiallelic()) type = "MULTIALLELIC_INDEL"; else { - if (vc.isInsertion()) + if (vc.isSimpleInsertion()) type = "INS."; - else if (vc.isDeletion()) + else if (vc.isSimpleDeletion()) type = "DEL."; else type = "OTHER."; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java index 1d999c531..09ffe0fb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -16,9 
+17,9 @@ import java.util.List; import java.util.Map; -public class LowMQ implements InfoFieldAnnotation { +public class LowMQ extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index f240d02bc..f9caae227 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -18,9 +19,9 @@ import java.util.List; import java.util.Map; -public class MappingQualityZero implements InfoFieldAnnotation, StandardAnnotation { +public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( 
stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index 0ca53adf2..3d234a1e3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; @@ -49,9 +50,9 @@ import java.util.Map; * Time: 6:46:25 PM * To change this template use File | Settings | File Templates. 
*/ -public class MappingQualityZeroBySample implements GenotypeAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, - AlignmentContext context, VariantContext vc, Genotype g) { +public class MappingQualityZeroBySample extends GenotypeAnnotation { + public Map annotate(RefMetaDataTracker tracker, + AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) { if ( g == null || !g.isCalled() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java index 08a25a7e3..3e8fe8998 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -18,9 +19,9 @@ import java.util.Map; -public class MappingQualityZeroFraction implements InfoFieldAnnotation, ExperimentalAnnotation { +public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, 
ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java index 1c70a1b33..74c562045 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -21,8 +22,8 @@ import java.util.Map; * Date: 5/16/11 */ -public class NBaseCount implements InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { +public class NBaseCount extends InfoFieldAnnotation { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 2175d39e6..9a292c39a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -3,7 +3,7 @@ package 
org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -16,9 +16,9 @@ import java.util.List; import java.util.Map; -public class QualByDepth extends AnnotationByDepth implements InfoFieldAnnotation, StandardAnnotation { +public class QualByDepth extends AnnotationByDepth implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index d52f07b58..668129888 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; @@ -20,9 +21,9 @@ import java.util.List; import java.util.Map; -public class RMSMappingQuality implements InfoFieldAnnotation, StandardAnnotation { +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 5466828f6..52c704055 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; @@ -21,11 +22,11 @@ import java.util.Map; -public abstract class RankSumTest implements InfoFieldAnnotation, StandardAnnotation { +public abstract class RankSumTest extends InfoFieldAnnotation implements 
StandardAnnotation { static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java index c56e2622d..26ca08380 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; @@ -52,13 +53,13 @@ import java.util.Map; * Time: 3:59:27 PM * To change this template use File | Settings | File Templates. 
*/ -public class ReadDepthAndAllelicFractionBySample implements GenotypeAnnotation { +public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { private static String REF_ALLELE = "REF"; private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { if ( g == null || !g.isCalled() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java index a5ebd8db2..180bed24d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -18,7 +19,7 @@ import java.util.Map; public class SBByDepth extends AnnotationByDepth { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ff9092a71..cd396036f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -41,9 +42,9 @@ import java.util.List; import java.util.Map; -public class SampleList implements InfoFieldAnnotation { +public class SampleList extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( vc.isMonomorphic() || !vc.hasGenotypes() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java new file mode 100644 index 000000000..4ead77506 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the 
"Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * A set of genomic annotations based on the output of the SnpEff variant effect predictor tool + 
* (http://snpeff.sourceforge.net/). + * + * For each variant, chooses one of the effects of highest biological impact from the SnpEff + * output file (which must be provided on the command line via --snpEffFile filename.vcf), + * and adds annotations on that effect. + * + * @author David Roazen + */ +public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotation { + + private static Logger logger = Logger.getLogger(SnpEff.class); + + // We refuse to parse SnpEff output files generated by unsupported versions, or + // lacking a SnpEff version number in the VCF header: + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; + + // SnpEff aggregates all effects (and effect metadata) together into a single INFO + // field annotation with the key EFF: + public static final String SNPEFF_INFO_FIELD_KEY = "EFF"; + public static final String SNPEFF_EFFECT_METADATA_DELIMITER = "[()]"; + public static final String SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER = "\\|"; + + // Key names for the INFO field annotations we will add to each record, along + // with parsing-related information: + public enum InfoFieldKey { + EFFECT_KEY ("SNPEFF_EFFECT", -1), + IMPACT_KEY ("SNPEFF_IMPACT", 0), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), + EXON_ID_KEY ("SNPEFF_EXON_ID", 7); + + // Actual text of the key + private final String keyName; + + // Index within the effect metadata subfields from the SnpEff EFF annotation + // where each key's associated value can be found during parsing. 
+ private final int fieldIndex; + + InfoFieldKey ( String keyName, int fieldIndex ) { + this.keyName = keyName; + this.fieldIndex = fieldIndex; + } + + public String getKeyName() { + return keyName; + } + + public int getFieldIndex() { + return fieldIndex; + } + } + + // Possible SnpEff biological effects. All effect names found in the SnpEff input file + // are validated against this list. + public enum EffectType { + NONE, + CHROMOSOME, + INTERGENIC, + UPSTREAM, + UTR_5_PRIME, + UTR_5_DELETED, + START_GAINED, + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + CDS, + GENE, + TRANSCRIPT, + EXON, + EXON_DELETED, + NON_SYNONYMOUS_CODING, + SYNONYMOUS_CODING, + FRAME_SHIFT, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + STOP_GAINED, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, + STOP_LOST, + INTRON, + UTR_3_PRIME, + UTR_3_DELETED, + DOWNSTREAM, + INTRON_CONSERVED, + INTERGENIC_CONSERVED, + REGULATION, + CUSTOM, + WITHIN_NON_CODING_GENE + } + + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. + public enum EffectImpact { + LOW (1), + MODERATE (2), + HIGH (3); + + private final int severityRating; + + EffectImpact ( int severityRating ) { + this.severityRating = severityRating; + } + + public boolean isHigherImpactThan ( EffectImpact other ) { + return this.severityRating > other.severityRating; + } + } + + // SnpEff labels most effects as either CODING or NON_CODING, but sometimes omits this information. 
+ public enum EffectCoding { + CODING, + NON_CODING, + UNKNOWN + } + + + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { + validateRodBinding(walker.getSnpEffRodBinding()); + checkSnpEffVersion(walker, toolkit); + } + + public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { + RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); + + // Get only SnpEff records that start at this locus, not merely span it: + List snpEffRecords = tracker.getValues(snpEffRodBinding, ref.getLocus()); + + // Within this set, look for a SnpEff record whose ref/alt alleles match the record to annotate. + // If there is more than one such record, we only need to pick the first one, since the biological + // effects will be the same across all such records: + VariantContext matchingRecord = getMatchingSnpEffRecord(snpEffRecords, vc); + if ( matchingRecord == null ) { + return null; + } + + // Parse the SnpEff INFO field annotation from the matching record into individual effect objects: + List effects = parseSnpEffRecord(matchingRecord); + if ( effects.size() == 0 ) { + return null; + } + + // Add only annotations for one of the most biologically-significant effects from this set: + SnpEffEffect mostSignificantEffect = getMostSignificantEffect(effects); + return mostSignificantEffect.getAnnotations(); + } + + private void validateRodBinding ( RodBinding snpEffRodBinding ) { + if ( snpEffRodBinding == null || ! 
snpEffRodBinding.isBound() ) { + throw new UserException("The SnpEff annotator requires that a SnpEff VCF output file be provided " + + "as a rodbinding on the command line via the --snpEffFile option, but " + + "no SnpEff rodbinding was found."); + } + } + + private void checkSnpEffVersion ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { + RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); + + VCFHeader snpEffVCFHeader = VCFUtils.getVCFHeadersFromRods(toolkit, Arrays.asList(snpEffRodBinding.getName())).get(snpEffRodBinding.getName()); + VCFHeaderLine snpEffVersionLine = snpEffVCFHeader.getOtherHeaderLine(SNPEFF_VCF_HEADER_VERSION_LINE_KEY); + + if ( snpEffVersionLine == null || snpEffVersionLine.getValue() == null || snpEffVersionLine.getValue().trim().length() == 0 ) { + throw new UserException("Could not find a " + SNPEFF_VCF_HEADER_VERSION_LINE_KEY + " entry in the VCF header for the SnpEff " + + "input file, and so could not verify that the file was generated by a supported version of SnpEff (" + + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS) + ")"); + } + + String snpEffVersionString = snpEffVersionLine.getValue().replaceAll("\"", "").split(" ")[0]; + + if ( ! isSupportedSnpEffVersion(snpEffVersionString) ) { + throw new UserException("The version of SnpEff used to generate the SnpEff input file (" + snpEffVersionString + ") " + + "is not currently supported by the GATK. 
Supported versions are: " + Arrays.toString(SUPPORTED_SNPEFF_VERSIONS)); + } + } + + private boolean isSupportedSnpEffVersion ( String versionString ) { + for ( String supportedVersion : SUPPORTED_SNPEFF_VERSIONS ) { + if ( supportedVersion.equals(versionString) ) { + return true; + } + } + + return false; + } + + private VariantContext getMatchingSnpEffRecord ( List snpEffRecords, VariantContext vc ) { + for ( VariantContext snpEffRecord : snpEffRecords ) { + if ( snpEffRecord.hasSameAlternateAllelesAs(vc) && snpEffRecord.getReference().equals(vc.getReference()) ) { + return snpEffRecord; + } + } + + return null; + } + + private List parseSnpEffRecord ( VariantContext snpEffRecord ) { + List parsedEffects = new ArrayList(); + + Object effectFieldValue = snpEffRecord.getAttribute(SNPEFF_INFO_FIELD_KEY); + List individualEffects; + + // The VCF codec stores multi-valued fields as a List, and single-valued fields as a String. + // We can have either in the case of SnpEff, since there may be one or more than one effect in this record. + if ( effectFieldValue instanceof List ) { + individualEffects = (List)effectFieldValue; + } + else { + individualEffects = Arrays.asList((String)effectFieldValue); + } + + for ( String effectString : individualEffects ) { + String[] effectNameAndMetadata = effectString.split(SNPEFF_EFFECT_METADATA_DELIMITER); + + if ( effectNameAndMetadata.length != 2 ) { + logger.warn(String.format("Malformed SnpEff effect field at %s:%d, skipping: %s", + snpEffRecord.getChr(), snpEffRecord.getStart(), effectString)); + continue; + } + + String effectName = effectNameAndMetadata[0]; + String[] effectMetadata = effectNameAndMetadata[1].split(SNPEFF_EFFECT_METADATA_SUBFIELD_DELIMITER, -1); + + SnpEffEffect parsedEffect = new SnpEffEffect(effectName, effectMetadata); + + if ( parsedEffect.isWellFormed() ) { + parsedEffects.add(parsedEffect); + } + else { + logger.warn(String.format("Skipping malformed SnpEff effect field at %s:%d. Error was: \"%s\". 
Field was: \"%s\"", + snpEffRecord.getChr(), snpEffRecord.getStart(), parsedEffect.getParseError(), effectString)); + } + } + + return parsedEffects; + } + + private SnpEffEffect getMostSignificantEffect ( List effects ) { + SnpEffEffect mostSignificantEffect = null; + + for ( SnpEffEffect effect : effects ) { + if ( mostSignificantEffect == null || + effect.isHigherImpactThan(mostSignificantEffect) ) { + + mostSignificantEffect = effect; + } + } + + return mostSignificantEffect; + } + + public List getKeyNames() { + return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), + InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), + InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), + InfoFieldKey.GENE_NAME_KEY.getKeyName(), + InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), + InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), + InfoFieldKey.EXON_ID_KEY.getKeyName() + ); + } + + public List getDescriptions() { + return Arrays.asList( + new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 
1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") + ); + } + + /** + * Helper class to parse, validate, and store a single SnpEff effect and its metadata. + */ + protected static class SnpEffEffect { + private EffectType effect; + private EffectImpact impact; + private String codonChange; + private String aminoAcidChange; + private String geneName; + private String geneBiotype; + private EffectCoding coding; + private String transcriptID; + private String exonID; + + private String parseError = null; + private boolean isWellFormed = true; + + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + + // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, + // errors come after warnings, not vice versa: + private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; + private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + + private static final int SNPEFF_CODING_FIELD_INDEX = 5; + + public SnpEffEffect ( String effectName, String[] effectMetadata ) { + parseEffectName(effectName); + parseEffectMetadata(effectMetadata); + } + + private void parseEffectName ( String effectName ) { + try { + effect = EffectType.valueOf(effectName); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("%s is not a recognized effect type", effectName)); + } + } + + private void parseEffectMetadata 
( String[] effectMetadata ) { + if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { + parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + } + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { + parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + } + else { + parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d", + EXPECTED_NUMBER_OF_METADATA_FIELDS, effectMetadata.length)); + } + + return; + } + + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; + aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; + geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; + geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + + if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { + try { + coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); + } + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect coding: %s", effectMetadata[SNPEFF_CODING_FIELD_INDEX])); + } + } + else { + coding = EffectCoding.UNKNOWN; + } + + transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()]; + exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()]; + } + + private void parseError ( String message ) { + isWellFormed = false; + + // Cache only the first error encountered: + if ( parseError == null ) { + parseError = message; + } + } + + public 
boolean isWellFormed() { + return isWellFormed; + } + + public String getParseError() { + return parseError == null ? "" : parseError; + } + + public boolean isCoding() { + return coding == EffectCoding.CODING; + } + + public boolean isHigherImpactThan ( SnpEffEffect other ) { + // If one effect is within a coding gene and the other is not, the effect that is + // within the coding gene has higher impact: + + if ( isCoding() && ! other.isCoding() ) { + return true; + } + else if ( ! isCoding() && other.isCoding() ) { + return false; + } + + // Otherwise, both effects are either in or not in a coding gene, so we compare the impacts + // of the effects themselves: + + return impact.isHigherImpactThan(other.impact); + } + + public Map getAnnotations() { + Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length)); + + addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); + addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); + addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); + addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); + addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); + addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); + addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); + + return annotations; + } + + private void addAnnotation ( Map annotations, String keyName, String keyValue ) { + // Only add annotations for keys associated with non-empty values: + if ( keyValue != null && keyValue.trim().length() > 0 ) { + annotations.put(keyName, keyValue); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index a4668eeb6..42203824f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -16,9 +17,9 @@ import java.util.List; import java.util.Map; -public class SpanningDeletions implements InfoFieldAnnotation, StandardAnnotation { +public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java index b46d82d8b..fa48c57a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -24,12 +25,12 @@ import java.util.Map; * Time: 3:14 PM * To change this template use File | Settings | File Templates. */ -public class TechnologyComposition implements ExperimentalAnnotation,InfoFieldAnnotation { +public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation { private String nSLX = "NumSLX"; private String n454 ="Num454"; private String nSolid = "NumSOLiD"; private String nOther = "NumOther"; - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index acbeee3b2..fb3dbc3cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -25,15 +25,16 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; +import 
org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationType; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.BaseUtils; @@ -47,26 +48,103 @@ import java.util.*; /** - * Annotates variant calls with context information. Users can specify which of the available annotations to use. + * Annotates variant calls with context information. + * + *

+ * VariantAnnotator is a GATK tool for annotating variant calls based on their context. + * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself. + * + *

Input

+ *

+ * A variant set to annotate and optionally one or more BAM files. + *

+ * + *

Output

+ *

+ * An annotated VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantAnnotator \
+ *   -I input.bam \
+ *   -o output.vcf \
+ *   -A DepthOfCoverage \
+ *   --variant input.vcf \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ -@Requires(value={},referenceMetaData=@RMD(name="variant",type=VariantContext.class)) +@Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) @By(DataSource.REFERENCE) -public class VariantAnnotator extends RodWalker { +public class VariantAnnotator extends RodWalker implements AnnotatorCompatibleWalker { + + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + public RodBinding getVariantRodBinding() { return variantCollection.variants; } + + /** + * The INFO field will be annotated with information on the most biologically-significant effect + * listed in the SnpEff output file for each variant. + */ + @Input(fullName="snpEffFile", shortName = "snpEffFile", doc="A SnpEff output file from which to add annotations", required=false) + public RodBinding snpEffFile; + public RodBinding getSnpEffRodBinding() { return snpEffFile; } + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a record in the 'variant' track overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + /** + * An external resource VCF file or files from which to annotate. 
+ * + * One can add annotations from one of the resource VCFs to the output. + * For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource', + * you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position. + * If multiple records in the rod overlap the given position, one is chosen arbitrarily. + */ + @Input(fullName="resource", shortName = "resource", doc="external resource VCF file", required=false) + public List> resources = Collections.emptyList(); + public List> getResourceRodBindings() { return resources; } @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) - protected String sampleName = null; - + /** + * See the -list argument to view available annotations. + */ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(); + /** + * See the -list argument to view available groups. + */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected List annotationGroupsToUse = new ArrayList(); + /** + * This option enables you to add annotations from one VCF to another. + * + * For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource', + * you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position. + * If multiple records in the rod overlap the given position, one is chosen arbitrarily. 
+ */ @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false) protected List expressionsToUse = new ArrayList(); @@ -84,8 +162,6 @@ public class VariantAnnotator extends RodWalker { @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; - private HashMap nonVCFsampleName = new HashMap(); - private VariantAnnotatorEngine engine; private Collection indelBufferContext; @@ -118,32 +194,27 @@ public class VariantAnnotator extends RodWalker { listAnnotationsAndExit(); // get the list of all sample names from the variant VCF input rod, if applicable - Set rodName = new HashSet(); - rodName.add("variant"); + List rodName = Arrays.asList(variantCollection.variants.getName()); Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - // add the non-VCF sample from the command-line, if applicable - if ( sampleName != null ) { - nonVCFsampleName.put(sampleName.toUpperCase(), "variant"); - samples.add(sampleName.toUpperCase()); - } - // if there are no valid samples, warn the user if ( samples.size() == 0 ) { logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired."); } if ( USE_ALL_ANNOTATIONS ) - engine = new VariantAnnotatorEngine(getToolkit()); + engine = new VariantAnnotatorEngine(this, getToolkit()); else - engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse); + engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit()); engine.initializeExpressions(expressionsToUse); + engine.invokeAnnotationInitializationMethods(); + // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones Set hInfo = new HashSet(); 
hInfo.addAll(engine.getVCFAnnotationDescriptions()); - for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant")) ) { + for ( VCFHeaderLine line : VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } @@ -202,7 +273,7 @@ public class VariantAnnotator extends RodWalker { if ( tracker == null ) return 0; - Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); if ( VCs.size() == 0 ) return 0; @@ -219,18 +290,18 @@ public class VariantAnnotator extends RodWalker { if ( stratifiedContexts != null ) { annotatedVCs = new ArrayList(VCs.size()); for ( VariantContext vc : VCs ) - annotatedVCs.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); + annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } } if ( ! indelsOnly ) { for ( VariantContext annotatedVC : annotatedVCs ) - vcfWriter.add(annotatedVC, ref.getBase()); + vcfWriter.add(annotatedVC); } else { // check to see if the buffered context is different (in location) this context if ( indelBufferContext != null && ! 
VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),indelBufferContext.iterator().next()).equals(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),annotatedVCs.iterator().next())) ) { for ( VariantContext annotatedVC : indelBufferContext ) - vcfWriter.add(annotatedVC, ref.getBase()); + vcfWriter.add(annotatedVC); indelBufferContext = annotatedVCs; } else { indelBufferContext = annotatedVCs; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index fdf498a3d..68cd07803 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -25,85 +25,75 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.GenomicAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.JoinTable; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -import java.util.Map.Entry; public class VariantAnnotatorEngine { - public static final String dbPrefix = "comp"; - private List requestedInfoAnnotations; private List requestedGenotypeAnnotations; private List requestedExpressions = new ArrayList(); - private HashMap dbAnnotations = new HashMap(); - - // command-line option from GenomicAnnotator. - private Map> requestedColumnsMap; - - // command-line option from GenomicAnnotator. - private boolean oneToMany; - - // command-line option from GenomicAnnotator. - private List joinTables; - - // used by GenomicAnnotator. Maps binding name to number of output VCF records - // annotated with records from the input table with this binding name. Only used for - // printing out stats at the end. 
- private Map inputTableHitCounter = new HashMap(); + private HashMap, String> dbAnnotations = new HashMap, String>(); + private AnnotatorCompatibleWalker walker; + private GenomeAnalysisEngine toolkit; private static class VAExpression { - public String fullName, bindingName, fieldName; - public VAExpression(String fullEpression) { + public String fullName, fieldName; + public RodBinding binding; + + public VAExpression(String fullEpression, List> bindings) { int indexOfDot = fullEpression.lastIndexOf("."); if ( indexOfDot == -1 ) throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format"); fullName = fullEpression; - bindingName = fullEpression.substring(0, indexOfDot); fieldName = fullEpression.substring(indexOfDot+1); + + String bindingName = fullEpression.substring(0, indexOfDot); + for ( RodBinding rod : bindings ) { + if ( rod.getName().equals(bindingName) ) { + binding = rod; + break; + } + } } } // use this constructor if you want all possible annotations - public VariantAnnotatorEngine(GenomeAnalysisEngine engine) { + public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { + this.walker = walker; + this.toolkit = toolkit; requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations(); requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations(); - initializeDBs(engine); + initializeDBs(); } // use this constructor if you want to select specific annotations (and/or interfaces) - public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List annotationGroupsToUse, List annotationsToUse) { + public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { + this.walker = walker; + this.toolkit = toolkit; initializeAnnotations(annotationGroupsToUse, annotationsToUse); - initializeDBs(engine); + initializeDBs(); } // select specific expressions to use 
public void initializeExpressions(List expressionsToUse) { // set up the expressions for ( String expression : expressionsToUse ) - requestedExpressions.add(new VAExpression(expression)); + requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse) { @@ -112,17 +102,25 @@ public class VariantAnnotatorEngine { requestedGenotypeAnnotations = AnnotationInterfaceManager.createGenotypeAnnotations(annotationGroupsToUse, annotationsToUse); } - private void initializeDBs(GenomeAnalysisEngine engine) { + private void initializeDBs() { // check to see whether comp rods were included - List dataSources = engine.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - dbAnnotations.put(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, VCFConstants.DBSNP_KEY); - } - else if ( source.getName().startsWith(dbPrefix) ) { - dbAnnotations.put(source.getName(), source.getName().substring(dbPrefix.length())); - } + final RodBinding dbsnp = walker.getDbsnpRodBinding(); + if ( dbsnp != null && dbsnp.isBound() ) + dbAnnotations.put(dbsnp, VCFConstants.DBSNP_KEY); + + final List> comps = walker.getCompRodBindings(); + for ( RodBinding rod : comps ) + dbAnnotations.put(rod, rod.getName()); + } + + public void invokeAnnotationInitializationMethods() { + for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { + annotation.initialize(walker, toolkit); + } + + for ( VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.initialize(walker, toolkit); } } @@ -134,13 +132,13 @@ public class VariantAnnotatorEngine { descriptions.addAll(annotation.getDescriptions()); for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( Map.Entry dbSet : dbAnnotations.entrySet() ) - 
descriptions.add(new VCFInfoHeaderLine(dbSet.getValue(), 0, VCFHeaderLineType.Flag, (dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ? "dbSNP" : dbSet.getValue()) + " Membership")); + for ( String db : dbAnnotations.values() ) + descriptions.add(new VCFInfoHeaderLine(db, 0, VCFHeaderLineType.Flag, (db.equals(VCFConstants.DBSNP_KEY) ? "dbSNP" : db) + " Membership")); return descriptions; } - public Collection annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); @@ -150,60 +148,31 @@ public class VariantAnnotatorEngine { // annotate expressions where available annotateExpressions(tracker, ref, infoAnnotations); - // process the info field - List> infoAnnotationOutputsList = new LinkedList>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file - infoAnnotationOutputsList.add(new LinkedHashMap(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line. - infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in - // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) - { - Map annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc); - if ( annotationsFromCurrentType == null ) { - continue; - } - - if(annotationType instanceof GenomicAnnotation) - { - infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType ); - } - else - { - // add the annotations to each output line. 
- for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - infoAnnotationOutput.putAll(annotationsFromCurrentType); - } - } + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc); + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); } - // annotate genotypes - Map genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc); + // generate a new annotated VC + final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations); - // create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList - Collection returnValue = new LinkedList(); - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, infoAnnotationOutput) ); - } - - return returnValue; + // annotate genotypes, creating another new VC in the process + return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc)); } private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { - for ( Map.Entry dbSet : dbAnnotations.entrySet() ) { - if ( dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - String rsID = null; - - if (vc.isSNP()) - rsID = DbSNPHelper.rsIDOfFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - else if (vc.isIndel()) - rsID = DbSNPHelper.rsIDOfFirstRealIndel(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null ); + for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { + if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { + String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null); // annotate dbsnp id if available and not already there if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) ) infoAnnotations.put(VariantContext.ID_KEY, rsID); } else { boolean overlapsComp = false; - for ( VariantContext comp : tracker.getVariantContexts(ref, dbSet.getKey(), null, ref.getLocus(), false, false) ) { + for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { if ( !comp.isFiltered() ) { overlapsComp = true; break; @@ -216,7 +185,7 @@ public class VariantAnnotatorEngine { private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getVariantContexts(ref, expression.bindingName, null, ref.getLocus(), false, true); + Collection VCs = 
tracker.getValues(expression.binding, ref.getLocus()); if ( VCs.size() == 0 ) continue; @@ -241,7 +210,7 @@ public class VariantAnnotatorEngine { Map genotypeAnnotations = new HashMap(genotype.getAttributes()); for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - Map result = annotation.annotate(tracker, ref, context, vc, genotype); + Map result = annotation.annotate(tracker, walker, ref, context, vc, genotype); if ( result != null ) genotypeAnnotations.putAll(result); } @@ -250,390 +219,4 @@ public class VariantAnnotatorEngine { return genotypes; } - - // Finish processing data from GenomicAnnotation. - private List> processGenomicAnnotation( List> infoAnnotationOutputsList, Map annotationsForCurrentLocusFromAllAnnotatorInputTables) - { - - //process the map returned by GenomicAnnotation. This completes processing of the -B args. - for( Map.Entry annotationsFromOneInputTable : annotationsForCurrentLocusFromAllAnnotatorInputTables.entrySet() ) - { - final String inputTableBindingName = annotationsFromOneInputTable.getKey(); - final List> matchingRecords = (List>) annotationsFromOneInputTable.getValue(); - - if( matchingRecords.size() > 1 && oneToMany) - { - //More than one record matched in this file. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size(). - infoAnnotationOutputsList = explodeInfoAnnotationOutputsList( infoAnnotationOutputsList, matchingRecords, inputTableBindingName ); - } - else - { - //This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will - //all be added to the same output line, with keys disambiguated by appending _i . - addToExistingAnnotationOutputs( infoAnnotationOutputsList, matchingRecords, inputTableBindingName ); - } - } - - //process -J args - if(joinTables != null) - { - //for each joinTable, join it with the data in the info-field of each output line. 
- for(JoinTable joinTable : joinTables) - { - //for each info field, join it to the current join table - final List> previousInfoAnnotationOutputsList = new LinkedList>(infoAnnotationOutputsList); //create a shallow copy because infoAnnotationOutputsList will change during the iteration. - for(Map outputRecordInfoField : previousInfoAnnotationOutputsList) - { - infoAnnotationOutputsList = performJoin( infoAnnotationOutputsList, outputRecordInfoField, joinTable ); - } - } - } - - //apply -S args last to select the columns requested by the user - if(requestedColumnsMap != null) { - infoAnnotationOutputsList = applySelectArg(infoAnnotationOutputsList); - } - - return infoAnnotationOutputsList; - } - - // Performs a join between the an info field record represented by outputRecordInfoField and the infoAnnotationOutputsList. - private List> performJoin( List> infoAnnotationOutputsList, Map outputRecordInfoField, JoinTable joinTable) - { - //System.err.println("Looking at: " + joinTable.getLocalBindingName()+ "- join to " + joinTable.getExternalBindingName() + "." + joinTable.getExternalColumnName() ); - //for the current joinTable, for each output line, find the externalJoinColumnValue and see if it matches the joinColumnValue of any record(s) in this joinTable. - final String externalBindingName = joinTable.getExternalBindingName(); - final String externalColumnName = joinTable.getExternalColumnName(); - final String fullyQualifiedExternalColumnName = GenomicAnnotation.generateInfoFieldKey(externalBindingName, externalColumnName); - - //find the externalJoinColumnValue in the current info field, and then look up any joinTable records that have this value for the localJoinColumnValue - ArrayList matchingJoinTableRecord = null; //record in the join table whose joinColumnValue matches the joinColumnValue inside the current outputRecordInfoField. 
- final Object numInfoFieldKeysToCheckObj = outputRecordInfoField.get(GenomicAnnotation.generateInfoFieldKey(externalBindingName, GenomicAnnotation.NUM_MATCHES_SPECIAL_INFO_FIELD)); - if(numInfoFieldKeysToCheckObj == null) { - //only 1 record in the externalBindingName -B AnnotationInfoTable overlapped the current position - Object externalColumnValue = outputRecordInfoField.get(fullyQualifiedExternalColumnName); - if(externalColumnValue != null) { - matchingJoinTableRecord = joinTable.get(externalColumnValue.toString()); - //System.err.println("Found matching record in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords); - } - } else { - //multiple records in the externalBindingName -B AnnotationInfoTable overlapped the current position - final int numInfoFieldKeysToCheck = Integer.parseInt(numInfoFieldKeysToCheckObj.toString()); - for (int i = 0; i < numInfoFieldKeysToCheck; i++) { - final Object externalColumnValue = outputRecordInfoField.get(fullyQualifiedExternalColumnName + "_" + i); - if ( externalColumnValue != null ) { - matchingJoinTableRecord = joinTable.get(externalColumnValue.toString()); - if ( matchingJoinTableRecord != null ) { - //System.err.println("Found matching record(s) in join table for record: " + outputRecordInfoField + " where " + fullyQualifiedExternalColumnName + "==" + externalColumnValue + ": " + matchingJoinTableRecords); - break; - } - } - } - } - - //if a match for the externalJoinColumnValue in the current outputRecordInfoField has been found in the join table, perform the join. - if ( matchingJoinTableRecord != null ) - { - final String joinTableBindingName = joinTable.getLocalBindingName(); - - //convert the List> to List> by hashing the values from the ArrayList by their column names. 
- final List> matchingJoinTableRecordsConverted = new LinkedList>(); - final List columnNames = joinTable.getColumnNames(); - - final Map matchingRecord = new LinkedHashMap(); - for (int i = 0; i < columnNames.size(); i++) - matchingRecord.put(columnNames.get(i), matchingJoinTableRecord.get(i)); - - matchingJoinTableRecordsConverted.add(GenomicAnnotation.convertRecordToAnnotations(joinTableBindingName, matchingRecord)); - - // do the join between the outputRecordInfoField and the matchingJoinTableRecords, then add the results to to infoAnnotationOutputsList - List> tempList = new LinkedList>(); - tempList.add(outputRecordInfoField); - if( matchingJoinTableRecordsConverted.size() > 1 && oneToMany) - { - //More than one record in the joinTable matched the current info field. After this, infoAnnotationOutputsList.size() will be infoAnnotationOutputsList.size()*matchingRecords.size(). - tempList = explodeInfoAnnotationOutputsList( tempList, matchingJoinTableRecordsConverted, joinTableBindingName ); - } - else - { - //This doesn't change infoAnnotationOutputsList.size(). If more than one record matched, their annotations will - //all be added to the same output line, with keys disambiguated by appending _i . - addToExistingAnnotationOutputs( tempList, matchingJoinTableRecordsConverted, joinTableBindingName ); - } - - infoAnnotationOutputsList.remove(outputRecordInfoField); //remove the old info field - infoAnnotationOutputsList.addAll(tempList); //add the new info field(s) that have been joined with the matchingJoinTableRecords - } - return infoAnnotationOutputsList; - } - - - // Implements not-oneToMany mode, where the output lines have a one-to-one relationship - // with the input variants, and all multiple-match records are collapsed into the single info field. 
- // The collapsing is done by appending an _i to each key name (where 'i' is a record counter), as well - // as a special bindingName.numMatchingRecords=n key-value pair which specifies the upper limit of the counter. - private void addToExistingAnnotationOutputs( - final List> infoAnnotationOutputsList, - final List> matchingRecords, - final String bindingName) { - //For each matching record, just add its annotations to all existing output lines. - final boolean renameKeys = matchingRecords.size() > 1; - for(int i = 0; i < matchingRecords.size(); i++) { - Map currentRecord = matchingRecords.get(i); - - if(renameKeys) { - //Rename keys to avoid naming conflicts. After this all keys from the i'th matching record will have _i appended to them. - // (This solves the following problem: if you have multiple dbsnp matches - such as dbSNP.avHet=value1 from record 1 and - // dbSNP.avHet=value2 from record 2, the keys will be renamed to dbSNP.avHet_1=value1 and dbSNP.avHet_2=value2 ) - Map currentRecordWithRenamedKeys = new LinkedHashMap(); - for(final Map.Entry annotation : currentRecord.entrySet()) { - currentRecordWithRenamedKeys.put(annotation.getKey() + "_" + (i + 1), annotation.getValue()); - } - currentRecordWithRenamedKeys.put(GenomicAnnotation.generateInfoFieldKey(bindingName, GenomicAnnotation.NUM_MATCHES_SPECIAL_INFO_FIELD), - Integer.toString(matchingRecords.size())); //add the special field that specifies how many matchingRecords there were. - currentRecord = currentRecordWithRenamedKeys; - } - - //Add the annotations from this record to each output line. - for(Map outputRecordInfoField : infoAnnotationOutputsList) { - outputRecordInfoField.putAll(currentRecord); - } - } - - incrementStatsCounter(bindingName, infoAnnotationOutputsList.size()); - } - - /** - * Records statistics that will be printed when GenomicAnnotator finishes. 
- * - * @param bindingName The table from which annotations were gotten - * @param numNewRecords The number of new output VCF records created with annotations from this table - */ - private void incrementStatsCounter( final String bindingName, int numNewRecords) { - //record some stats - there were infoAnnotationOutputsList.size() output VCF records annotated with data from the 'bindingName' input table. - Integer counter = inputTableHitCounter.get(bindingName); - if( counter == null ) { - inputTableHitCounter.put(bindingName, numNewRecords); //init the counter - } else { - inputTableHitCounter.put(bindingName, counter + numNewRecords); //increment the counter - } - } - - // Implements oneToMany mode. Takes the current infoAnnotationOutputsList - // (where each element represents a line in the output VCF file), and - // generates a new infoAnnotationOutputsList which contains one copy of the current - // infoAnnotationOutputs for each record matchingRecords. - // The returned list will have size: - // infoAnnotationOutputsList.size() * matchingRecords.size() - private List> explodeInfoAnnotationOutputsList( - final List> infoAnnotationOutputsList, - final List> matchingRecords, - final String bindingName) { - - - //This is the return value. It represents the new list of lines in the output VCF file. - final List> newInfoAnnotationOutputsList = new LinkedList>(); - - //For each matching record, generate a new output line - for(int i = 0; i < matchingRecords.size(); i++) { - Map annotationsForRecord = matchingRecords.get(i); - - //Add the annotations from this record to each output line. - for(Map outputRecordInfoField : infoAnnotationOutputsList) { - Map outputRecordInfoFieldCopy = new LinkedHashMap(outputRecordInfoField); //create a new copy of this line. - outputRecordInfoFieldCopy.putAll(annotationsForRecord); //Adds the column-value pairs from this record to this line. 
- - newInfoAnnotationOutputsList.add(outputRecordInfoFieldCopy); //Add the line to the new list of lines. - } - } - - recordStats(bindingName, newInfoAnnotationOutputsList.size(), infoAnnotationOutputsList, matchingRecords.size()); - - return newInfoAnnotationOutputsList; - } - - - /** - * Records statistics for the explodeInfoAnnotationOutputsList(..) calculation. - * @param bindingName The table from which annotations were gotten - * @param numNewVCFRecordsAnnotatedWithBindingNameData The number of new output VCF records created with annotations from this table - * @param infoAnnotationOutputsList output list - * @param matchingRecordsSize matching records size - */ - private void recordStats( final String bindingName, int numNewVCFRecordsAnnotatedWithBindingNameData, final List> infoAnnotationOutputsList, int matchingRecordsSize ) { - - //update stats for the 'bindingName' table - incrementStatsCounter(bindingName, numNewVCFRecordsAnnotatedWithBindingNameData); //All records in newInfoAnnotationOutputsList were annotated with data from bindingName. 
- - //update stats for all other tables besides 'bindingName' - for(String otherBindingName : inputTableHitCounter.keySet()) { - if(otherBindingName.equals(bindingName)) { - continue; - } - - //count how many records in the initial infoAnnotationOutputsList were annotated with data from otherBindingName - int numAnnotatedWithOtherBindingNameData = 0; - for(Map outputRecordInfoField : infoAnnotationOutputsList) { - for(String outputRecordInfoFieldKey : outputRecordInfoField.keySet()) { - if(outputRecordInfoFieldKey.contains(otherBindingName)) { - //this record has some annotations from the otherBindingName table - numAnnotatedWithOtherBindingNameData++; - break; - } - } - } - - if(numAnnotatedWithOtherBindingNameData > 0) { - //numAnnotatedWithOtherBindingNameData * (matchingRecordsSize - 1) is how many additional output VCF records were created with annotations from otherBindingName - incrementStatsCounter(otherBindingName, numAnnotatedWithOtherBindingNameData * (matchingRecordsSize - 1)); - } - } - } - - - // Applies the -S arg to the results - private List> applySelectArg( final List> infoAnnotationOutputsList ) - { - final List> newInfoAnnotationOutputList = new LinkedList>(); - for(final Map outputRecordInfoField : infoAnnotationOutputsList) { - final Map newOutputRecordInfoField = new LinkedHashMap(); - for(final Entry keyValue : outputRecordInfoField.entrySet()) { - if(!isKeyFilteredOutBySelectArg(keyValue.getKey())) { - newOutputRecordInfoField.put(keyValue.getKey(), keyValue.getValue()); - } - } - newInfoAnnotationOutputList.add(newOutputRecordInfoField); - } - - return newInfoAnnotationOutputList; - } - - - /** - * Determines whether to exclude the given column from the annotations. - * @param key The fully qualified columnName - * @return Whether the -S arg specifies that this column should be included in the annotations. 
- * - * TODO this function can be optimized through memoization - */ - private boolean isKeyFilteredOutBySelectArg(String key) - { - for(final String bindingName : requestedColumnsMap.keySet()) { - - if(key.contains(bindingName)) { - final Set selectArgsWithThisBindingName = requestedColumnsMap.get(bindingName); - for(final String selectArgWithThisBindingName : selectArgsWithThisBindingName) { - if(key.contains(selectArgWithThisBindingName)) { - return false; //this key matches one of the -s args, so the user explicitly requested this key - } - } - if(!selectArgsWithThisBindingName.isEmpty()) { - return true; //the -S arg contains some keys with this binding name, but doesn't include this key - } - } - } - - return false; //the -S arg doesn't have anything with the same binding name as this key, so the user implicitly requested this key - } - - - - - /** - * Determines how the engine will handle the case where multiple records in a ROD file - * overlap a particular single locus. If oneToMany is set to true, the output will be - * one-to-many, so that each locus in the input VCF file could result in multiple - * entries in the output VCF file. Otherwise, the output will be one-to-one, and - * all multiple-match records will be collapsed into the single info field. - * The collapsing is done by appending an _i to each key name (where 'i' is a - * record counter). - * - * See class-level comments for more details. - * - * @param oneToMany true if we should break out from one to many - */ - public void setOneToMany(boolean oneToMany) { - this.oneToMany = oneToMany; - } - - /** - * Sets the columns that will be used for the info annotation field. - * Column names should be of the form bindingName.columnName (eg. dbsnp.avHet). - * - * @param columns An array of strings where each string is a comma-separated list - * of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ). 
- */ - public void setRequestedColumns(String[] columns) { - if(columns == null) { - throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg."); - } - - //System.err.println("COLUMNS: "+Arrays.asList(columns).toString()); - - this.requestedColumnsMap = parseColumnsArg(columns); - } - - - /** - * Passes in a pointer to the JoinTables. - * - * @param joinTables The list of JoinTables. There should be one JoinTable object for each -J arg. - */ - public void setJoinTables(List joinTables) { - this.joinTables = joinTables; - } - - - /** - * Parses the columns arg and returns a Map of columns hashed by their binding name. - * For example: - * The command line: - * -s dbSnp.valid,dbsnp.avHet -s refGene.txStart,refGene.txEnd - * - * will be passed to this method as: - * ["dbSnp.valid,dbsnp.avHet", "refGene.txStart,refGene.txEnd"] - * - * resulting in a return value of: - * { - * "dbSnp" -> "dbSnp.valid" , - * "dbSnp" -> "dbsnp.avHet" , - * "refGene" -> "refGene.txStart", - * "refGene" -> "refGene.txEnd" - * } - * - * @param columnsArg The -s command line arg value. - * - * @return Map representing a parsed version of this arg - see above. - */ - private static Map> parseColumnsArg(String[] columnsArg) { - Map> result = new HashMap>(); - - for(String s : columnsArg) { - for(String columnSpecifier : s.split(",") ) { - String[] rodNameColumnName = columnSpecifier.split("\\."); - if(rodNameColumnName.length != 2) { - throw new IllegalArgumentException("The following column specifier in the -s arg is invalid: [" + columnSpecifier + "]. 
It must be of the form 'bindingName.columnName'."); - } - String rodName = rodNameColumnName[0]; - //String columnName = rodNameColumnName[1]; - - Set requestedColumns = result.get(rodName); - if(requestedColumns == null) { - requestedColumns = new HashSet(); - result.put(rodName, requestedColumns); - } - requestedColumns.add(columnSpecifier); - } - } - - return result; - } - - - //Returns a map containing stats on how many output vcf records were annotated from each database - public Map getInputTableHitCounter() { - return Collections.unmodifiableMap(inputTableHitCounter); - } - - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java deleted file mode 100644 index 05c1b3c52..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; -import java.util.Map.Entry; - -/** - * This plugin for {@link VariantAnnotatorEngine} serves as the core - * of the {@link GenomicAnnotator}. It finds all records in the -B input files - * that match the given variant's position and, optionally, the variant's reference and alternate alleles. 
- * - * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class GenomicAnnotation implements InfoFieldAnnotation { - - public static final String CHR_COLUMN = "chr"; - public static final String START_COLUMN = "start"; - public static final String END_COLUMN = "end"; - public static final String HAPLOTYPE_REFERENCE_COLUMN = "haplotypeReference"; - public static final String HAPLOTYPE_ALTERNATE_COLUMN = "haplotypeAlternate"; - - public static final String NUM_MATCHES_SPECIAL_INFO_FIELD = "numMatchingRecords"; - - /** Characters that aren't allowed within VCF info field key-value pairs */ - public static final char[] ILLEGAL_INFO_FIELD_VALUES = { ' ', '=', ';' }; - /** Replacement for each character in ILLEGAL_INFO_FIELD_VALUES */ - public static final char[] ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES = { '_', '-', '!' }; - - - private void modifyAnnotationsForIndels(VariantContext vc, String featureName, Map annotationsForRecord) { - String inCodingRegionKey = featureName + ".inCodingRegion"; - String referenceCodonKey = featureName + ".referenceCodon"; - String variantCodonKey = featureName + ".variantCodon"; - String codingCoordStrKey = featureName + ".codingCoordStr"; - String proteinCoordStrKey = featureName + ".proteinCoordStr"; - String haplotypeReferenceKey = featureName + "." + HAPLOTYPE_REFERENCE_COLUMN; - String haplotypeAlternateKey = featureName + "." + HAPLOTYPE_ALTERNATE_COLUMN; - String functionalClassKey = featureName + ".functionalClass"; - String startKey = featureName + "." + START_COLUMN; - String endKey = featureName + "." 
+ END_COLUMN; - String referenceAAKey = featureName + ".referenceAA"; - String variantAAKey = featureName + ".variantAA"; - String changesAAKey = featureName + ".changesAA"; - - annotationsForRecord.put(variantCodonKey, "unknown"); - annotationsForRecord.put(codingCoordStrKey, "unknown"); - annotationsForRecord.put(proteinCoordStrKey, "unknown"); - annotationsForRecord.put(referenceAAKey, "unknown"); - annotationsForRecord.put(variantAAKey, "unknown"); - - String refAllele = vc.getReference().getDisplayString(); - if (refAllele.length() == 0) { refAllele = "-"; } - - String altAllele = vc.getAlternateAllele(0).toString(); - if (altAllele.length() == 0) { altAllele = "-"; } - - annotationsForRecord.put(haplotypeReferenceKey, refAllele); - annotationsForRecord.put(haplotypeAlternateKey, altAllele); - annotationsForRecord.put(startKey, String.format("%d", vc.getStart())); - annotationsForRecord.put(endKey, String.format("%d", vc.getEnd())); - - boolean isCodingRegion = annotationsForRecord.containsKey(inCodingRegionKey) && annotationsForRecord.get(inCodingRegionKey).equalsIgnoreCase("true") ? true : false; - boolean isFrameshift = (vc.getIndelLengths().get(0) % 3 == 0) ? false : true; - - String functionalClass; - if (isCodingRegion) { - functionalClass = isFrameshift ? "frameshift" : "inframe"; - annotationsForRecord.put(changesAAKey, "true"); - } else { - functionalClass = "noncoding"; - } - - annotationsForRecord.put(functionalClassKey, functionalClass); - } - - /** - * For each -B input file, for each record which overlaps the current locus, generates a - * set of annotations of the form: - * - * bindingName.columnName1=columnValue, bindingName.columnName2=columnValue2, etc. - * - * For example: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc. 
- * - * @return The following is an explanation of this method's return value: - * - * The annotations from a matching in a particular file are stored in a Map - * where the key is bindingName.columnName and the value is the columnValue. - * Since a single input file can have multiple records that overlap the current - * locus (eg. dbSNP can have multiple entries for the same genomic position), a different - * Map is created for each matching record in a particular file. - * The set of matching records for each file is then represented as a List> - * - * The return value of this method is a Map of the form: - * rodName1 -> List> - * rodName2 -> List> - * rodName3 -> List> - * ... - * Where the rodNames are the -B binding names for each file that were specified on the command line (eg. -B bindingName,AnnotatorInputTable,/path/to/file). - * - * NOTE: The lists (List>) are guaranteed to have size > 0 - * because a rodName -> List> entry will only - * be created in Map if the List has at least one element. - */ - public Map annotate(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc) { - - //iterate over each record that overlaps the current locus, and, if it passes certain filters, - //add its values to the list of annotations for this locus. - final Map annotations = new HashMap(); - for(final GATKFeature gatkFeature : tracker.getAllRods()) - { - final String name = gatkFeature.getName(); - if( name.equals("variant") || name.equals("interval") ) { - continue; - } - - if( ! (gatkFeature.getUnderlyingObject() instanceof AnnotatorInputTableFeature) ) { - continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns. 
- } - - final Map annotationsForRecord = convertRecordToAnnotations( gatkFeature.getName(), ((AnnotatorInputTableFeature) gatkFeature.getUnderlyingObject()).getColumnValues()); - - //If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the - //alleles specified match the the variant's reference allele and alternate allele. - //If they don't match, this record will be skipped, and its values will not be used for annotations. - // - //If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match. - //Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two - //are exactly equal (case-insensitive). - - //The HAPLOTYPE_REFERENCE_COLUMN matches the variant's reference allele based on a case-insensitive string comparison. - //The HAPLOTYPE_ALTERNATE_COLUMN can optionally list more than allele separated by one of these chars: ,\/:| - // only check this value for SNPs - String hapAltValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) ) : null; - if ( hapAltValue != null && !hapAltValue.equals("*") ) { - Set alternateAlleles = vc.getAlternateAlleles(); - //if(alternateAlleles.isEmpty()) { - //handle a site that has been called monomorphic reference - //alternateAlleles.add(vc.getReference()); - //continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right? - //} else - if(alternateAlleles.size() > 1) { - throw new UserException.MalformedFile("File associated with " + vc.getSource() + " contains record [" + vc + "] contains " + alternateAlleles.size() + " alternate alleles. 
GenomicAnnotion currently only supports annotating 1 alternate allele."); - } - - Allele vcAlt; - if(alternateAlleles.isEmpty()) { - vcAlt = vc.getReference(); - } else { - vcAlt = alternateAlleles.iterator().next(); - } - - boolean matchFound = false; - for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) { - - if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) { - matchFound = true; - break; - } - } - if(!matchFound) { - continue; //skip record - none of its alternate alleles match the variant's alternate allele - } - } - - // only check this value for SNPs - String hapRefValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) ) : null; - if(hapRefValue != null) - { - hapRefValue = hapRefValue.trim(); - if(!hapRefValue.equals("*")) - { - //match against hapolotypeReference. - Allele vcRef = vc.getReference(); - if(!vcRef.basesMatch(hapRefValue)) { - continue; //skip record - } - } - } - - if (vc.isIndel()) { - modifyAnnotationsForIndels(vc, name, annotationsForRecord); - } - - //filters passed, so add this record. - List> listOfMatchingRecords = (List>) annotations.get( name ); - if(listOfMatchingRecords == null) { - listOfMatchingRecords = new LinkedList>(); - listOfMatchingRecords.add( annotationsForRecord ); - annotations.put(name, listOfMatchingRecords); - } else { - listOfMatchingRecords.add( annotationsForRecord ); - } - } - - return annotations; - } - - - - - /** - * Converts the given record to a set of key-value pairs of the form: - * bindingName.columnName1=column1Value, bindingName.columnName2=column2Value - * (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A) - * - * @param record AnnotatorInputTableFeature corresponding to one record in one -B input file. - * @param bindingName The binding name of the given AnnotatorInputTableFeature. - * @return The map of columnName -> columnValue pairs. 
- */ - public static Map convertRecordToAnnotations( String bindingName, Map record) { - final Map result = new HashMap(); - - for(final Entry entry : record.entrySet()) { - final String value = entry.getValue(); - if(!value.trim().isEmpty()) { - result.put( generateInfoFieldKey(bindingName, entry.getKey()), scrubInfoFieldValue(entry.getValue())); - } - } - - return result; - } - - /** - * Combines the 2 values into a full key. - * @param rodBindingName -B name - * @param columnName column name - * @return info field key - */ - public static String generateInfoFieldKey(String rodBindingName, String columnName ) { - return rodBindingName + '.' + columnName; - } - - - - /** - * Replaces any characters that are not allowed in the info field of a VCF file. - * - * @param value info field value - * @return the value with any illegal characters replaced by legal ones. - */ - private static String scrubInfoFieldValue(String value) { - for(int i = 0; i < GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES.length; i++) { - value = value.replace(GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES[i], GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES[i]); - } - - return value; - } - - - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine("GenericAnnotation", 1, VCFHeaderLineType.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position.")); - } - - public List getKeyNames() { - return Arrays.asList("GenericAnnotation"); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java deleted file mode 100644 index b42310780..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * 
Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.File; -import java.io.IOException; -import java.util.*; -import java.util.Map.Entry; - -/** - * Annotates variant calls with information from user-specified tabular files. 
- * - * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class)) -@By(DataSource.REFERENCE) -public class GenomicAnnotator extends RodWalker implements TreeReducible { - - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName="vcfOutput", shortName="vcf", doc="Please use --out instead", required=false) - @Deprecated - protected String oldOutArg; - - @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) - protected String sampleName = null; - - @Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B:mydbsnp,AnnotatorInputTable /path/to/mydbsnp.txt -B:mytable,AnnotatorInputTable /path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false) - protected String[] SELECT_COLUMNS = {}; - - @Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B:name,AnnotatorInputTable /path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false) - protected String[] JOIN_ARGS = {}; - - @Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. 
If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false) - protected Boolean ONE_TO_MANY = false; - - @Argument(fullName="maxJoinTableSize", shortName="maxJoin", doc="The maximum allowed size (i.e. number of rows) for a table provided with the -J argument", required=false) - protected Integer MAX_JOIN_TABLE_SIZE = 500000; - - @Argument(fullName="ignoreFilteredSites", shortName="noFilt", doc="If specified, don't annotate sites marked as filtered out") - protected Boolean IGNORE_FILTERED_SITES = false; - - private VariantAnnotatorEngine engine; - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - - //read all ROD file headers and construct a set of all column names to be used for validation of command-line args - final Set allFullyQualifiedColumnNames = new LinkedHashSet(); - final Set allBindingNames = new LinkedHashSet(); - for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { - if(! ds.getType().equals(AnnotatorInputTableCodec.class)) { - continue; //skip all non-AnnotatorInputTable files. - } - final String bindingName = ds.getName(); - File file = ds.getFile(); - allBindingNames.add(bindingName); - try { - final ArrayList header = AnnotatorInputTableCodec.readHeader(file); - for(String columnName : header) { - allFullyQualifiedColumnNames.add(bindingName + "." + columnName); - } - } catch(IOException e) { - throw new UserException.CouldNotReadInputFile(file, "Failed when attempting to read file header. ", e); - } - } - - //parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. 
This end result of this loop is to populate the List of joinTables with one entry per -J arg. - final List joinTables = new LinkedList(); - for(String joinArg : JOIN_ARGS) { - - //parse the tokens - final String[] arg = joinArg.split(","); - if(arg.length != 3) { - throw new UserException.BadArgumentValue("-J", "The following -J arg: \"" + joinArg + "\" must contain 3 comma-separated values. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - final String bindingName = arg[0]; - final String filename = arg[1]; - final String columnsToJoin = arg[2]; - - if(allBindingNames.contains(bindingName)) { - throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used in another binding."); - } - - String[] splitOnEquals = columnsToJoin.split("=+"); - if(splitOnEquals.length != 2) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - String[] splitOnDot1 = splitOnEquals[0].split("\\."); - String[] splitOnDot2 = splitOnEquals[1].split("\\."); - if(splitOnDot1.length != 2 || splitOnDot2.length != 2) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. 
(ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - final String bindingName1 = splitOnDot1[0]; - final String columnName1 = splitOnDot1[1]; - final String bindingName2 = splitOnDot2[0]; - final String columnName2 = splitOnDot2[1]; - - //figure out which of the 2 binding names within the = relation matches the -J bindingName - final String localBindingName = bindingName; //alias - final String localColumnName; - final String externalBindingName; - final String externalColumnName; - if(bindingName1.equals(bindingName)) { - localColumnName = columnName1; - externalBindingName = bindingName2; - externalColumnName = columnName2; - } else if(bindingName2.equals(bindingName)) { - localColumnName = columnName2; - externalBindingName = bindingName1; - externalColumnName = columnName1; - } else { - throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" must be specified in one the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - //validate externalColumnName - final String fullyQualifiedExternalColumnName = externalBindingName + '.' + externalColumnName; - if( !allFullyQualifiedColumnNames.contains(fullyQualifiedExternalColumnName) ) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + fullyQualifiedExternalColumnName + "\""); - } - - //read in the file contents into a JoinTable object - final JoinTable joinTable = new JoinTable(MAX_JOIN_TABLE_SIZE); - joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName); - joinTables.add(joinTable); - - //validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args. 
- final List columnNames = joinTable.getColumnNames(); - final List fullyQualifiedColumnNames = new LinkedList(); - boolean found = false; - for ( String columnName : columnNames ) { - if ( columnName.equals(localColumnName) ) - found = true; - fullyQualifiedColumnNames.add(localBindingName + '.' + columnName); - } - if ( !found ) - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); - - allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames); - } - - //parse the SELECT_COLUMNS arg and validate the column names - List parsedSelectColumns = new LinkedList(); - for ( String token : SELECT_COLUMNS ) - parsedSelectColumns.addAll(Arrays.asList(token.split(","))); - SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS); - - for ( String columnName : SELECT_COLUMNS ) { - if ( !allFullyQualifiedColumnNames.contains(columnName) ) - throw new UserException.BadArgumentValue("-s", "The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. 
Here is the list of available column names: " + allFullyQualifiedColumnNames); - } - - //instantiate the VariantAnnotatorEngine - ArrayList annotationsToUse = new ArrayList(); - annotationsToUse.add("GenomicAnnotation"); - engine = new VariantAnnotatorEngine(getToolkit(), new ArrayList(), annotationsToUse); - engine.setOneToMany(ONE_TO_MANY); - engine.setRequestedColumns(SELECT_COLUMNS); - engine.setJoinTables(joinTables); - - // set up the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant"))); - hInfo.addAll(engine.getVCFAnnotationDescriptions()); - - Set rodName = new HashSet(); - rodName.add("variant"); - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - VCFHeader vcfHeader = new VCFHeader(hInfo, samples); - vcfWriter.writeHeader(vcfHeader); - } - - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** - * We want reads that span deletions - * - * @return true - */ - public boolean includeReadsWithDeletionAtLoci() { return true; } - - /** - * For each site of interest, annotate based on the requested annotation types - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Set results = new LinkedHashSet(); - for (VariantContext vc : tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false)) { - if ( (vc.isFiltered() && IGNORE_FILTERED_SITES) || - (vc.isVariant() && !vc.isBiallelic()) ) { - results.add(vc); - } else { - Map stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context); - if ( stratifiedContexts != null ) - 
results.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); - else - results.add(vc); - } - } - - for ( VariantContext vc : results ) - vcfWriter.add(vc ,ref.getBase()); - - return 1; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - public void onTraversalDone(Integer sum) { - - //out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords); - Map inputTableHitCounter = engine.getInputTableHitCounter(); - for ( Entry e : inputTableHitCounter.entrySet() ) { - final String bindingName = e.getKey(); - final int counter = e.getValue(); - //final float percent = 100 * counter /(float) totalOutputVCFRecords; - //out.printf(" %-6.1f%% (%d) annotated with %s.\n", percent, counter, bindingName ); - System.out.printf(" %d annotated with %s.\n", counter, bindingName ); - } - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java deleted file mode 100755 index 714f374cf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -/** - * This is a container that holds all data corresponding to a single join table as specified by one -J arg (ex: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2). - * Some terminology: - * 'bindingName' is an arbitrary label for a given table that is specified on the command line with either the -B or -J arg. - * In the example above, bindingName1 is the 'local' binding name because it is attached to the join table file provided with this -J arg. bindingName2 is the 'external' binding name because - * it corresponds to some other table specified previously with another -B or -J arg. - * - * The JoinTable object stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName1.columnName in the above example), - * and the entry value is an ArrayList representing the entire join table record. 
- * The JoinTable object also stores some other join table parameters such as the column names that were parsed out of the file header, and the bindingNames and columnNames from the -J arg. - * - * The join operation is performed by looking up the value of the join column in the external table (the one that this table is being joined to), and then using this value to do a lookup - * on the map - if there's a hit, it will provide the record from the join table that is to be joined with the record in the external table. - * - * More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class JoinTable -{ - //the list of join table column names parsed out of the file header. - private List columnNames; //not fully-qualified - - private String localBindingName; - private String externalBindingName; - private String externalColumnName; - - //stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName.columnName in the above example), - //and the entry value is an ArrayList representing the entire join table record. - private HashMap> joinColumnValueToRecords = new HashMap>(); - - private int maxSize; - private boolean parsedFromFile = false; - - public JoinTable(int maxSize) { - this.maxSize = maxSize; - } - - /** - * Parses the table from the given file using the JoinTableParser. - * - * @param filename The file containing the table. - * @param localBindingName The binding name within the given file to join on. - * @param localColumnName The column name within the given file to join on. - * @param externalBindingName The binding name of another file (previously specified with either -B or -J). - * @param externalColumnName The column name in this other file to join on. 
- */ - public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName) { - if(parsedFromFile) { - throw new ReviewedStingException("parseFromFile(" + filename +", ..) called more than once"); - } - parsedFromFile = true; - - setLocalBindingName(localBindingName); - setExternalBindingName(externalBindingName); - setExternalColumnName(externalColumnName); - - BufferedReader br = null; - try - { - br = new BufferedReader(new FileReader(filename)); - final JoinTableParser parser = new JoinTableParser(); - - //read in the header - columnNames = parser.readHeader(br); - - //get the index of the localJoinColumnName - int localColumnNameIdx = -1; - for(int i = 0; i < columnNames.size(); i++) { - final String columnName = columnNames.get(i); - if(columnName.equals(localColumnName)) { - localColumnNameIdx = i; - break; - } - } - - if(localColumnNameIdx == -1) { - throw new UserException.BadArgumentValue("-J", "The -J arg specifies an unknown column name: \"" + localColumnName + "\". 
It's not one of the column names in the header " + columnNames + " of the file: " + filename); - } - - //read in all records and create a map entry for each - String line; - while((line = br.readLine()) != null) { - final ArrayList columnValues = parser.parseLine(line); - if ( columnValues.size() < columnNames.size() ) - throw new UserException.BadInput("the file: " + filename + " is malformed as there are not a sufficient number of columns for this line: " + line); - final String joinColumnValue = columnValues.get(localColumnNameIdx); - put(joinColumnValue, columnValues, filename); - } - } - catch(IOException e) - { - throw new UserException.CouldNotReadInputFile(new File(filename), "Unable to parse file", e); - } - finally - { - try { - if(br != null) { - br.close(); - } - } catch(IOException e) { - throw new ReviewedStingException("Unable to close file: " + filename, e); - } - } - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns bindingName1. - * @return local binding name - */ - public String getLocalBindingName() { - return localBindingName; - } - - public void setLocalBindingName(String localBindingName) { - this.localBindingName = localBindingName; - } - - /** - * @return the list of join table column names parsed out of the file header. - */ - public List getColumnNames() { - return columnNames; //not fully-qualified - } - - protected void setColumnNames(List columnNames) { - this.columnNames = columnNames; - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns columnName2. 
- * @return external column name - */ - public String getExternalColumnName() { - return externalColumnName; - } - - protected void setExternalColumnName( - String externalColumnName) { - this.externalColumnName = externalColumnName; - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns bindingName2. - * @return external binding name - */ - public String getExternalBindingName() { - return externalBindingName; - } - - protected void setExternalBindingName( - String externalBindingName) { - this.externalBindingName = externalBindingName; - } - - /** - * Whether any join table records have the given value in the join column. - * @param joinColumnValue value - * @return true if the given name value exists in the file - */ - public boolean containsJoinColumnValue(String joinColumnValue) { - return joinColumnValueToRecords.containsKey(joinColumnValue); - } - - /** - * Returns all records in the table where the join column has the given value. - * @param joinColumnValue column value - * @return row - */ - public ArrayList get(String joinColumnValue) { - return joinColumnValueToRecords.get(joinColumnValue); - } - - /** - * Adds the given record to the map. 
- * @param joinColumnValue value - * @param record row - * @param filename the source file name - */ - protected void put(String joinColumnValue, ArrayList record, String filename) { - if ( joinColumnValueToRecords.containsKey(joinColumnValue) ) - throw new UserException.BadInput("the file " + filename + " contains non-unique entries for the requested column, which isn't allowed."); - joinColumnValueToRecords.put(joinColumnValue, record); - if ( joinColumnValueToRecords.size() > maxSize ) - throw new UserException.BadInput("the file " + filename + " contains more than the maximum number (" + maxSize + ") of allowed rows (see the --maxJoinTableSize argument)."); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java deleted file mode 100755 index 3b6c87f90..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.BufferedReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Used to parse files passed to the GenomicAnnotator via the -J arg. - * The files must be tab-delimited, and the first non-empty/non-commented line - * must be a header containing column names. - * - * More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class JoinTableParser -{ - public static final String DELIMITER = "\t"; - - private List header; //column names parsed out of the header line - - - /** - * Constructor. - */ - public JoinTableParser() {} - - /** - * Returns the header and returns it. - * @param br source - * @return column names - * @throws IOException on read - */ - public List readHeader(BufferedReader br) throws IOException - { - if(header != null) { - throw new ReviewedStingException("readHeader(..) called more than once. Header is currently set to: " + header); - } - - header = Collections.unmodifiableList(parseHeader(br)); - - return header; - } - - - /** - * @return A list containing the column names. - */ - public List getHeader() { - return header; - } - - - /** - * Parses the line into an ArrayList containing the values for each column. 
- * - * @param line to parse - * @return tokens - */ - public ArrayList parseLine(String line) { - - final ArrayList values = Utils.split(line, DELIMITER, header.size()); - - if ( values.size() != header.size() ) { - throw new UserException.MalformedFile(String.format("Encountered a row with %d columns which is different from the number or columns in the header: %d\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); - } - - return values; - } - - - /** - * Returns the header. - * @param br The file to read. - * @return ArrayList containing column names from the header. - * @throws IOException on reading - */ - public static ArrayList parseHeader(final BufferedReader br) throws IOException - { - ArrayList header = null; - - //find the 1st line that's non-empty and not a comment - String line; - while( (line = br.readLine()) != null ) { - line = line.trim(); - if ( line.isEmpty() || line.startsWith("#") ) { - continue; - } - - //parse the header - header = Utils.split(line, DELIMITER); - break; - } - - // check that header was found - if ( header == null ) { - throw new IllegalArgumentException("No header in " + br + ". 
All lines are either comments or empty."); - } - - return header; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java deleted file mode 100755 index 0bbfa51b4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java +++ /dev/null @@ -1,1032 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.IOException; -import java.io.PrintStream; -import java.util.*; - -/** - * Takes a table of transcripts (eg. UCSC refGene, knownGene, and CCDS tables) and generates the big table which contains - * annotations for each possible variant at each transcript position (eg. 4 variants at each genomic position). - * - * Required args: - * -B - specifies the input file (ex. -B transcripts,AnnotatorInputTable,/path/to/transcript_table_file.txt) - * -n - Specifies which column(s) from the transcript table contain the gene name(s). (ex. -n name,name2 (for the UCSC refGene table)) - * WARNING: The gene names for each record, when taken together, should provide a unique id for that record relative to all other records in the file. - * - * - * The map & reduce types are both TreeMap. - * Each TreeMap entry represents one line in the output file. 
The TreeMap key is a combination of a given output line's position (so that this key can be used to sort all output lines - * by reference order), as well as allele and gene names (so that its unique across all output lines). The String value is the output line itself. - */ -@Reference(window=@Window(start=-4,stop=4)) -@By(DataSource.REFERENCE) -@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name=TranscriptToGenomicInfo.ROD_NAME,type=AnnotatorInputTableFeature.class) } ) -public class TranscriptToGenomicInfo extends RodWalker { - public static final String ROD_NAME = "transcripts"; - - //@Argument(fullName="pass-through", shortName="t", doc="Optionally specifies which columns from the transcript table should be copied verbatim (aka. passed-through) to the records in the output table. For example, -B transcripts,AnnotatorInputTable,/data/refGene.txt -t id will cause the refGene id column to be copied to the output table.", required=false) - //protected String[] PASS_THROUGH_COLUMNS = {}; - - @Output - private PrintStream out; - - @Argument(fullName="unique-gene-name-columns", shortName="n", doc="Specifies which column(s) from the transcript table contains the gene name(s). For example, -B transcripts,AnnotatorInputTable,/data/refGene.txt -n name,name2 specifies that the name and name2 columns are gene names. WARNING: the gene names for each record, when taken together, should provide a unique id for that record relative to all other records in the file. If this is not the case, an error will be thrown. 
", required=true) - private String[] GENE_NAME_COLUMNS = {}; - - private final char[] ALLELES = {'A','C','G','T'}; - - /** Output columns */ - private static final String[] GENOMIC_ANNOTATION_COLUMNS = { - GenomicAnnotation.CHR_COLUMN, - GenomicAnnotation.START_COLUMN, - GenomicAnnotation.END_COLUMN, - GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, - GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN}; - - private static final String OUTPUT_TRANSCRIPT_STRAND = "transcriptStrand"; //rg. +/- - private static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true - private static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2 - private static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic - private static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript - private static final String OUTPUT_SPLICE_DISTANCE = "spliceDist"; //eg. integer, bp to nearest exon/intron boundary - private static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20 - private static final String OUTPUT_REFERENCE_CODON = "referenceCodon"; - private static final String OUTPUT_REFERENCE_AA = "referenceAA"; - private static final String OUTPUT_VARIANT_CODON = "variantCodon"; - private static final String OUTPUT_VARIANT_AA = "variantAA"; - private static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true - private static final String OUTPUT_FUNCTIONAL_CLASS = "functionalClass"; //eg. 
missense - private static final String OUTPUT_CODING_COORD_STR = "codingCoordStr"; - private static final String OUTPUT_PROTEIN_COORD_STR = "proteinCoordStr"; - private static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary - private static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5) - private static final String[] TRANSCRIPT_COLUMNS = { - OUTPUT_TRANSCRIPT_STRAND, - OUTPUT_POSITION_TYPE, - OUTPUT_FRAME, - OUTPUT_MRNA_COORD, - OUTPUT_CODON_NUMBER, - OUTPUT_SPLICE_DISTANCE, - OUTPUT_REFERENCE_CODON, - OUTPUT_REFERENCE_AA, - OUTPUT_VARIANT_CODON, - OUTPUT_VARIANT_AA, - OUTPUT_CHANGES_AMINO_ACID, - OUTPUT_FUNCTIONAL_CLASS, - OUTPUT_CODING_COORD_STR, - OUTPUT_PROTEIN_COORD_STR, - OUTPUT_IN_CODING_REGION, - OUTPUT_SPLICE_INFO, - OUTPUT_UORF_CHANGE }; - - //This list specifies the order of output columns in the big table. - private final List outputColumnNames = new LinkedList(); - - private int transcriptsProcessedCounter = 0; - - private long transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter = 0; - private long transcriptsThatDontStartWithMethionineCounter = 0; - private long transcriptsThatDontEndWithStopCodonCounter = 0; - private long skippedTranscriptCounter = 0; - - private long skippedPositionsCounter = 0; - private long totalPositionsCounter = 0; - - /** Possible values for the "POSITION_TYPE" output column. */ - private enum PositionType { - intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron - } - - /** - * Store rods until we hit their ends so that we don't have to recompute - * basic information every time we see them in map(). - */ - private Map storedTranscriptInfo = new HashMap(); - - /** - * Prepare the output file and the list of available features. 
- */ - public void initialize() { - - //parse the GENE_NAME_COLUMNS arg and validate the column names - final List parsedGeneNameColumns = new LinkedList(); - for(String token : GENE_NAME_COLUMNS) { - parsedGeneNameColumns.addAll(Arrays.asList(token.split(","))); - } - GENE_NAME_COLUMNS = parsedGeneNameColumns.toArray(GENE_NAME_COLUMNS); - - ReferenceOrderedDataSource transcriptsDataSource = null; - for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { - if(ds.getName().equals(ROD_NAME)) { - transcriptsDataSource = ds; - break; - } - } - - // sanity check - if ( transcriptsDataSource == null ) - throw new IllegalStateException("No rod bound to " + ROD_NAME + " found in rod sources"); - - final ArrayList header; - try { - header = AnnotatorInputTableCodec.readHeader(transcriptsDataSource.getFile()); - } catch(Exception e) { - throw new UserException.MalformedFile(transcriptsDataSource.getFile(), "Failed when attempting to read header from file", e); - } - - for ( String columnName : GENE_NAME_COLUMNS ) { - if ( !header.contains(columnName) ) - throw new UserException.CommandLineException("The column name '" + columnName + "' provided to -n doesn't match any of the column names in: " + transcriptsDataSource.getFile()); - } - - //init outputColumnNames list - outputColumnNames.addAll(Arrays.asList(GENOMIC_ANNOTATION_COLUMNS)); - outputColumnNames.addAll(Arrays.asList(GENE_NAME_COLUMNS)); - outputColumnNames.addAll(Arrays.asList(TRANSCRIPT_COLUMNS)); - - //init OUTPUT_HEADER_LINE - StringBuilder outputHeaderLine = new StringBuilder(); - for( final String column : outputColumnNames ) { - if(outputHeaderLine.length() != 0) { - outputHeaderLine.append( AnnotatorInputTableCodec.DELIMITER ); - } - outputHeaderLine.append(column); - } - - out.println(outputHeaderLine.toString()); - } - - public Integer reduceInit() { return 0; } - - /** - * For each site of interest, generate the appropriate fields. 
- * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - final Collection rods = tracker.getBoundRodTracks(); - //if there's nothing overlapping this locus, skip it. - if ( rods.size() == 0 ) - return 0; - - final List transcriptRODs = tracker.getReferenceMetaData(ROD_NAME); - - //there may be multiple transcriptRODs that overlap this locus - for ( Object transcriptRodObject : transcriptRODs ) { - //parse this ROD if it hasn't been already. - final AnnotatorInputTableFeature transcriptRod = (AnnotatorInputTableFeature) transcriptRodObject; - String featureKey = transcriptRod.toString(); - - TranscriptTableRecord parsedTranscriptRod = storedTranscriptInfo.get(featureKey); - if ( parsedTranscriptRod == null ) { - parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS); - storedTranscriptInfo.put(featureKey, parsedTranscriptRod); - } - - //populate parsedTranscriptRod.txSequence - if(parsedTranscriptRod.positiveStrand) { - parsedTranscriptRod.txSequence.append((char)ref.getBase()); - } else { - final char complementBase = (char)BaseUtils.simpleComplement(ref.getBase()); - parsedTranscriptRod.txSequence.insert(0, complementBase); - } - - //populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence - final int position = (int) ref.getLocus().getStart(); - if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) ) - { - //we're within an exon of a proteinCodingTranscript - - if(parsedTranscriptRod.positiveStrand) - { - if(position < parsedTranscriptRod.cdsStart) - { - parsedTranscriptRod.utr5Sequence.append((char)ref.getBase()); //within utr5 - } - else if(position >= parsedTranscriptRod.cdsStart && 
position <= parsedTranscriptRod.cdsEnd) - { - parsedTranscriptRod.cdsSequence.append((char)ref.getBase()); //within CDS - } - } - else - { - final char complementBase = (char)BaseUtils.simpleComplement(ref.getBase()); - if(position > parsedTranscriptRod.cdsEnd) - { - //As we move left to right (aka. 3' to 5'), we do insert(0,..) to reverse the sequence so that it become 5' to 3' in parsedTranscriptRod.utr5Sequence. - parsedTranscriptRod.utr5Sequence.insert(0,complementBase); //within utr5. - } - else if(position >= parsedTranscriptRod.cdsStart && position <= parsedTranscriptRod.cdsEnd) - { - parsedTranscriptRod.cdsSequence.insert(0,complementBase); //within CDS - } - } - } - - if ( position == parsedTranscriptRod.txEnd ) { - //we've reached the end of the transcript - compute all data and write it out. - try { - generateOutputRecordsForROD(parsedTranscriptRod); - } - catch(IOException e) { - throw new RuntimeException(Thread.currentThread().getName() + " - Unexpected error occurred at position: [" + parsedTranscriptRod.txChrom + ":" + position + "] in transcript: " + parsedTranscriptRod, e); - } - - // remove it from the cache - storedTranscriptInfo.remove(featureKey); - - transcriptsProcessedCounter++; - if ( transcriptsProcessedCounter % 100 == 0 ) - logger.info(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed"); - } - } - - return 1; - } - - private static boolean isChrM(final TranscriptTableRecord record) { - return record.txChrom.equals("chrM") || record.txChrom.equals("MT")|| record.txChrom.equals("CRS"); - } - - private void generateOutputRecordsForROD(TranscriptTableRecord parsedTranscriptRod) throws IOException { - //Transcripts that don't produce proteins are indicated in transcript by cdsStart == cdsEnd - //These will be handled by generating only one record, with haplotypeAlternate == "*". 
- final boolean isProteinCodingTranscript = parsedTranscriptRod.isProteinCodingTranscript(); - final boolean isMitochondrialTranscript = isChrM(parsedTranscriptRod); - - final boolean positiveStrand = parsedTranscriptRod.positiveStrand; //alias - - - if(isProteinCodingTranscript && parsedTranscriptRod.cdsSequence.length() % 3 != 0) { - if (!isMitochondrialTranscript) { - logger.error("ERROR: Transcript " + parsedTranscriptRod +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] has " + parsedTranscriptRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping..."); - //discard transcripts where CDS length is not a multiple of 3 - skippedTranscriptCounter++; - return; - } else { - - //In mitochondrial genes, the polyA tail may complete the stop codon, allowing transcript . To check for this special case: - //1. check that the CDS covers the entire transcript - //2. add 1 or 2 A's to the 3' end of the transcript (as needed to make it divisible by 3) - //3. check whether the last 3 letters now form a stop codon using the mitochondrial AA table - //4. If not, skip this gene, else incorporate the A's and process it like any other gene. 
- - if( parsedTranscriptRod.txSequence.length() == parsedTranscriptRod.cdsSequence.length()) { - do { //append A's until sequence length is divisible by 3 - parsedTranscriptRod.txSequence.append('*'); - parsedTranscriptRod.cdsSequence.append('a'); - if(positiveStrand) { - parsedTranscriptRod.txEnd++; - parsedTranscriptRod.cdsEnd++; - parsedTranscriptRod.exonEnds[0]++; - } else { - parsedTranscriptRod.txStart--; - parsedTranscriptRod.cdsStart--; - parsedTranscriptRod.exonStarts[0]--; - } - } while( parsedTranscriptRod.cdsSequence.length() % 3 != 0); - - } else { - logger.error("ERROR: Mitochnodrial transcript " + parsedTranscriptRod +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] has " + parsedTranscriptRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. The CDS does not cover the entire transcript, so its not possible to use A's from the polyA tail. Skipping..."); - skippedTranscriptCounter++; - return; - } - } - } - - - //warn if the first codon isn't Methionine and/or the last codon isn't a stop codon. - if(isProteinCodingTranscript) { - final int cdsSequenceLength = parsedTranscriptRod.cdsSequence.length(); - - final String firstCodon = parsedTranscriptRod.cdsSequence.substring(0, 3); - final AminoAcid firstAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( firstCodon, true ) : AminoAcidTable.getEukaryoticAA( firstCodon ) ; - - final String lastCodon = parsedTranscriptRod.cdsSequence.substring(cdsSequenceLength - 3, cdsSequenceLength); - final AminoAcid lastAA = isMitochondrialTranscript ? 
AminoAcidTable.getMitochondrialAA( lastCodon, false ) : AminoAcidTable.getEukaryoticAA( lastCodon ) ; - - if( firstAA != AminoAcidTable.METHIONINE && !lastAA.isStop()) { - transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not start with Methionine or end in a stop codon. The first codon is: " + firstCodon + " (" + firstAA + "). The last codon is: " + lastCodon + " (" + lastAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } else if( firstAA != AminoAcidTable.METHIONINE) { - transcriptsThatDontStartWithMethionineCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not start with Methionine. The first codon is: " + firstCodon + " (" + firstAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } else if(!lastAA.isStop()) { - transcriptsThatDontEndWithStopCodonCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not end in a stop codon. The last codon is: " + lastCodon + " (" + lastAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } - } - - final int txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive - final int txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive - final int increment_5to3 = positiveStrand ? 
1 : -1; //whether to increment or decrement - final int strandSign = increment_5to3; //alias - - final int cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive - final int cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive - - int frame = 0; //the frame of the current position - int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand - int utr5Count_from5 = 0; - int mrnaCoord_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand, but only counts bases within exons. - char[] utr5NucBuffer_5to3 = null; //used to find uORFs - size = 5 because to hold the 3 codons that overlap any given position: [-2,-1,0], [-1,0,1], and [0,1,2] - - int codonCount_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons - 1-based - int codingCoord_from5 = isProteinCodingTranscript ? parsedTranscriptRod.computeInitialCodingCoord() : -1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - boolean codingCoordResetForCDS = false; - boolean codingCoordResetForUtr3 = false; - final char[] currentCodon_5to3 = isProteinCodingTranscript ? 
new char[3] : null; //holds the current RNA codon - 5' to 3' - - PositionType positionType = null; - boolean isWithinIntronAndFarFromSpliceJunction = false; - int intronStart_5prime = -1; - int intronEnd_5prime; - - final Map outputLineFields = new HashMap(); - - for(int txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) - { - ++totalPositionsCounter; - - //compute certain attributes of the current position - final boolean isWithinExon = parsedTranscriptRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron - - final int distanceToNearestSpliceSite = parsedTranscriptRod.computeDistanceToNearestSpliceSite(txCoord_5to3); - final boolean isWithin10bpOfSpliceJunction = Math.abs(distanceToNearestSpliceSite) <= 10; - - - //increment coding coord is necessary - if(isWithinExon) { - codingCoord_from5++; - } - - //figure out the current positionType - final PositionType prevPositionType = positionType; //save the position before it is updated - if(isProteinCodingTranscript) - { - if(isWithinExon) - { - if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { //utr5 (multiplying by strandSign is like doing absolute value.) - positionType = PositionType.utr5; - } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { //utr3 (multiplying by strandSign is like doing absolute value.) - positionType = PositionType.utr3; - } else { - positionType = PositionType.CDS; - } - } else { - positionType = PositionType.intron; - } - } else { - if(isWithinExon) { - positionType = PositionType.non_coding_exon; - } else { - positionType = PositionType.non_coding_intron; - } - } - - //handle transitions - if(positionType == PositionType.CDS && prevPositionType != PositionType.CDS && !codingCoordResetForCDS) { - //transitioning from utr5 to CDS, reset the coding coord from -1 to 1. 
- codingCoord_from5 = 1; - codingCoordResetForCDS = true; - } else if(positionType == PositionType.utr3 && prevPositionType != PositionType.utr3 && !codingCoordResetForUtr3) { - //transitioning from CDS to utr3, reset the coding coord to 1. - codingCoord_from5 = 1; - codingCoordResetForUtr3 = true; - } - - - try - { - //handle introns - boolean wasWithinIntronAndFarFromSpliceJunction = isWithinIntronAndFarFromSpliceJunction; - isWithinIntronAndFarFromSpliceJunction = !isWithinExon && !isWithin10bpOfSpliceJunction; - - if(!wasWithinIntronAndFarFromSpliceJunction && isWithinIntronAndFarFromSpliceJunction) { - //save intron start - intronStart_5prime = txCoord_5to3; - - } else if(wasWithinIntronAndFarFromSpliceJunction && !isWithinIntronAndFarFromSpliceJunction) { - //output intron record - intronEnd_5prime = txCoord_5to3 - increment_5to3; - - final int intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ; - final int intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime); - outputLineFields.clear(); - outputLineFields.put(GenomicAnnotation.CHR_COLUMN, parsedTranscriptRod.txChrom); - outputLineFields.put(GenomicAnnotation.START_COLUMN, String.valueOf(intronStart)); - outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(intronEnd)); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) ); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) ); - for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) { - outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] ); - } - - outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? 
"+" : "-" ); - - if ( isProteinCodingTranscript ) - outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); - - addThisLineToResult(outputLineFields); - } - - //when in utr5, compute the utr5NucBuffer_5to3 which is later used to compute the OUTPUT_UORF_CHANGE field - if(positionType == PositionType.utr5) - { - if(utr5Count_from5 < parsedTranscriptRod.utr5Sequence.length()) - { - if(utr5NucBuffer_5to3 == null) { - //initialize - utr5NucBuffer_5to3 = new char[5]; - utr5NucBuffer_5to3[3] = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 ); - - if(utr5Count_from5 + 1 < parsedTranscriptRod.utr5Sequence.length() ) { - utr5NucBuffer_5to3[4] = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 + 1 ); - } - } - - //as we move 5' to 3', shift nucleotides down to the 5' end, making room for the new 3' nucleotide: - utr5NucBuffer_5to3[0] = utr5NucBuffer_5to3[1]; - utr5NucBuffer_5to3[1] = utr5NucBuffer_5to3[2]; - utr5NucBuffer_5to3[2] = utr5NucBuffer_5to3[3]; - utr5NucBuffer_5to3[3] = utr5NucBuffer_5to3[4]; - - char nextRefBase = 0; - if( utr5Count_from5 + 2 < parsedTranscriptRod.utr5Sequence.length() ) - { - nextRefBase = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 + 2 ); - } - utr5NucBuffer_5to3[4] = nextRefBase; - - //check for bad bases - if( (utr5NucBuffer_5to3[0] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[0])) || - (utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) || - (utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) || - (utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) || - (utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4]))) - { - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". 
utr5NucBuffer_5to3 contains irregular base:" + utr5NucBuffer_5to3[0] + utr5NucBuffer_5to3[1] + utr5NucBuffer_5to3[2] + utr5NucBuffer_5to3[3] + utr5NucBuffer_5to3[4]);// +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - } else { // if(utr5Count_from5 >= parsedTranscriptRod.utr5Sequence.length()) - //defensive programming - throw new RuntimeException("Exception: Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". utr5Count_from5 is now " + utr5Count_from5 + ", while parsedTranscriptRod.utr5Sequence.length() == " + parsedTranscriptRod.utr5Sequence.length() + ". This means parsedTranscriptRod.utr5Sequence isn't as long as it should be. This is a bug in handling this record: " + parsedTranscriptRod); - - } - } - - - //when in CDS, compute current codon - if(positionType == PositionType.CDS) - { - if(frame == 0) - { - currentCodon_5to3[0] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 - 1 ); //subtract 1 to go to zero-based coords - currentCodon_5to3[1] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 ); - currentCodon_5to3[2] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 + 1); - } - - //check for bad bases - if(!BaseUtils.isRegularBase(currentCodon_5to3[0]) || !BaseUtils.isRegularBase(currentCodon_5to3[1]) || !BaseUtils.isRegularBase(currentCodon_5to3[2])) { - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". CDS codon contains irregular base:" + currentCodon_5to3[0] + currentCodon_5to3[1] + currentCodon_5to3[2]);// +". 
Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - } - - char haplotypeReference = parsedTranscriptRod.txSequence.charAt( txOffset_from5 - 1 ); - if(!positiveStrand) { - haplotypeReference = BaseUtils.simpleComplement(haplotypeReference); //txSequence contents depend on whether its +/- strand - } - char haplotypeReferenceStrandSpecific= positiveStrand ? haplotypeReference : BaseUtils.simpleComplement(haplotypeReference); - - - - if(!BaseUtils.isRegularBase(haplotypeReference) && haplotypeReference != '*') { //* is special case for mitochondrial genes where polyA tail completes the last codon - //check for bad bases - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() + ". The reference contains an irregular base:" + haplotypeReference); // +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - - char haplotypeAlternateStrandSpecific; - for(char haplotypeAlternate : ALLELES ) - { - haplotypeAlternateStrandSpecific= positiveStrand ? haplotypeAlternate : BaseUtils.simpleComplement(haplotypeAlternate); - outputLineFields.clear(); - - if(!isProteinCodingTranscript || isWithinIntronAndFarFromSpliceJunction) { - haplotypeReference = '*'; - haplotypeAlternate = '*'; - } - - //compute simple OUTPUT fields. 
- outputLineFields.put(GenomicAnnotation.CHR_COLUMN, parsedTranscriptRod.txChrom); - outputLineFields.put(GenomicAnnotation.START_COLUMN, String.valueOf(txCoord_5to3)); - outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(txCoord_5to3)); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( haplotypeReference ) ); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN, Character.toString( haplotypeAlternate ) ); - for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) { - outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] ); - } - - outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - if(isWithinExon) { - outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(mrnaCoord_from5) ); - } - outputLineFields.put(OUTPUT_SPLICE_DISTANCE, Integer.toString(distanceToNearestSpliceSite) ); - - //compute OUTPUT_SPLICE_INFO - final String spliceInfoString; - if(isWithin10bpOfSpliceJunction) { - if(distanceToNearestSpliceSite < 0) { - //is on the 5' side of the splice junction - if(isWithinExon) { - spliceInfoString = "splice-donor_" + distanceToNearestSpliceSite; - } else { - spliceInfoString = "splice-acceptor_" + distanceToNearestSpliceSite; - } - } else { - if(isWithinExon) { - spliceInfoString = "splice-acceptor_" + distanceToNearestSpliceSite; - } else { - spliceInfoString = "splice-donor_" + distanceToNearestSpliceSite; - } - } - outputLineFields.put(OUTPUT_SPLICE_INFO, spliceInfoString); - } - - //compute OUTPUT_IN_CODING_REGION - if(isProteinCodingTranscript) - { - outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); - } - - - //compute OUTPUT_UORF_CHANGE - if(positionType == PositionType.utr5) - { - String refCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + utr5NucBuffer_5to3[2]).toUpperCase(); - 
String refCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(utr5NucBuffer_5to3[2]) + utr5NucBuffer_5to3[3]).toUpperCase(); - String refCodon3 = (Character.toString(utr5NucBuffer_5to3[2]) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase(); - - String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternateStrandSpecific).toUpperCase(); - String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternateStrandSpecific) + utr5NucBuffer_5to3[3]).toUpperCase(); - String varCodon3 = (Character.toString(haplotypeAlternateStrandSpecific) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase(); - - //check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF) - String uORFChangeStr = null; - if( (refCodon1.equals("ATG") && !varCodon1.equals("ATG")) || - (refCodon2.equals("ATG") && !varCodon2.equals("ATG")) || - (refCodon3.equals("ATG") && !varCodon3.equals("ATG"))) - { - uORFChangeStr = "-1"; - } - else if((varCodon1.equals("ATG") && !refCodon1.equals("ATG")) || - (varCodon2.equals("ATG") && !refCodon2.equals("ATG")) || - (varCodon3.equals("ATG") && !refCodon3.equals("ATG"))) - { - uORFChangeStr = "+1"; - } - - outputLineFields.put(OUTPUT_UORF_CHANGE, uORFChangeStr ); - } - //compute CDS-specific fields - else if (positionType == PositionType.CDS) { - final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - final char temp = currentCodon_5to3[frame]; - currentCodon_5to3[frame] = haplotypeAlternateStrandSpecific; - final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - currentCodon_5to3[frame] = temp; - - final AminoAcid refAA = isMitochondrialTranscript ? 
AminoAcidTable.getMitochondrialAA(referenceCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( referenceCodon ) ; - final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA(variantCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( variantCodon ) ; - - if (refAA.isUnknown() || variantAA.isUnknown()) { - logger.warn("Illegal amino acid detected: refCodon=" + referenceCodon + " altCodon=" + variantCodon); - } - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame)); - outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5)); - outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon); - outputLineFields.put(OUTPUT_REFERENCE_AA, refAA.getCode()); - - outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon); - outputLineFields.put(OUTPUT_VARIANT_AA, variantAA.getCode()); - - outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p." + refAA.getLetter() + Integer.toString(codonCount_from5) + variantAA.getLetter()); //for example: "p.K7$ - - boolean changesAA = !refAA.equals(variantAA); - outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(changesAA)); - final String functionalClass; - if (changesAA) { - if (variantAA.isStop()) { - functionalClass = "nonsense"; - } else if (refAA.isStop()) { - functionalClass = "readthrough"; - } else { - functionalClass = "missense"; - } - } else { - functionalClass = "silent"; - } - outputLineFields.put(OUTPUT_FUNCTIONAL_CLASS, functionalClass); - } - - //compute OUTPUT_CODING_COORD_STR - if(isProteinCodingTranscript) - { - //compute coding coord - final StringBuilder codingCoordStr = new StringBuilder(); - codingCoordStr.append( "c." 
); - if(positionType == PositionType.utr3) { - codingCoordStr.append( '*' ); - } - - if(isWithinExon) { - codingCoordStr.append( Integer.toString(codingCoord_from5) ); - - codingCoordStr.append ( haplotypeReferenceStrandSpecific + ">" + haplotypeAlternateStrandSpecific); - } else { - //intronic coordinates - if(distanceToNearestSpliceSite < 0) { - codingCoordStr.append( Integer.toString(codingCoord_from5 + 1) ); - } else { - codingCoordStr.append( Integer.toString(codingCoord_from5 ) ); - codingCoordStr.append( "+" ); - } - - codingCoordStr.append( Integer.toString( distanceToNearestSpliceSite ) ); - } - - outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString()); - } - - - //generate the output line and add it to 'result' map. - if ( !isWithinIntronAndFarFromSpliceJunction ) - addThisLineToResult(outputLineFields); - - if( haplotypeAlternate == '*' ) { - //need only one record for this position with "*" for haplotypeAlternate, instead of the 4 individual alleles - break; - } - - } //ALLELE for-loop - } - finally - { - //increment coords - txOffset_from5++; - if(isWithinExon) { - mrnaCoord_from5++; - } - - if(positionType == PositionType.utr5) { - utr5Count_from5++; - } else if(positionType == PositionType.CDS) { - frame = (frame + 1) % 3; - if(frame == 0) { - codonCount_from5++; - } - } - } - } // l for-loop - - } //method close - - - /** - * Utility method. Creates a line containing the outputLineFields, and adds it to result, hashed by the sortKey. - * - * @param outputLineFields Column-name to value pairs. 
- */ - private void addThisLineToResult(final Map outputLineFields) { - final StringBuilder outputLine = new StringBuilder(); - for( final String column : outputColumnNames ) { - if(outputLine.length() != 0) { - outputLine.append( AnnotatorInputTableCodec.DELIMITER ); - } - final String value = outputLineFields.get(column); - if(value != null) { - outputLine.append(value); - } - } - - out.println(outputLine.toString()); - } - - public Integer reduce(Integer value, Integer sum) { return sum + value; } - - public void onTraversalDone(Integer result) { - logger.info("Skipped " + skippedPositionsCounter + " in-transcript genomic positions out of "+ totalPositionsCounter + " total (" + ( totalPositionsCounter == 0 ? 0 : (100*skippedPositionsCounter)/totalPositionsCounter) + "%)"); - logger.info("Skipped " + skippedTranscriptCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*skippedTranscriptCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't start with Methionine or end in a stop codon: " + transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't start with Methionine: " + transcriptsThatDontStartWithMethionineCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontStartWithMethionineCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. 
with a CDS region) that don't end in a stop codon: " + transcriptsThatDontEndWithStopCodonCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontEndWithStopCodonCounter)/transcriptsProcessedCounter) + "%)"); - } - - - /** - * Container for all data fields from a single row of the transcript table. - */ - protected static class TranscriptTableRecord - { - public static final String STRAND_COLUMN = "strand"; //eg. + - public static final String CDS_START_COLUMN = "cdsStart"; - public static final String CDS_END_COLUMN = "cdsEnd"; - public static final String EXON_COUNT_COLUMN = "exonCount"; - public static final String EXON_STARTS_COLUMN = "exonStarts"; - public static final String EXON_ENDS_COLUMN = "exonEnds"; - //public static final String EXON_FRAMES_COLUMN = "exonFrames"; - - - /** - * This StringBuffer accumulates the entire transcript sequence. - * This buffer is used instead of using the GATK window mechanism - * because arbitrary-length look-aheads and look-behinds are needed to deal - * with codons that span splice-junctions in + & - strand transcripts. - * The window mechanism requires hard-coding the window size, which would - * translate into a limit on maximum supported intron size. To avoid this, the - * sequence is accumulated as the transcript is scanned left-to-right. - * Then, all calculations are performed at the end. - */ - public StringBuilder txSequence; //the sequence of the entire transcript in order from 5' to 3' - public StringBuilder utr5Sequence; //the protein coding sequence (with introns removed) in order from 5' to 3' - public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3' - - public boolean positiveStrand; //whether the transcript is on the + or the - strand. - public String[] geneNames; //eg. 
NM_021649 - - public String txChrom; //The chromosome name - public int txStart; - public int txEnd; - - public int cdsStart; - public int cdsEnd; - - public int[] exonStarts; - public int[] exonEnds; - //public int[] exonFrames; - not used for anything, frame is computed another way - - /** - * Constructor. - * - * @param transcriptRod A rod representing a single record in the transcript table. - * @param geneNameColumns name columns. - */ - public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) { - - //String binStr = transcriptRod.get("bin"); - //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) - String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN); - if(strandStr == null) { - throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\""); - } else if(strandStr.equals("+")) { - positiveStrand = true; - } else if(strandStr.equals("-")) { - positiveStrand = false; - } else { - throw new IllegalArgumentException("Transcript table record contains unexpected value for 'strand' column: \"" + strandStr + "\""); - } - - geneNames = new String[geneNameColumns.length]; - for(int i = 0; i < geneNameColumns.length; i++) { - geneNames[i] = transcriptRod.getColumnValue(geneNameColumns[i]); - } - - //String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. 
they got turned into chr:txStart-txStop) - //String txEndStr = transcriptRod.get(TXEND_COLUMN); - txChrom = transcriptRod.getChr(); - txStart = transcriptRod.getStart(); - txEnd = transcriptRod.getEnd(); - - String cdsStartStr = transcriptRod.getColumnValue(CDS_START_COLUMN); - String cdsEndStr = transcriptRod.getColumnValue(CDS_END_COLUMN); - - cdsStart = Integer.parseInt(cdsStartStr); - cdsEnd = Integer.parseInt(cdsEndStr); - - txSequence = new StringBuilder( (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3' - if(isProteinCodingTranscript()) { - utr5Sequence = new StringBuilder( positiveStrand ? (cdsStart - txStart + 1) : (txEnd - cdsEnd + 1) ); //TODO reduce init size by size of introns - cdsSequence = new StringBuilder( (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns - } - - String exonCountStr = transcriptRod.getColumnValue(EXON_COUNT_COLUMN); - String exonStartsStr = transcriptRod.getColumnValue(EXON_STARTS_COLUMN); - String exonEndsStr = transcriptRod.getColumnValue(EXON_ENDS_COLUMN); - //String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN); - - String[] exonStartStrs = exonStartsStr.split(","); - String[] exonEndStrs = exonEndsStr.split(","); - //String[] exonFrameStrs = exonFramesStr.split(","); - - int exonCount = Integer.parseInt(exonCountStr); - if(exonCount != exonStartStrs.length || exonCount != exonEndStrs.length /* || exonCount != exonFrameStrs.length */) - { - throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". 
transcriptRod = " + transcriptRod); - } - - exonStarts = new int[exonCount]; - exonEnds = new int[exonCount]; - //exonFrames = new int[exonCount]; - for(int i = 0; i < exonCount; i++) { - exonStarts[i] = Integer.parseInt(exonStartStrs[i]); - exonEnds[i] = Integer.parseInt(exonEndStrs[i]); - //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); - } - } - - - /** - * Takes a genomic position on the same contig as the transcript, and - * returns true if this position falls within an exon. - */ - public boolean isWithinExon(final int genomPosition) { - for(int i = 0; i < exonStarts.length; i++) { - final int curStart = exonStarts[i]; - if(genomPosition < curStart) { - return false; - } - final int curStop = exonEnds[i]; - if(genomPosition <= curStop) { - return true; - } - } - - return false; - } - - /** - * Computes the distance to the nearest splice-site. - * The returned value is negative its on the 5' side (eg. upstream) of the juntion, and - * positive if its on the 3' side. - */ - public int computeDistanceToNearestSpliceSite(final int genomPosition) { - int prevDistance = Integer.MAX_VALUE; - for(int i = 0; i < exonStarts.length; i++) { - final int curStart = exonStarts[i]; - int curDistance = curStart - genomPosition; - if(genomPosition < curStart) { - //position is within the current intron - if(prevDistance < curDistance) { - return positiveStrand ? prevDistance : -prevDistance; - } else { - return positiveStrand ? -curDistance : curDistance; - } - } else { - prevDistance = genomPosition - curStart + 1; - } - - final int curStop = exonEnds[i]; - curDistance = curStop - genomPosition + 1; - if(genomPosition <= curStop) { - //position is within an exon - if(prevDistance < curDistance) { - return positiveStrand ? prevDistance : -prevDistance; - } else { - return positiveStrand ? 
-curDistance : curDistance; - } - } else { - prevDistance = genomPosition - curStop; - } - } - - throw new IllegalArgumentException("Genomic position: [" + genomPosition +"] not found within transcript: " + this +". " + - "This method should not have been called for this position. NOTE: this method assumes that all transcripts start " + - "with an exon and end with an exon (rather than an intron). Is this wrong?"); - //return prevDistance; //out of exons. return genomPosition-curStop - } - - - /** - * Returns true if this is a coding transcript (eg. is translated - * into proteins). Returns false for non-coding RNA. - */ - public boolean isProteinCodingTranscript() { - return cdsStart < cdsEnd; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("chrpos=" + txChrom + ':' + txStart + '-' + txEnd + ", strand=" + (positiveStrand ? '+':'-') + ", gene-names=" + Arrays.toString(geneNames) + ", cds="+ cdsStart + '-' + cdsEnd + ", exonStarts=" + Arrays.toString(exonStarts) + ", exonEnds=" + Arrays.toString(exonEnds)); - return sb.toString(); - } - - - - /** - * Computes the coding coord of the 1st nucleotide in the transcript. - * If the 1st nucleotide is in the 5'utr, the returned value will be negative. - * Otherwise (if the 1st nucleotide is CDS), the returned value is 1. - */ - public int computeInitialCodingCoord() { - if(!isProteinCodingTranscript()) { - throw new ReviewedStingException("This method should only be called for protein-coding transcripts"); - } - - if(positiveStrand) - { - if( cdsStart == exonStarts[0] ) { - //the 1st nucleotide of the transcript is CDS. - return 1; - } - - int result = 0; - for(int i = 0; i < exonStarts.length; i++) - { - final int exonStart = exonStarts[i]; - final int exonEnd = exonEnds[i]; - if(cdsStart <= exonEnd) { //eg. 
exonEnd is now on the 3' side of cdsStart - //this means cdsStart is within the current exon - result += (cdsStart - exonStart) + 1; - break; - } else { - //cdsStart is downstream of the current exon - result += (exonEnd - exonStart) + 1; - } - } - return -result; //negate because 5' UTR coding coord is negative - } - else //(negative strand) - { - final int cdsStart_5prime = cdsEnd; - if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) { - //the 1st nucleotide of the transcript is CDS. - return 1; - } - - int result = 0; - for(int i = exonEnds.length - 1; i >= 0; i--) - { - final int exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i] - final int exonEnd = exonStarts[i]; - if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart - //this means cdsStart is within the current exon - result += -(cdsStart_5prime - exonStart) + 1; - break; - } else { - //cdsStart is downstream of the current exon - result += -(exonEnd - exonStart) + 1; - } - } - return -result; //negate because 5' UTR coding coord is negative - } - } - } - - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java new file mode 100644 index 000000000..7200f841b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/AnnotatorCompatibleWalker.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; + +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; + +public interface AnnotatorCompatibleWalker { + + // getter methods for various used bindings + public abstract RodBinding getVariantRodBinding(); + public abstract RodBinding getSnpEffRodBinding(); + public abstract RodBinding 
getDbsnpRodBinding(); + public abstract List> getCompRodBindings(); + public abstract List> getResourceRodBindings(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java index 57bc44ab8..f87f0e310 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java @@ -10,15 +10,13 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; import java.util.Map; -public interface GenotypeAnnotation { +public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts/genotype split by sample - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g); - - // return the FORMAT keys - public List getKeyNames(); + public abstract Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, + ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g); // return the descriptions used for the VCF FORMAT meta field - public List getDescriptions(); - + public abstract List getDescriptions(); + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java index 4e850d01b..b94bee31b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java @@ -9,15 +9,11 @@ import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; import java.util.Map; -public interface InfoFieldAnnotation { - +public abstract class InfoFieldAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts split by sample - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc); - - // return the INFO keys - public List getKeyNames(); + public abstract Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, + ReferenceContext ref, Map stratifiedContexts, VariantContext vc); // return the descriptions used for the VCF INFO meta field - public List getDescriptions(); - + public abstract List getDescriptions(); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java new file mode 100644 index 000000000..160a3d258 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; + +import java.util.List; + +@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations") +public abstract class VariantAnnotatorAnnotation { + // return the INFO keys + public abstract List getKeyNames(); + + // initialization method (optional for subclasses, and therefore non-abstract) + public void initialize ( AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit ) { } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 21c8ec430..7f6dabeec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -25,15 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.beagle; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.codecs.beagle.BeagleFeature; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; @@ -50,27 +48,76 @@ import static java.lang.Math.log10; /** * Takes files produced by Beagle imputation engine and creates a vcf with modified annotations. - */ -@Requires(value={},referenceMetaData=@RMD(name=BeagleOutputToVCFWalker.INPUT_ROD_NAME, type=VariantContext.class)) + * + *

This walker is intended to be run after Beagle has successfully executed. The full calling sequence for using Beagle along with the GATK is:

+ * + *

1. Run ProduceBeagleInputWalker.

+ *

2. Run Beagle

+ *

3. Uncompress output files

+ *

4. Run BeagleOutputToVCFWalker.

+ * + * + * Note that this walker requires all input files produced by Beagle. + * + * + *

Example

+ *
+ *     java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
+ *      -R reffile.fasta -T BeagleOutputToVCF \
+ *      -V input_vcf.vcf \
+ *      -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ *      -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ *      -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ *      -o output_vcf.vcf
+ *      
+

Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them to decompress them before the walker is run

+ + */ public class BeagleOutputToVCFWalker extends RodWalker { - public static final String INPUT_ROD_NAME = "variant"; - public static final String COMP_ROD_NAME = "comp"; - public static final String R2_ROD_NAME = "beagleR2"; - public static final String PROBS_ROD_NAME = "beagleProbs"; - public static final String PHASED_ROD_NAME = "beaglePhased"; + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Output(doc="File to which variants should be written",required=true) + /** + * If this argument is present, the original allele frequencies and counts from this vcf are added as annotations ACH,AFH and ANH. at each record present in this vcf + */ + @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) + public RodBinding comp; + + + /** + * This required argument is used to annotate each site in the vcf INFO field with R2 annotation. Will be NaN if Beagle determined there are no variant samples. + */ + @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true) + public RodBinding beagleR2; + + /** + * These values will populate the GL field for each sample and contain the posterior probability of each genotype given the data after phasing and imputation. + */ + @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true) + public RodBinding beagleProbs; + + /** + * By default, all genotypes will be marked in the VCF as "phased", using the "|" separator after Beagle. 
+ */ + @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true) + public RodBinding beaglePhased; + + @Output(doc="VCF File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="output_file", shortName="output", doc="Please use --out instead" ,required=false) - @Deprecated - protected String oldOutputArg; - + /** + * If this argument is absent, and if Beagle determines that there is no sample in a site that has a variant genotype, the site will be marked as filtered (Default behavior). + * If the argument is present, the site won't be marked as filtered under this condition even if there are no variant genotypes. + */ @Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false) public boolean DONT_FILTER_MONOMORPHIC_SITES = false; + /** + * Value between 0 and 1. If the probability of getting a genotype correctly (based on the posterior genotype probabilities and the actual genotype) is below this threshold, + * a genotype will be substitute by a no-call. 
+ */ @Argument(fullName="no" + "call_threshold", shortName="ncthr", doc="Threshold of confidence at which a genotype won't be called", required=false) private double noCallThreshold = 0.0; @@ -97,17 +144,13 @@ public class BeagleOutputToVCFWalker extends RodWalker { // Open output file specified by output VCF ROD final List dataSources = this.getToolkit().getRodDataSources(); - for( final ReferenceOrderedDataSource source : dataSources ) { - if (source.getName().equals(COMP_ROD_NAME)) { - hInfo.add(new VCFInfoHeaderLine("ACH", 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site")); - hInfo.add(new VCFInfoHeaderLine("ANH", 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site")); - hInfo.add(new VCFInfoHeaderLine("AFH", 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site")); - break; - } - + if ( comp.isBound() ) { + hInfo.add(new VCFInfoHeaderLine("ACH", 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site")); + hInfo.add(new VCFInfoHeaderLine("ANH", 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site")); + hInfo.add(new VCFInfoHeaderLine("AFH", 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site")); } - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME)); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variantCollection.variants.getName())); final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); @@ -119,40 +162,29 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 0; GenomeLoc loc = context.getLocation(); - VariantContext vc_input = tracker.getVariantContext(ref,INPUT_ROD_NAME, null, loc, true); + VariantContext vc_input = tracker.getFirstValue(variantCollection.variants, loc); - VariantContext vc_comp = tracker.getVariantContext(ref,COMP_ROD_NAME, null, loc, true); + 
VariantContext vc_comp = tracker.getFirstValue(comp, loc); if ( vc_input == null ) return 0; if (vc_input.isFiltered()) { - vcfWriter.add(vc_input, ref.getBase()); + vcfWriter.add(vc_input); return 1; } - List r2rods = tracker.getReferenceMetaData(R2_ROD_NAME); + + BeagleFeature beagleR2Feature = tracker.getFirstValue(beagleR2); + BeagleFeature beagleProbsFeature = tracker.getFirstValue(beagleProbs); + BeagleFeature beaglePhasedFeature = tracker.getFirstValue(beaglePhased); // ignore places where we don't have a variant - if ( r2rods.size() == 0 ) - return 0; + if ( beagleR2Feature == null || beagleProbsFeature == null || beaglePhasedFeature == null) + { + vcfWriter.add(vc_input); + return 1; + } - BeagleFeature beagleR2Feature = (BeagleFeature)r2rods.get(0); - - List gProbsrods = tracker.getReferenceMetaData(PROBS_ROD_NAME); - - // ignore places where we don't have a variant - if ( gProbsrods.size() == 0 ) - return 0; - - BeagleFeature beagleProbsFeature = (BeagleFeature)gProbsrods.get(0); - - List gPhasedrods = tracker.getReferenceMetaData(PHASED_ROD_NAME); - - // ignore places where we don't have a variant - if ( gPhasedrods.size() == 0 ) - return 0; - - BeagleFeature beaglePhasedFeature = (BeagleFeature)gPhasedrods.get(0); // get reference base for current position byte refByte = ref.getBase(); @@ -333,7 +365,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { } - vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase()); + vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes)); return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 3eed12992..87695077d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ 
-25,16 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.beagle; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.sting.utils.GenomeLoc; @@ -52,17 +48,45 @@ import java.io.PrintStream; import java.util.*; /** - * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input variant file + * Converts the input VCF into a format accepted by the Beagle imputation/analysis program. + *

+ * + *

Input

+ *

+ * A VCF with variants to convert to Beagle format + *

+ * + *

Outputs

+ *

+ * A single text file which can be fed to Beagle + *

+ *

+ * Optional: A file with a list of markers + *

+ * + *

Examples

+ *
+ *     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
+ *      -R reffile.fasta -T ProduceBeagleInput \
+ *      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ * 
+ * */ -@Requires(value={},referenceMetaData=@RMD(name=ProduceBeagleInputWalker.ROD_NAME, type=VariantContext.class)) + public class ProduceBeagleInputWalker extends RodWalker { - public static final String ROD_NAME = "variant"; - public static final String VALIDATION_ROD_NAME = "validation"; + + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Hidden + @Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false) + public RodBinding validation; + @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; - @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) + @Hidden + @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false) protected PrintStream markers = null; int markerCounter = 1; @@ -75,14 +99,22 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(doc="VQSqual key", shortName = "vqskey", required=false) protected String VQSLOD_KEY = "VQSqual"; - @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) + @Hidden + @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; - @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) + @Hidden + @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. 
Will override GL field.", required = false) public double validationPrior = -1.0; - @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) + @Hidden + @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) public double bootstrap = 0.0; - @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) + @Hidden + @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) VCFWriter bootstrapVCFOutput = null; + + /** + * If sample gender is known, this flag should be set to true to ensure that Beagle treats male Chr X properly. + */ @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false) public boolean CHECK_IS_MALE_ON_CHR_X = false; @@ -99,7 +131,7 @@ public class ProduceBeagleInputWalker extends RodWalker { public void initialize() { - samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME)); + samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variantCollection.variants.getName())); beagleWriter.print("marker alleleA alleleB"); for ( String sample : samples ) @@ -121,8 +153,8 @@ public class ProduceBeagleInputWalker extends RodWalker { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { if( tracker != null ) { GenomeLoc loc = context.getLocation(); - VariantContext variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, true); - VariantContext validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc, true); + VariantContext variant_eval = 
tracker.getFirstValue(variantCollection.variants, loc); + VariantContext validation_eval = tracker.getFirstValue(validation, loc); if ( goodSite(variant_eval,validation_eval) ) { if ( useValidation(validation_eval, ref) ) { @@ -171,20 +203,20 @@ public class ProduceBeagleInputWalker extends RodWalker { logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() ); + bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER)); } bootstrapSetSize++; return true; } else { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(validation,ref.getBase()); + bootstrapVCFOutput.add(validation); } testSetSize++; return false; } } else { if ( validation != null && bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(validation,ref.getBase()); + bootstrapVCFOutput.add(validation); } return false; } @@ -303,9 +335,7 @@ public class ProduceBeagleInputWalker extends RodWalker { } private void initializeVcfWriter() { - - final ArrayList inputNames = new ArrayList(); - inputNames.add( VALIDATION_ROD_NAME ); + final List inputNames = Arrays.asList(validation.getName()); // setup the header fields Set hInfo = new HashSet(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java index f6cd1d636..22c39d794 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java @@ -26,12 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.beagle; import 
org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; @@ -54,9 +54,9 @@ import java.util.Set; * in input variant file. Will additional hold back a fraction of the sites for evaluation, marking the * genotypes at that sites as missing, and writing the truth of these sites to a second VCF file */ -@Requires(value={},referenceMetaData=@RMD(name= VariantsToBeagleUnphasedWalker.ROD_NAME, type=VariantContext.class)) public class VariantsToBeagleUnphasedWalker extends RodWalker { - public static final String ROD_NAME = "variant"; + @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true) + public RodBinding variants; @Output(doc="File to which BEAGLE unphased genotypes should be written",required=true) protected PrintStream beagleWriter = null; @@ -75,7 +75,7 @@ public class VariantsToBeagleUnphasedWalker extends RodWalker private int testSetSize = 0; public void initialize() { - samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(ROD_NAME)); + samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variants.getName())); beagleWriter.print("I marker alleleA alleleB"); for ( String sample : samples ) @@ -102,7 +102,7 @@ public class VariantsToBeagleUnphasedWalker extends RodWalker public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { if( tracker != null ) { 
GenomeLoc loc = context.getLocation(); - VariantContext vc = tracker.getVariantContext(ref, ROD_NAME, null, loc, true); + VariantContext vc = tracker.getFirstValue(variants, loc); if ( ProduceBeagleInputWalker.canBeOutputToBeagle(vc) ) { // do we want to hold back this site? @@ -110,7 +110,7 @@ public class VariantsToBeagleUnphasedWalker extends RodWalker // if we are holding it back and we are writing a bootstrap VCF, write it out if ( makeMissing && bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(vc, ref.getBase()); + bootstrapVCFOutput.add(vc); } // regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 90e6fcd77..32875a098 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -22,6 +22,7 @@ package org.broadinstitute.sting.gatk.walkers.coverage; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -42,50 +43,195 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome * - * @Author depristo - * @Date May 7, 2010 + *

+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system + * considers the coverage at each locus and emits either a per base state or a summary interval BED file that + * partitions the genomic intervals into the following callable states: + *

+ *
REF_N
+ *
the reference base was an N, which is not considered callable by the GATK
+ *
CALLABLE
+ *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
there were fewer than min. depth bases at the locus, after applying filters
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth reads at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ of the reads at the locus have low mapping quality, indicating a poor mapping quality of the reads
+ *
+ *

+ * + *

Input

+ *

+ * A BAM file containing exactly one sample. + *

+ * + *

Output

+ *

+ *

    + *
  • -o: an OutputFormatted (recommended BED) file with the callable status covering each base
  • + *
  • -summary: a table of callable status x count of all examined bases
  • + *
+ *

+ * + *

Examples

+ *
+ *     -T CallableLoci \
+ *     -I my.bam \
+ *     -summary my.summary \
+ *     -o my.bed
+ * 
+ * + * would produce a BED file (my.bed) that looks like: + * + *
+ *     20 10000000 10000864 CALLABLE
+ *     20 10000865 10000985 POOR_MAPPING_QUALITY
+ *     20 10000986 10001138 CALLABLE
+ *     20 10001139 10001254 POOR_MAPPING_QUALITY
+ *     20 10001255 10012255 CALLABLE
+ *     20 10012256 10012259 POOR_MAPPING_QUALITY
+ *     20 10012260 10012263 CALLABLE
+ *     20 10012264 10012328 POOR_MAPPING_QUALITY
+ *     20 10012329 10012550 CALLABLE
+ *     20 10012551 10012551 LOW_COVERAGE
+ *     20 10012552 10012554 CALLABLE
+ *     20 10012555 10012557 LOW_COVERAGE
+ *     20 10012558 10012558 CALLABLE
+ *     et cetera...
+ * 
+ * as well as a summary table that looks like: + * + *
+ *                        state nBases
+ *                        REF_N 0
+ *                     CALLABLE 996046
+ *                  NO_COVERAGE 121
+ *                 LOW_COVERAGE 928
+ *           EXCESSIVE_COVERAGE 0
+ *         POOR_MAPPING_QUALITY 2906
+ * 
+ * + * @author Mark DePristo + * @since May 7, 2010 */ @By(DataSource.REFERENCE) public class CallableLociWalker extends LocusWalker { @Output PrintStream out; - @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false) + /** + * Callable loci summary counts (see outputs) will be written to this file. + */ + @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) + File summaryFile; + + /** + * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but + * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with + * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where + * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to + * calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states + */ + @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false) byte maxLowMAPQ = 1; - @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false) + /** + * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE + * state. + */ + @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false) byte minMappingQuality = 10; - @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. 
Defaults to 20.", required = false) + /** + * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state + */ + @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false) byte minBaseQuality = 20; + /** + * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this + * value and is less than maxDepth the site is considered CALLABLE. + */ + @Advanced @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) int minDepth = 4; + /** + * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH + */ @Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) int maxDepth = -1; + /** + * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We + * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads + * covering the site. + */ + @Advanced @Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false) int minDepthLowMAPQ = 10; - @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false) + /** + * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality + * exceeds this fraction then the site has POOR_MAPPING_QUALITY. 
+ */ + @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false) double maxLowMAPQFraction = 0.1; - @Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false) + /** + * The output of this walker will be written in this format. The recommended option is BED. + */ + @Advanced + @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false) OutputFormat outputFormat; - @Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true) - File summaryFile; + public enum OutputFormat { + /** + * The output will be written as a BED file. There's a BED element for each + * continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended + * format + */ + BED, - public enum OutputFormat { BED, STATE_PER_BASE } + /** + * Emit chr start stop state quads for each base. Produces a potentially disasterously + * large amount of output. + */ + STATE_PER_BASE + } + + public enum CalledState { + /** the reference base was an N, which is not considered callable the GATK */ + REF_N, + /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */ + CALLABLE, + /** absolutely no reads were seen at this locus, regardless of the filtering parameters */ + NO_COVERAGE, + /** there were less than min. 
depth bases at the locus, after applying filters */ + LOW_COVERAGE, + /** more than -maxDepth read at the locus, indicating some sort of mapping problem */ + EXCESSIVE_COVERAGE, + /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */ + POOR_MAPPING_QUALITY + } //////////////////////////////////////////////////////////////////////////////////// // STANDARD WALKER METHODS //////////////////////////////////////////////////////////////////////////////////// + @Override public boolean includeReadsWithDeletionAtLoci() { return true; } + @Override public void initialize() { + if ( getToolkit().getSamples().size() != 2 ) { + // unbelievably there are actually two samples even when there's just one in the header. God I hate this Samples system + throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples()); + } + try { PrintStream summaryOut = new PrintStream(summaryFile); summaryOut.close(); @@ -94,15 +240,15 @@ public class CallableLociWalker extends LocusWalker compTrack1; - @Argument(shortName="comp2", doc="First comparison track name", required=false) - protected String COMP2 = "comp2"; + @Input(fullName="comp2", shortName = "comp2", doc="Second comparison track name", required=true) + public RodBinding compTrack2; @Argument(shortName="printState", doc="If provided, prints sites satisfying this state pair", required=false) protected String printState = null; @@ -77,8 +79,8 @@ public class CompareCallableLociWalker extends RodWalker map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker != null ) { - CallableLociWalker.CallableBaseState comp1 = getCallableBaseState(tracker, COMP1); - CallableLociWalker.CallableBaseState comp2 = getCallableBaseState(tracker, COMP2); + CallableLociWalker.CallableBaseState comp1 = getCallableBaseState(tracker, compTrack1); + 
CallableLociWalker.CallableBaseState comp2 = getCallableBaseState(tracker, compTrack2); if ( printState != null && comp1.getState() == printState1 && comp2.getState() == printState2 ) { out.printf("%s %s %s %s%n", comp1.getLocation(), comp1.getState(), comp2.getLocation(), comp2.getState()); @@ -90,14 +92,14 @@ public class CompareCallableLociWalker extends RodWalker rodBinding) { //System.out.printf("tracker %s%n", tracker); - List bindings = tracker.getReferenceMetaData(track); - if ( bindings.size() != 1 || ! (bindings.get(0) instanceof FullBEDFeature)) { - throw new UserException.MalformedFile(String.format("%s track isn't a properly formated CallableBases object!", track)); + List bindings = tracker.getValues(rodBinding); + if ( bindings.size() != 1 ) { + throw new UserException.MalformedFile(String.format("%s track isn't a properly formated CallableBases object!", rodBinding.getName())); } - FullBEDFeature bed = (FullBEDFeature)bindings.get(0); + BEDFeature bed = bindings.get(0); GenomeLoc loc = getToolkit().getGenomeLocParser().createGenomeLoc(bed.getChr(), bed.getStart(), bed.getEnd()); CallableLociWalker.CalledState state = CallableLociWalker.CalledState.valueOf(bed.getName()); return new CallableLociWalker.CallableBaseState(getToolkit().getGenomeLocParser(),loc, state); @@ -127,7 +129,7 @@ public class CompareCallableLociWalker extends RodWalker + * DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and + * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by + * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, + * and/or percentage of bases covered to or beyond a threshold. + * Additionally, reads and bases can be filtered by mapping or base quality score. + * + *

Input

+ *

+ * One or more bam files (with proper headers) to be analyzed for coverage statistics + * (Optional) A REFSEQ Rod to aggregate coverage to the gene level + *

+ * + *

Output

+ *

+ * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + *

+ * - no suffix: per locus coverage + *

+ * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + *

+ * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + *

+ * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + *

+ * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + *

+ * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + *

+ * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + *

+ * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + *

+ * - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T DepthOfCoverage \
+ *   -o file_name_base \
+ *   -I input_bams.list \
+ *   [-geneList refSeq.sorted.txt] \
+ *   [-pt readgroup] \
+ *   [-ct 4 -ct 6 -ct 10] \
+ *   [-L my_capture_genes.interval_list]
+ * 
* - * @Author chartl - * @Date Feb 22, 2010 */ // todo -- cache the map from sample names to means in the print functions, rather than regenerating each time // todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) @@ -71,10 +114,19 @@ public class DepthOfCoverageWalker extends LocusWalker out; + /** + * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin. + */ @Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", required = false) int start = 1; + /** + * Sets the high-coverage cutoff for granular binning. All loci with depth > END are counted in the last bin. + */ @Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", required = false) int stop = 500; + /** + * Sets the number of bins for granular binning + */ @Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", required = false) int nBins = 499; @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to -1.", required = false) @@ -85,28 +137,59 @@ public class DepthOfCoverageWalker extends LocusWalker partitionTypes = EnumSet.of(DoCOutputType.Partition.sample); + /** + * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output. + */ @Argument(fullName = "includeDeletions", shortName = "dels", doc = "Include information on deletions", required = false) boolean includeDeletions = false; @Argument(fullName = "ignoreDeletionSites", doc = "Ignore sites consisting only of deletions", required = false) boolean ignoreDeletionSites = false; + + /** + * Path to the RefSeq file for use in aggregating coverage statistics over genes + */ @Argument(fullName = "calculateCoverageOverGenes", shortName = "geneList", doc = "Calculate the coverage statistics over this list of genes. 
Currently accepts RefSeq.", required = false) File refSeqGeneList = null; + /** + * The format of the output file + */ @Argument(fullName = "outputFormat", doc = "the format of the output file (e.g. csv, table, rtable); defaults to r-readable table", required = false) String outputFormat = "rtable"; + /** + * A coverage threshold for summarizing (e.g. % bases >= CT for each sample) + */ @Argument(fullName = "summaryCoverageThreshold", shortName = "ct", doc = "for summary file outputs, report the % of bases coverd to >= this number. Defaults to 15; can take multiple arguments.", required = false) int[] coverageThresholds = {15}; @@ -929,4 +1012,4 @@ class CoveragePartitioner { public Map> getIdentifiersByType() { return identifiersByType; } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java similarity index 86% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java index 68bea4dba..5c2a967b9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/GCContentByIntervalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java @@ -22,12 +22,13 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.walkers; +package org.broadinstitute.sting.gatk.walkers.coverage; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; @@ -37,12 +38,32 @@ import java.util.List; /** * Walks along reference and calculates the GC content for each interval. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * GC content calculations per interval. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T GCContentByInterval \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   -L input.intervals
+ * 
+ * */ @Allows(value = {DataSource.REFERENCE}) @Requires(value = {DataSource.REFERENCE}) - @By(DataSource.REFERENCE) - public class GCContentByIntervalWalker extends LocusWalker { @Output protected PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java new file mode 100644 index 000000000..0f1cea2e1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -0,0 +1,101 @@ +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; + +import java.io.PrintStream; +import java.util.List; + +/** + * Outputs the read lengths of all the reads in a file. + * + *

+ * Generates a table with the read lengths categorized per sample. If the file has no sample information + * (no read groups) it considers all reads to come from the same sample. + *

+ * + * + *

Input

+ *

+ * A BAM file. + *

+ * + *

Output

+ *

+ * A human/R readable table of tab separated values with one column per sample and one row per read length. + *

+ * + *

Examples

+ *
+ *    java \
+ *      -jar GenomeAnalysisTK.jar \
+ *      -T ReadLengthDistribution \
+ *      -I example.bam \
+ *      -R reference.fasta \
+ *      -o example.tbl
+ *  
+ * + * @author Kiran Garimela + */ + + + +public class ReadLengthDistribution extends ReadWalker { + @Output + public PrintStream out; + + private GATKReport report; + + public void initialize() { + report = new GATKReport(); + report.addTable("ReadLengthDistribution", "Table of read length distributions"); + GATKReportTable table = report.getTable("ReadLengthDistribution"); + + table.addPrimaryKey("readLength"); + + List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + if (readGroups.isEmpty()) + table.addColumn("SINGLE_SAMPLE", 0); + + else + for (SAMReadGroupRecord rg : readGroups) + table.addColumn(rg.getSample(), 0); + + } + + public boolean filter(ReferenceContext ref, SAMRecord read) { + return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag()); + } + + @Override + public Integer map(ReferenceContext referenceContext, SAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) { + GATKReportTable table = report.getTable("ReadLengthDistribution"); + + int length = Math.abs(samRecord.getReadLength()); + String sample = samRecord.getReadGroup().getSample(); + + table.increment(length, sample); + + return null; + } + + @Override + public Integer reduceInit() { + return null; + } + + @Override + public Integer reduce(Integer integer, Integer integer1) { + return null; + } + + public void onTraversalDone(Integer sum) { + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index 89e20dad1..2159bc839 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -42,6 +42,7 @@ import java.util.*; * Date: 7/4/11 * Time: 12:51 PM * A generic engine for comparing tree-structured objects + * */ public class DiffEngine { final protected static Logger 
logger = Logger.getLogger(DiffEngine.class); @@ -233,8 +234,8 @@ public class DiffEngine { // now that we have a specific list of values we want to show, display them GATKReport report = new GATKReport(); - final String tableName = "diffences"; - report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false); + final String tableName = "differences"; + report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false); GATKReportTable table = report.getTable(tableName); table.addPrimaryKey("Difference", true); table.addColumn("NumberOfOccurrences", 0); @@ -341,12 +342,12 @@ public class DiffEngine { return reader.readFromFile(file, maxElementsToRead); } - public static boolean simpleDiffFiles(File masterFile, File testFile, DiffEngine.SummaryReportParams params) { + public static boolean simpleDiffFiles(File masterFile, File testFile, int maxElementsToRead, DiffEngine.SummaryReportParams params) { DiffEngine diffEngine = new DiffEngine(); if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { - DiffElement master = diffEngine.createDiffableFromFile(masterFile); - DiffElement test = diffEngine.createDiffableFromFile(testFile); + DiffElement master = diffEngine.createDiffableFromFile(masterFile, maxElementsToRead); + DiffElement test = diffEngine.createDiffableFromFile(testFile, maxElementsToRead); List diffs = diffEngine.diff(master, test); diffEngine.reportSummarizedDifferences(diffs, params); return true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index fba6549fb..5889d19e5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java 
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -29,7 +29,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import java.io.File; @@ -37,42 +36,170 @@ import java.io.PrintStream; import java.util.List; /** - * Compares two record-oriented files, itemizing specific difference between equivalent - * records in the two files. Reports both itemized and summarized differences. + * A generic engine for comparing tree-structured objects + * + *

+ * Compares two record-oriented files, itemizing specific difference between equivalent + * records in the two files. Reports both itemized and summarized differences. + *

+ * + *

What are the summarized differences and the DiffObjectsWalker?

+ * + *

+ * The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: + *

    + *
  • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differs from the value in field A in record 1 in file G. + *
  • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. + *
+ *

+ * + *

+ * The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you. + *

+ * + *

Why?

+ * + *

+ * The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others. + *

+ * + *

Input

+ *

+ * The DiffObjectsWalker works with BAM or VCF files. + *

+ * + *

Output

+ *

+ * The DiffEngine system compares two hierarchical data structures for specific differences in the values of named + * nodes. Suppose I have three trees: + *

+ *     Tree1=(A=1 B=(C=2 D=3))
+ *     Tree2=(A=1 B=(C=3 D=3 E=4))
+ *     Tree3=(A=1 B=(C=4 D=3 E=4))
+ * 
+ *

+ * where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine + * traverses these data structures by name, identifies equivalent nodes by fully qualified names + * (Tree1.A is distinct from Tree2.A), and determines whether their values are equal (Tree1.A=1, Tree2.A=1, so they are). + * These itemized differences are listed as: + *

+ *     Tree1.B.C=2 != Tree2.B.C=3
+ *     Tree1.B.C=2 != Tree3.B.C=4
+ *     Tree2.B.C=3 != Tree3.B.C=4
+ *     Tree1.B.E=MISSING != Tree2.B.E=4
+ * 
+ * + *

+ * This is conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though + * is that it computes similarity among the itemized differences and displays the count of difference names + * in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs + * only once. So the summary is: + * + *

+ *     *.B.C : 3
+ *     *.B.E : 1
+ * 
+ * + *

+ * where the * operator indicates that any named field matches. This output is sorted by counts, and provides an + * immediate picture of the commonly occurring differences among the files. + *

+ * Below is a detailed example of two VCF files that differ because of a bug in the AC, AF, and AN counting routines, + * detected by the integrationtest integration (more below). You can see that although there are many specific + * instances of these differences between the two files, the summarized differences provide an immediate picture that + * the AC, AF, and AN fields are the major causes of the differences. + *

+ * + *

+   [testng] path                                                             count
+   [testng] *.*.*.AC                                                         6
+   [testng] *.*.*.AF                                                         6
+   [testng] *.*.*.AN                                                         6
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
+
+ * * @author Mark DePristo - * @version 0.1 + * @since 7/4/11 */ -@Requires(value={}) public class DiffObjectsWalker extends RodWalker { + /** + * Writes out a file of the DiffEngine format: + * + * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine + */ @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) - int MAX_OBJECTS_TO_READ = -1; - - @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) - int MAX_DIFFS = 0; - - @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) - int MAX_COUNT1_DIFFS = 0; - - @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) - int minCountForDiff = 1; - - @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) - boolean showItemizedDifferences = false; - + /** + * The master file against which we will compare test. This is one of the two required + * files to do the comparison. Conceptually master is the original file contained the expected + * results, but this doesn't currently have an impact on the calculations, but might in the future. + */ @Argument(fullName="master", shortName="m", doc="Master file: expected results", required=true) File masterFile; + /** + * The test file against which we will compare to the master. This is one of the two required + * files to do the comparison. Conceptually test is the derived file from master, but this + * doesn't currently have an impact on the calculations, but might in the future. 
+ */ @Argument(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) File testFile; - final DiffEngine diffEngine = new DiffEngine(); + /** + * The engine will read at most this number of objects from each of master and test files. This reduces + * the memory requirements for DiffObjects but does limit you to comparing at most this number of objects + */ + @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) + int MAX_OBJECTS_TO_READ = -1; + + /** + * The max number of differences to display when summarizing. For example, if there are 10M differences, but + * maxDiffs is 10, then the comparison aborts after first ten summarized differences are shown. Note that + * the system shows differences sorted by frequency, so these 10 would be the most common between the two files. + * A value of 0 means show all possible differences. + */ + @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) + int MAX_DIFFS = 0; + + /** + * The maximum number of singleton (occurs exactly once between the two files) to display when writing out + * the summary. Only applies if maxDiffs hasn't been exceeded. For example, if maxDiffs is 10 and maxCount1Diffs + * is 2 and there are 20 diffs with count > 1, then only 10 are shown, all of which have count above 1. + */ + @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) + int MAX_COUNT1_DIFFS = 0; + + /** + * Only differences that occur more than minCountForDiff are displayed. For example, if minCountForDiff is 10, then + * a difference must occur at least 10 times between the two files to be shown. 
+ */ + @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) + int minCountForDiff = 1; + + /** + * If provided, the system will write out the summarized, individual differences. May lead to enormous outputs, + * depending on how many differences are found. Note these are not sorted in any way, so if you have 10M + * common differences in the files, you will see 10M records, whereas the final summarize will just list the + * difference and its count of 10M. + */ + @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) + boolean showItemizedDifferences = false; + + DiffEngine diffEngine; @Override public void initialize() { - + this.diffEngine = new DiffEngine(); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java new file mode 100644 index 000000000..ef47ee33c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/GATKReportDiffableReader.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diffengine; + +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportColumn; +import org.broadinstitute.sting.gatk.report.GATKReportTable; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Map; + + +/** + * Class implementing diffnode reader for GATKReports + */ +public class GATKReportDiffableReader implements DiffableReader { + @Override + public String getName() { return "GATKReport"; } + + @Override + public DiffElement readFromFile(File file, int maxElementsToRead) { + DiffNode root = DiffNode.rooted(file.getName()); + try { + // one line reads the whole thing into memory + GATKReport report = new GATKReport(file); + + for (GATKReportTable table : report.getTables() ) { + root.add(tableToNode(table, root)); + } + + return root.getBinding(); + } catch ( Exception e ) { + return null; + } + } + + private DiffNode tableToNode(GATKReportTable table, DiffNode root) { + DiffNode tableRoot = DiffNode.empty(table.getTableName(), root); + + tableRoot.add("Description", table.getTableDescription()); + tableRoot.add("NumberOfRows", table.getNumRows()); + tableRoot.add("Version", table.getVersion()); + + for ( GATKReportColumn column : table.getColumns().values() ) { + DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot); + + columnRoot.add("Width", column.getColumnWidth()); + columnRoot.add("Displayable", 
column.isDisplayable()); + + int n = 1; + for ( Object elt : column.values() ) { + String name = column.getColumnName() + n++; + columnRoot.add(name, elt.toString()); + } + + tableRoot.add(columnRoot); + } + + return tableRoot; + } + + @Override + public boolean canRead(File file) { + try { + final String HEADER = GATKReport.GATKREPORT_HEADER_PREFIX; + char[] buff = new char[HEADER.length()]; + new FileReader(file).read(buff, 0, HEADER.length()); + String firstLine = new String(buff); + return firstLine.startsWith(HEADER); + } catch ( IOException e ) { + return false; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index 77a992ce0..a447d17af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -129,14 +129,6 @@ public class VCFDiffableReader implements DiffableReader { @Override public boolean canRead(File file) { - try { - final String VCF4_HEADER = "##fileformat=VCFv4"; - char[] buff = new char[VCF4_HEADER.length()]; - new FileReader(file).read(buff, 0, VCF4_HEADER.length()); - String firstLine = new String(buff); - return firstLine.startsWith(VCF4_HEADER); - } catch ( IOException e ) { - return false; - } + return AbstractVCFCodec.canDecodeFile(file, VCFCodec.VCF4_MAGIC_HEADER); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index 17426d4c1..4e2c17bf6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -25,6 +25,8 @@ package 
org.broadinstitute.sting.gatk.walkers.fasta; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,19 +35,58 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Collection; +import java.util.Collections; +import java.util.List; /** - * Generates an alternative reference sequence over the specified interval. Given variant ROD tracks, - * it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally, - * allows for a "snpmask" ROD to set overlapping bases to 'N'. + * Generates an alternative reference sequence over the specified interval. + * + *

+ * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * Note that if there are multiple variants at a site, it takes the first one seen. + * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). + * + *

Input

+ *

+ * The reference, requested intervals, and any number of variant rod files. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaAlternateReferenceMaker \
+ *   -o output.fasta \
+ *   -L input.intervals \
+ *   --variant input.vcf \
+ *   [--snpmask mask.vcf]
+ * 
+ * */ @WalkerName("FastaAlternateReferenceMaker") @Reference(window=@Window(start=-1,stop=50)) @Requires(value={DataSource.REFERENCE}) public class FastaAlternateReferenceWalker extends FastaReferenceWalker { + /** + * Variants from these input files are used by this tool to construct an alternate reference. + */ + @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) + public List> variants = Collections.emptyList(); + + /** + * Snps from this file are used as a mask when constructing the alternate reference. + */ + @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) + public RodBinding snpmask; + private int deletionBasesRemaining = 0; public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -57,28 +98,25 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker { String refBase = String.valueOf((char)ref.getBase()); - Collection vcs = tracker.getAllVariantContexts(ref); - // Check to see if we have a called snp - for ( VariantContext vc : vcs ) { + for ( VariantContext vc : tracker.getValues(variants) ) { if ( vc.isFiltered() ) continue; - if ( !vc.getSource().startsWith("snpmask") ) { - if ( vc.isDeletion()) { - deletionBasesRemaining = vc.getReference().length(); - // delete the next n bases, not this one - return new Pair(context.getLocation(), refBase); - } else if ( vc.isInsertion()) { - return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString())); - } else if (vc.isSNP()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); - } + + if ( vc.isSimpleDeletion()) { + deletionBasesRemaining = vc.getReference().length(); + // delete the next n bases, not this one + return new Pair(context.getLocation(), refBase); + } else if ( vc.isSimpleInsertion()) { + return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString())); + } else if (vc.isSNP()) { + 
return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); } } // if we don't have a called site, and we have a mask at this site, mask it - for ( VariantContext vc : vcs ) { - if ( vc.getSource().startsWith("snpmask") && vc.isSNP()) { + for ( VariantContext vc : tracker.getValues(snpmask) ) { + if ( vc.isSNP()) { return new Pair(context.getLocation(), "N"); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java index 2dbfc76ff..7ae5c5c75 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java @@ -38,14 +38,47 @@ import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; /** - * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional - * features to control the output format. + * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. + * + *

+ * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * + *

Input

+ *

+ * The reference and requested intervals. + *

+ * + *

Output

+ *

+ * A fasta file representing the requested intervals. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T FastaReferenceMaker \
+ *   -o output.fasta \
+ *   -L input.intervals
+ * 
+ * */ @WalkerName("FastaReferenceMaker") public class FastaReferenceWalker extends RefWalker, GenomeLoc> { + @Output PrintStream out; - @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60; - @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false; + + @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) + public int fastaLineWidth=60; + + /** + * Please note that when using this argument adjacent intervals will automatically be merged. + */ + @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false) + public boolean fastaRawSeqs=false; protected FastaSequence fasta; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 6c023573a..bf3606b54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -25,11 +25,11 @@ package org.broadinstitute.sting.gatk.walkers.filters; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import 
org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; @@ -45,42 +45,106 @@ import java.util.*; /** * Filters variant calls using a number of user-selectable, parameterizable criteria. + * + *

+ * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria.
+ * Records are hard-filtered by changing the value in the FILTER field to something other than PASS.
+ *
+ * <h2>Input</h2>
+ * <p>
+ * A variant set to filter.
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A filtered VCF.
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T VariantFiltration \
+ *   -o output.vcf \
+ *   --variant input.vcf \
+ *   --filterExpression "AB < 0.2 || MQ0 > 50" \
+ *   --filterName "Nov09filters" \
+ *   --mask mask.vcf \
+ *   --maskName InDel
+ * </pre>
+ * */ -@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) @Reference(window=@Window(start=-50,stop=50)) public class VariantFiltrationWalker extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * Any variant which overlaps entries from the provided mask rod will be filtered. + */ + @Input(fullName="mask", doc="Input ROD mask", required=false) + public RodBinding mask; + @Output(doc="File to which variants should be written", required=true) protected VCFWriter writer = null; - @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false) + /** + * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using + * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2"). + */ + @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false) protected ArrayList FILTER_EXPS = new ArrayList(); - @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) + + /** + * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names. + */ + @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false) protected ArrayList FILTER_NAMES = new ArrayList(); + /** + * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. 
+ * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag). + * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience + * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1"). + */ @Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false) protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList(); + + /** + * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead. + */ @Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false) protected ArrayList GENOTYPE_FILTER_NAMES = new ArrayList(); - @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false) + /** + * Works together with the --clusterWindowSize argument. + */ + @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false) protected Integer clusterSize = 3; - @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false) + + /** + * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1. 
+ */ + @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false) protected Integer clusterWindow = 0; - @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false) + @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false) protected Integer MASK_EXTEND = 0; - @Argument(fullName="maskName", shortName="mask", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false) + @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String MASK_NAME = "Mask"; - @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false) + /** + * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing. + * Use this argument to have it evaluate as failing filters instead for these cases. 
+ */ + @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean FAIL_MISSING_VALUES = false; // JEXL expressions for the filters List filterExps; List genotypeFilterExps; - public static final String INPUT_VARIANT_ROD_BINDING_NAME = "variant"; public static final String CLUSTERED_SNP_FILTER_NAME = "SnpCluster"; private ClusteredSnps clusteredSNPs = null; private GenomeLoc previousMaskPosition = null; @@ -92,8 +156,7 @@ public class VariantFiltrationWalker extends RodWalker { private void initializeVcfWriter() { - final ArrayList inputNames = new ArrayList(); - inputNames.add( INPUT_VARIANT_ROD_BINDING_NAME ); + final List inputNames = Arrays.asList(variantCollection.variants.getName()); // setup the header fields Set hInfo = new HashSet(); @@ -110,12 +173,8 @@ public class VariantFiltrationWalker extends RodWalker { if ( genotypeFilterExps.size() > 0 ) hInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); - List dataSources = getToolkit().getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - if ( source.getName().equals("mask") ) { - hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); - break; - } + if ( mask.isBound() ) { + hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask")); } writer.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); @@ -149,10 +208,10 @@ public class VariantFiltrationWalker extends RodWalker { if ( tracker == null ) return 0; - Collection VCs = tracker.getVariantContexts(ref, INPUT_VARIANT_ROD_BINDING_NAME, null, context.getLocation(), true, false); + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); // is there a SNP mask present? 
- boolean hasMask = tracker.getReferenceMetaData("mask").size() > 0; + boolean hasMask = tracker.hasValues(mask); if ( hasMask ) previousMaskPosition = ref.getLocus(); // multi-base masks will get triggered over all bases of the mask @@ -272,7 +331,7 @@ public class VariantFiltrationWalker extends RodWalker { else filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes()); - writer.add( filteredVC, context.getReferenceContext().getBase() ); + writer.add(filteredVC); } public Integer reduce(Integer value, Integer sum) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 83a8ce7d7..70f3c6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -44,7 +44,9 @@ import java.util.Set; public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { + /** The default model with the best performance in all cases */ EXACT, + /** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. 
*/ GRID_SEARCH } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 2014801e4..5f6865d04 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -276,13 +276,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( elt.isReducedRead() ) { // reduced read representation byte qual = elt.getReducedQual(); - for ( int i = 0; i < elt.getReducedCount(); i++ ) { - add(obsBase, qual, (byte)0, (byte)0); - } - return elt.getQual(); + add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods + return elt.getReducedCount(); // we added nObs bases here } else { byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0; + return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } } @@ -309,9 +307,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * @param qual1 * @param obsBase2 * @param qual2 can be 0, indicating no second base was observed for this fragment + * @param nObs The number of times this quad of values was seen. Generally 1, but reduced reads + * can have nObs > 1 for synthetic reads * @return */ - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. 
@@ -332,19 +332,17 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { for ( DiploidGenotype g : DiploidGenotype.values() ) { double likelihood = likelihoods[g.ordinal()]; - - //if ( VERBOSE ) { - // System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", - // observedBase, g, qualityScore, pow(10,likelihood) * 100, likelihood); - //} - - log10Likelihoods[g.ordinal()] += likelihood; - log10Posteriors[g.ordinal()] += likelihood; + log10Likelihoods[g.ordinal()] += likelihood * nObs; + log10Posteriors[g.ordinal()] += likelihood * nObs; } return 1; } + private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { + return add(obsBase1, qual1, obsBase2, qual2, 1); + } + // ------------------------------------------------------------------------------------- // // Dealing with the cache routines diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index cd006a3cf..6ae437b27 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -63,7 +63,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private boolean SIMPLE_GREEDY_GENOTYPER = false; - + private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
final private ExactCalculation calcToUse; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { @@ -178,22 +178,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } - private static final double[][] getGLs(Map GLs) { - double[][] genotypeLikelihoods = new double[GLs.size()+1][]; + private static final ArrayList getGLs(Map GLs) { + ArrayList genotypeLikelihoods = new ArrayList(); - int j = 0; + //int j = 0; + genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.values() ) { - j++; - if ( sample.hasLikelihoods() ) { //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector(); + double[] gls = sample.getLikelihoods().getAsVector(); + + if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) + genotypeLikelihoods.add(gls); } } return genotypeLikelihoods; } + // ------------------------------------------------------------------------------------- // // Linearized, ~O(N), implementation. 
@@ -318,9 +321,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public int linearExact(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { - final int numSamples = GLs.size(); + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; - final double[][] genotypeLikelihoods = getGLs(GLs); final ExactACCache logY = new ExactACCache(numSamples+1); logY.getkMinus0()[0] = 0.0; // the zero case @@ -334,14 +337,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( k == 0 ) { // special case for k = 0 for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA]; + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA]; } } else { // k > 0 final double[] kMinus1 = logY.getkMinus1(); final double[] kMinus2 = logY.getkMinus2(); for ( int j=1; j <= numSamples; j++ ) { - final double[] gl = genotypeLikelihoods[j]; + final double[] gl = genotypeLikelihoods.get(j); final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; double aa = Double.NEGATIVE_INFINITY; @@ -434,10 +437,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( !vc.isVariant() ) throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - boolean multiAllelicRecord = false; - - if (vc.getAlternateAlleles().size() > 1) - multiAllelicRecord = true; Map GLs = vc.getGenotypes(); double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; @@ -454,7 +453,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { pathMetricArray[0][0] = 0.0; // todo = can't deal with optimal dynamic programming solution with multiallelic records - if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) { + if 
(SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { sampleIndices.addAll(GLs.keySet()); sampleIdx = GLs.size(); } @@ -465,6 +464,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { continue; double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + + if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { + //System.out.print(sample.getKey()+":"); + //for (int k=0; k < likelihoods.length; k++) + // System.out.format("%4.2f ",likelihoods[k]); + //System.out.println(); + // all likelihoods are essentially the same: skip this sample and will later on force no call. + //sampleIdx++; + continue; + } + sampleIndices.add(sample.getKey()); for (int k=0; k <= AFofMaxLikelihood; k++) { @@ -504,22 +514,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { Genotype g = GLs.get(sample); if ( !g.hasLikelihoods() ) continue; - - if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) - bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); - else { - int newIdx = tracebackArray[k][startIdx]; - bestGTguess = startIdx - newIdx; - startIdx = newIdx; - } - + // if all likelihoods are essentially the same: we want to force no-call. 
In this case, we skip this sample for now, + // and will add no-call genotype to GL's in a second pass ArrayList myAlleles = new ArrayList(); double qual = Double.NEGATIVE_INFINITY; double[] likelihoods = g.getLikelihoods().getAsVector(); + + if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { + bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); + } + else { + int newIdx = tracebackArray[k][startIdx];; + bestGTguess = startIdx - newIdx; + startIdx = newIdx; + } + /* System.out.format("Sample: %s GL:",sample); for (int i=0; i < likelihoods.length; i++) - System.out.format("%1.4f ",likelihoods[i]); + System.out.format("%1.4f, ",likelihoods[i]); */ for (int i=0; i < likelihoods.length; i++) { @@ -570,6 +583,25 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } + for ( Map.Entry sample : GLs.entrySet() ) { + + if ( !sample.getValue().hasLikelihoods() ) + continue; + Genotype g = GLs.get(sample.getKey()); + + double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); + + if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL) + continue; // regular likelihoods + + ArrayList myAlleles = new ArrayList(); + + double qual = Genotype.NO_NEG_LOG_10PERROR; + myAlleles.add(Allele.NO_CALL); + myAlleles.add(Allele.NO_CALL); + //System.out.println(myAlleles.toString()); + calls.put(sample.getKey(), new Genotype(sample.getKey(), myAlleles, qual, null, g.getAttributes(), false)); + } return calls; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 8261cd588..60dfe4fe7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -26,6 +26,7 @@ package 
org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -35,6 +36,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Map; @@ -51,7 +53,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { } public enum GENOTYPING_MODE { + /** the default; the Unified Genotyper will choose the most likely alternate allele */ DISCOVERY, + /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ GENOTYPE_GIVEN_ALLELES } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 60ea601d5..ec5eefd60 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -32,7 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import 
org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.Haplotype; @@ -293,6 +295,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return aList; } + + private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); + public Allele getLikelihoods(RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, @@ -318,11 +323,10 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood haplotypeMap.clear(); if (getAlleleListFromVCF) { - EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL); - allowableTypes.add(VariantContext.Type.MIXED); - for( final VariantContext vc_input : tracker.getVariantContexts(ref, "alleles", - allowableTypes, ref.getLocus(), false, false) ) { - if( vc_input != null && ref.getLocus().getStart() == vc_input.getStart()) { + for( final VariantContext vc_input : tracker.getValues(UAC.alleles, loc) ) { + if( vc_input != null && + allowableTypes.contains(vc_input.getType()) && + ref.getLocus().getStart() == vc_input.getStart()) { vc = vc_input; break; } @@ -411,16 +415,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (pileup != null ) { double[] genotypeLikelihoods; + if (useOldWrongHorribleHackedUpLikelihoodModel) genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); else genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - - // which genotype likelihoods correspond to two most likely alleles? 
By convention, likelihood vector is ordered as for example - // for 3 alleles it's 00 01 11 02 12 22 - GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), + GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), alleleList, genotypeLikelihoods, getFilteredDepth(pileup))); @@ -442,4 +444,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return indelLikelihoodMap.get(); } + // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, + // so that per-sample DP will include deletions covering the event. + protected int getFilteredDepth(ReadBackedPileup pileup) { + int count = 0; + for ( PileupElement p : pileup ) { + if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()) ) + count++; + } + + return count; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 9d917078d..6905ce4a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -57,25 +56,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends 
GenotypeLikelihoodsC useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; } - public static VariantContext getSNPVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, boolean requireSNP, Logger logger) { - if ( tracker == null || ref == null || logger == null ) - throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger); - VariantContext vc = null; - - // search for usable record - for( final VariantContext vc_input : tracker.getVariantContexts(ref, "alleles", null, ref.getLocus(), true, false) ) { - if ( vc_input != null && ! vc_input.isFiltered() && (! requireSNP || vc_input.isSNP() )) { - if ( vc == null ) { - vc = vc_input; - } else { - logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record"); - } - } - } - - return vc; - } - public Allele getLikelihoods(RefMetaDataTracker tracker, ReferenceContext ref, Map contexts, @@ -95,7 +75,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( alternateAlleleToUse != null ) { bestAlternateAllele = alternateAlleleToUse.getBases()[0]; } else if ( useAlleleFromVCF ) { - VariantContext vc = getSNPVCFromAllelesRod(tracker, ref, true, logger); + VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); // ignore places where we don't have a variant if ( vc == null ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java index 22c3081a3..503d87cbe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java @@ -49,7 +49,6 @@ import java.util.TreeSet; * the name 'allele' so we know which alternate allele to use at 
each site. */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@Requires(value={},referenceMetaData=@RMD(name="alleles", type= VariantContext.class)) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.READS) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) @@ -93,7 +92,7 @@ public class UGCalcLikelihoods extends LocusWalker public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext); - return call == null ? null : new VariantCallContext(call, refContext.getBase(), true); + return call == null ? null : new VariantCallContext(call, true); } public Integer reduceInit() { return 0; } @@ -107,7 +106,7 @@ public class UGCalcLikelihoods extends LocusWalker return sum; try { - writer.add(value, value.refBase); + writer.add(value); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index 68d8f9b54..500b11360 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -25,7 +25,9 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; @@ -51,6 +53,9 @@ public class UGCallVariants extends RodWalker { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variants; + // control the output @Output(doc="File to which variants should be written",required=true) protected VCFWriter writer = null; @@ -62,15 +67,9 @@ public class UGCallVariants extends RodWalker { private Set trackNames = new HashSet(); public void initialize() { - UAC.NO_SLOD = true; - - for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) { - if ( d.getName().startsWith("variant") ) - trackNames.add(d.getName()); - } - if ( trackNames.size() == 0 ) - throw new UserException("At least one track bound to a name beginning with 'variant' must be provided."); + for ( RodBinding rb : variants ) + trackNames.add(rb.getName()); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), trackNames); UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); @@ -94,11 +93,7 @@ public class UGCallVariants extends RodWalker { if ( tracker == null ) return null; - List VCs = new ArrayList(); - for ( String name : trackNames ) { - Collection vc = tracker.getVariantContexts(ref, name, null, context.getLocation(), true, true); - VCs.addAll(vc); - } + List VCs = tracker.getValues(variants, context.getLocation()); VariantContext mergedVC = mergeVCsWithGLs(VCs); if ( mergedVC == null ) @@ -116,7 +111,7 @@ public class UGCallVariants extends RodWalker { try { Map attrs = new HashMap(value.getAttributes()); VariantContextUtils.calculateChromosomeCounts(value, attrs, true); - writer.add(VariantContext.modifyAttributes(value, attrs), value.refBase); + writer.add(VariantContext.modifyAttributes(value, attrs)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() 
+ "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 2b25df4aa..7b8045581 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,40 +27,76 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; public class UnifiedArgumentCollection { - // control the various models to be used @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + /** + * The expected heterozygosity value used to compute prior likelihoods for any locus. 
The default priors are: + * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 + */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + /** + * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily + * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it + * effectively acts as a cap on the base qualities. + */ @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + /** + * Specifies how to determine the alternate allele to use for genotyping + */ @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with + * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this + * is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 
4x per sample) calling might + * be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to + * over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4). + */ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + /** + * the minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less + * than the calling threshold are emitted but marked as filtered. + */ @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) - public boolean NO_SLOD = false; + /** + * This argument is not enabled by default because it increases the runtime by an appreciable amount. 
+ */ + @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) + public boolean COMPUTE_SLOD = false; + /** + * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provided in this rod binding + */ + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false) + public RodBinding alleles; // control the error modes @Hidden @@ -72,7 +108,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false) public int COVERAGE_AT_WHICH_TO_ABORT = -1; - // control the various parameters to be used @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 17; @@ -83,11 +118,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - // indel-related arguments + /** + * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. + * Decreasing this value will increase sensitivity but at the cost of larger calling time and a larger number of false positives. + */ + @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * This argument informs the prior probability of having an indel at a site. 
+ */ @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; @@ -118,22 +159,23 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; + @Hidden - @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false) + @Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false) public boolean dovit = false; + @Hidden @Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false) public boolean GSA_PRODUCTION_ONLY = false; + @Hidden - @Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false) public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL; @Hidden - @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) + @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; - @Deprecated @Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false) private Boolean ALL_BASES_DEPRECATED = false; @@ -154,7 +196,7 @@ public class UnifiedArgumentCollection { uac.PCR_error = PCR_error; uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; + uac.COMPUTE_SLOD = COMPUTE_SLOD; uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = 
STANDARD_CONFIDENCE_FOR_EMITTING; @@ -168,6 +210,7 @@ public class UnifiedArgumentCollection { uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES; + uac.alleles = alleles; uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA; uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 2a0338bca..428f97e2a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -25,41 +25,115 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.*; - /** - * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and - * multi-sample data. The user can choose from several different incorporated calculation models. + * A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data. + * + *

+ * The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype + * likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples, + * emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the + * genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes + * homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on + * both single sample data and multi-sample data. + * + *

Input

+ *

+ * The read data from which to make variant calls. + *

+ * + *

Output

+ *

+ * A raw, unfiltered, highly sensitive callset in VCF format. + *

+ * + *

Example generic command for multi-sample SNP calling

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -T UnifiedGenotyper \
+ *   -I sample1.bam [-I sample2.bam ...] \
+ *   --dbsnp dbSNP.vcf \
+ *   -o snps.raw.vcf \
+ *   -stand_call_conf [50.0] \
+ *   -stand_emit_conf 10.0 \
+ *   -dcov [50] \
+ *   [-L targets.interval_list]
+ * 
+ * + *

+ * The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file + * with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle. Several + * arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed + * argument descriptions below. + *

+ * + *

Example command for generating calls at all sites

+ *
+ * java -jar /path/to/GenomeAnalysisTK.jar \
+ *   -l INFO \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -T UnifiedGenotyper \
+ *   -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \
+ *   -o my.vcf \
+ *   --output_mode EMIT_ALL_SITES
+ * 
+ * + *

Caveats

+ *
    + *
  • The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and + * file formats are likely to change.
  • + *
  • The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x) + * we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate + * most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.
  • + *
  • We only handle diploid genotypes
  • + *
+ * */ + @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) -@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableReadFilter.class} ) +@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyper extends LocusWalker implements TreeReducible { +public class UnifiedGenotyper extends LocusWalker implements TreeReducible, AnnotatorCompatibleWalker { - @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + @ArgumentCollection + private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - // control the output + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + public RodBinding getVariantRodBinding() { return null; } + public RodBinding getSnpEffRodBinding() { return null; } + public List> getCompRodBindings() { return Collections.emptyList(); } + public List> getResourceRodBindings() { return Collections.emptyList(); } + + /** + * A raw, unfiltered, highly specific callset in VCF format. + */ @Output(doc="File to which variants should be written",required=true) protected VCFWriter writer = null; @@ -69,9 +143,15 @@ public class UnifiedGenotyper extends LocusWalker annotationsToUse = new ArrayList(); + /** + * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
+ */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected String[] annotationClassesToUse = { "Standard" }; @@ -130,7 +210,7 @@ public class UnifiedGenotyper extends LocusWalker dataSources = getToolkit().getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - } - else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) { - String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length()); - headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership")); - } - } + if ( dbsnp.dbsnp.isBound() ) + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); // FORMAT and INFO fields headerInfo.addAll(getSupportedHeaderStrings()); @@ -227,7 +299,7 @@ public class UnifiedGenotyper extends LocusWalker stratifiedContexts, AlignmentContext rawContext) { VariantContext vc; if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, ref, false, logger); + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null ) return null; - vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles()); + vc = new VariantContext("UG_call", vcInput.getChr(), vcInput.getStart(), vcInput.getEnd(), vcInput.getAlleles(), InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, ref.getBase()); + } else { // deal with bad/non-standard reference bases if ( !Allele.acceptableAlleleBases(new 
byte[]{ref.getBase()}) ) @@ -247,7 +251,7 @@ public class UnifiedGenotyperEngine { } if ( annotationEngine != null ) { - // we want to use the *unfiltered* and *unBAQed* context for the annotations + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations ReadBackedPileup pileup = null; if (rawContext.hasExtendedEventPileup()) pileup = rawContext.getExtendedEventPileup(); @@ -255,10 +259,10 @@ public class UnifiedGenotyperEngine { pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc).iterator().next(); + vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc); } - return new VariantCallContext(vc, ref.getBase(), false); + return new VariantCallContext(vc, false); } private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { @@ -300,7 +304,8 @@ public class UnifiedGenotyperEngine { genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, - null); + null, + refContext.getBase()); } // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine @@ -372,8 +377,8 @@ public class UnifiedGenotyperEngine { attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - if ( !UAC.NO_SLOD && bestAFguess != 0 ) { - final boolean DEBUG_SLOD = false; + if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) { + //final boolean DEBUG_SLOD = false; // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); @@ -381,7 +386,7 @@ public class UnifiedGenotyperEngine { afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double overallLog10PofNull = 
log10AlleleFrequencyPosteriors.get()[0]; double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); @@ -390,7 +395,7 @@ public class UnifiedGenotyperEngine { //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); + //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); @@ -399,11 +404,11 @@ public class UnifiedGenotyperEngine { //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); + //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; double reverseLod = reverseLog10PofF + 
forwardLog10PofNull - overallLog10PofF; - if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); // strand score is max bias between forward and reverse strands double strandScore = Math.max(forwardLod, reverseLod); @@ -425,10 +430,10 @@ public class UnifiedGenotyperEngine { myAlleles.add(vc.getReference()); } VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes); + myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, refContext.getBase()); if ( annotationEngine != null ) { - // first off, we want to use the *unfiltered* and *unBAQed* context for the annotations + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations ReadBackedPileup pileup = null; if (rawContext.hasExtendedEventPileup()) pileup = rawContext.getExtendedEventPileup(); @@ -436,13 +441,10 @@ public class UnifiedGenotyperEngine { pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); - vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element. 
+ vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); } - VariantCallContext call = new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - call.setRefBase(refContext.getBase()); - return call; + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } private int calculateEndPos(Set alleles, Allele refAllele, GenomeLoc loc) { @@ -633,7 +635,7 @@ public class UnifiedGenotyperEngine { // no extended event pileup // if we're genotyping given alleles and we have a requested SNP at this position, do SNP if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { - VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger); + VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); if (vcInput == null) return null; @@ -739,4 +741,23 @@ public class UnifiedGenotyperEngine { return afcm; } + + public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { + if ( tracker == null || ref == null || logger == null ) + throw new ReviewedStingException("Bad arguments: tracker=" + tracker + " ref=" + ref + " logger=" + logger); + VariantContext vc = null; + + // search for usable record + for( final VariantContext vc_input : tracker.getValues(allelesBinding, loc) ) { + if ( vc_input != null && ! vc_input.isFiltered() && (! 
requireSNP || vc_input.isSNP() )) { + if ( vc == null ) { + vc = vc_input; + } else { + logger.warn("Multiple valid VCF records detected in the alleles input file at site " + ref.getLocus() + ", only considering the first record"); + } + } + } + + return vc; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java index 5896e784e..423c80112 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java @@ -36,7 +36,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; * Useful helper class to communicate the results of calculateGenotype to framework */ public class VariantCallContext extends VariantContext { - public byte refBase; // Was the site called confidently, either reference or variant? public boolean confidentlyCalled = false; @@ -55,16 +54,6 @@ public class VariantCallContext extends VariantContext { this.shouldEmit = shouldEmit; } - VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) { - super(vc); - this.refBase = ref; - this.confidentlyCalled = confidentlyCalledP; - } - - public void setRefBase(byte ref) { - this.refBase = ref; - } - /* these methods are only implemented for GENOTYPE_GIVEN_ALLELES MODE */ //todo -- expand these methods to all modes diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 61f21c488..8680f3537 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -30,16 +30,12 @@ import net.sf.samtools.*; import net.sf.samtools.util.RuntimeIOException; import 
net.sf.samtools.util.SequenceUtil; import net.sf.samtools.util.StringUtil; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -69,10 +65,53 @@ import java.util.*; /** * Performs local realignment of reads based on misalignments due to the presence of indels. - * Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *

    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. + *
  3. Running the realigner over those intervals (IndelRealigner)
  4. + *
+ *

+ * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + *

+ * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A realigned version of your input BAM file(s). + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -I input.bam \
+ *   -R ref.fasta \
+ *   -T IndelRealigner \
+ *   -targetIntervals intervalListFromRTC.intervals \
+ *   -o realignedBam.bam \
+ *   [--known /path/to/indels.vcf] \
+ *   [-compress 0]    (this argument is recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
+ * 
+ * + * @author ebanks */ -//Reference(window=@Window(start=-30,stop=30)) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) public class IndelRealigner extends ReadWalker { @@ -81,100 +120,145 @@ public class IndelRealigner extends ReadWalker { public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; public enum ConsensusDeterminationModel { + /** + * Uses only indels from a provided ROD of known indels. + */ KNOWNS_ONLY, + /** + * Additionally uses indels already present in the original alignments of the reads. + */ USE_READS, + /** + * Additionally uses 'Smith-Waterman' to generate alternate consenses. + */ USE_SW } + /** + * Any number of VCF files representing known indels to be used for constructing alternate consenses. + * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. + */ + @Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false) + public List> known = Collections.emptyList(); + + /** + * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). + */ @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) protected String intervalsFile = null; + /** + * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number + * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, + * this number should be smaller. 
+ */ @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) protected double LOD_THRESHOLD = 5.0; - @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) - protected double MISMATCH_THRESHOLD = 0.15; - + /** + * The realigned bam file. + */ @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; protected ConstrainedMateFixingManager manager = null; protected SAMFileWriter writerToUse = null; - @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) + /** + * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; + * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). + */ + @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; // ADVANCED OPTIONS FOLLOW - @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ - "Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. 
If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false) + /** + * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner + * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply + * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. + */ + @Advanced + @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + protected double MISMATCH_THRESHOLD = 0.15; + + /** + * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; + * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. + */ + @Advanced + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) protected int MAX_RECORDS_IN_MEMORY = 150000; + /** + * For expert users only! + */ + @Advanced @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) protected int MAX_ISIZE_FOR_MOVEMENT = 3000; + /** + * For expert users only! + */ + @Advanced @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false) protected int MAX_POS_MOVE_ALLOWED = 200; + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. 
+ */ + @Advanced @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) protected int MAX_CONSENSUSES = 30; + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ + @Advanced @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) protected int MAX_READS_FOR_CONSENSUSES = 120; - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment; "+ - "if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false) + /** + * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. + * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. + */ + @Advanced + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) protected int MAX_READS = 20000; - @Argument(fullName="noPGTag", shortName="noPG", required=false, - doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. 
"+ - "This option is required in order to pass integration tests.") - protected boolean NO_PG_TAG = false; - - @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, - doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.") + @Advanced + @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; - @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, - doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, "+ - "it will throw an exception. Use this argument when your interval list is not sorted to instruct "+"" + - "the Realigner to first sort it in memory.") + /** + * For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. + * Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory. + */ + @Advanced + @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted") protected boolean TARGET_NOT_SORTED = false; - //NWay output: testing, not ready for the prime time, hence hidden: - - @Hidden - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, - doc="Generate one output file for each input (-I) bam file. Reads from all input files "+ - "will be realigned together, but then each read will be saved in the output file corresponding to "+ - "the input file the read came from. There are two ways to generate output bam file names: 1) if the "+ - "value of this argument is a general string (e.g. 
'.cleaned.bam'), then "+ - "extensions (\".bam\" or \".sam\") will be stripped from the input file names and the provided string value "+ - "will be pasted on instead; 2) if the value ends with a '.map' (e.g. input_output.map), then " + - "the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" + - "for each input file name (1st column).") + /** + * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that + * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), + * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the + * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output + * file name (2nd column) for each input file name (1st column). + */ + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; + + + // DEBUGGING OPTIONS FOLLOW + @Hidden @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") protected boolean CHECKEARLY = false; - - // DEPRECATED - - @Deprecated - @Argument(fullName="sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", doc="This argument is no longer used.", required=false) - protected boolean DEPRECATED_SORT_IN_COORDINATE_ORDER = false; - - @Deprecated - @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) - protected boolean DEPRECATED_REALIGN_MATES = false; - - @Deprecated - @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", doc="This argument is no longer used. 
See --consensusDeterminationModel instead.", required=false) - protected boolean DEPRECATED_KNOWNS_ONLY = false; - - - // DEBUGGING OPTIONS FOLLOW + @Hidden + @Argument(fullName="noPGTag", shortName="noPG", required=false, + doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") + protected boolean NO_PG_TAG = false; @Hidden @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") @@ -558,8 +642,8 @@ public class IndelRealigner extends ReadWalker { if ( indelRodsSeen.contains(rod) ) continue; indelRodsSeen.add(rod); - if ( VariantContextAdaptors.canBeConvertedToVariantContext(rod)) - knownIndelsToTry.add(VariantContextAdaptors.toVariantContext("", rod, ref)); + if ( rod instanceof VariantContext ) + knownIndelsToTry.add((VariantContext)rod); } } } @@ -802,7 +886,7 @@ public class IndelRealigner extends ReadWalker { for ( VariantContext knownIndel : knownIndelsToTry ) { if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) continue; - byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); + byte[] indelStr = knownIndel.isSimpleInsertion() ? 
knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); int start = knownIndel.getStart() - leftmostIndex + 1; Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); if ( c != null ) @@ -1004,11 +1088,11 @@ public class IndelRealigner extends ReadWalker { if ( indexOnRef > 0 ) cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); - if ( indel.isDeletion() ) { + if ( indel.isSimpleDeletion() ) { refIdx += indelStr.length; cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); } - else if ( indel.isInsertion() ) { + else if ( indel.isSimpleInsertion() ) { for ( byte b : indelStr ) sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index af8051334..17d5a8e9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.sam.AlignmentUtils; + /** - * Left aligns indels in reads. + * Left-aligns indels from reads in a bam file. + * + *

+ * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an + * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * + *

Input

+ *

+ * A bam file to left-align. + *

+ * + *

Output

+ *

+ * A left-aligned bam. + *

+ * + *

Examples

+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T LeftAlignIndels \
+ *   -I input.bam \
+ *   -o output.bam
+ * 
+ * */ public class LeftAlignIndels extends ReadWalker { @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; - @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ - "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false) + /** + * If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool + * may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available + * space (by setting java.io.tmpdir on the command-line). + */ + @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false) protected int MAX_RECORDS_IN_RAM = 500000; public void initialize() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 55450486b..2d7969230 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -274,7 +274,7 @@ public class PairHMMIndelErrorModel { this.doViterbi = dovit; } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) { + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) { this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob @@ -754,7 +754,7 @@ public class PairHMMIndelErrorModel { // check if we've already computed likelihoods for this pileup element (i.e. 
for this read at this location) if (indelLikelihoodMap.containsKey(p)) { - HashMap el = indelLikelihoodMap.get(p); + HashMap el = indelLikelihoodMap.get(p); int j=0; for (Allele a: haplotypeMap.keySet()) { readLikelihoods[readIdx][j++] = el.get(a); @@ -1055,7 +1055,6 @@ public class PairHMMIndelErrorModel { genotypeLikelihoods[i] -= maxElement; return genotypeLikelihoods; - } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 488e37f26..bede50a0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -26,12 +26,14 @@ package org.broadinstitute.sting.gatk.walkers.indels; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadCigarFilter; import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -46,36 +48,98 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.ArrayList; +import java.util.Collections; +import java.util.List; /** - * Emits intervals for the Local Indel Realigner to target for cleaning. 
Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. + * Emits intervals for the Local Indel Realigner to target for realignment. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *

    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)
  2. + *
  3. Running the realigner over those intervals (see the IndelRealigner tool)
  4. + *
+ *

+ * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + *

+ * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. + * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A list of target intervals to pass to the Indel Realigner. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -I input.bam \
+ *   -R ref.fasta \
+ *   -T RealignerTargetCreator \
+ *   -o forIndelRealigner.intervals \
+ *   [--known /path/to/indels.vcf]
+ * 
+ * + * @author ebanks */ -@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, BadCigarFilter.class}) +@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, BadCigarFilter.class}) @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) public class RealignerTargetCreator extends RodWalker { + + /** + * The target intervals for realignment. + */ @Output protected PrintStream out; - // mismatch/entropy/SNP arguments + /** + * Any number of VCF files representing known SNPs and/or indels. Could be e.g. dbSNP and/or official 1000 Genomes indel calls. + * SNPs in these files will be ignored unless the --mismatchFraction argument is used. + */ + @Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false) + public List> known = Collections.emptyList(); + + /** + * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. + */ @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) protected int windowSize = 10; - @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false) - protected double mismatchThreshold = 0.15; + /** + * To disable this behavior, set this value to <= 0 or > 1. This feature is really only necessary when using an ungapped aligner + * (e.g. MAQ in the case of single-end read data) and should be used in conjunction with '--model USE_SW' in the IndelRealigner. 
+ */ + @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy", required=false) + protected double mismatchThreshold = 0.0; @Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false) protected int minReadsAtLocus = 4; - // interval merging arguments + /** + * Because the realignment algorithm is N^2, allowing too large an interval might take too long to completely realign. + */ @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false) protected int maxIntervalSize = 500; - @Deprecated - @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) - protected boolean DEPRECATED_REALIGN_MATES = false; @Override public boolean generateExtendedEvents() { return true; } @@ -110,11 +174,11 @@ public class RealignerTargetCreator extends RodWalker { -// @Output -// PrintStream out; - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcf_writer = null; - - @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). 
DEPRECATED> Use --bed", required=true) - @Deprecated - java.io.File output_file; - - @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false) - public PrintStream metricsWriter = null; - -// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) -// boolean FORMAT_VCF = false; - - @Hidden - @Argument(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) - public String genotypeIntervalsFile = null; - - @Hidden - @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. "+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") - protected boolean GENOTYPE_NOT_SORTED = false; - - @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) - boolean call_unpaired = false; - boolean call_somatic ; - - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; - - @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) - java.io.File bedOutput = null; - - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false) - int minCoverage = 6; - - @Argument(fullName="minNormalCoverage", 
shortName="minNormalCoverage", - doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) - int minNormalCoverage = 4; - - @Argument(fullName="minFraction", shortName="minFraction", - doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ - " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) - double minFraction = 0.3; - - @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", - doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false) - double minConsensusFraction = 0.7; - - @Argument(fullName="minIndelCount", shortName="minCnt", - doc="Minimum count of reads supporting consensus indel required for making the call. "+ - " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ - "(minIndelCount not met) will not pass.", required=false) - int minIndelCount = 0; - - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) - String RefseqFileName = null; - - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. 
Reads coming from these lanes will never be seen "+ - "by this application, so they will not contribute indels to consider and will not be counted.", required=false) - PlatformUnitFilterHelper dummy; - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false; - @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200; - @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ - " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; - - private WindowContext tumor_context; - private WindowContext normal_context; - private int currentContigIndex = -1; - private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends... 
- private int currentPosition = -1; // position of the last read we've seen on the current contig - private String refName = null; - private java.io.Writer output = null; - private GenomeLoc location = null; - private long normalCallsMade = 0L, tumorCallsMade = 0L; - - boolean outOfContigUserWarned = false; - - private LocationAwareSeekableRODIterator refseqIterator=null; - -// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able -// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream - private Set normalSamples; // we are going to remember which samples are normal and which are tumor: - private Set tumorSamples ; // these are used only to generate genotypes for vcf output - - private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics - - private Writer bedWriter = null; - private Writer verboseWriter = null; - - - private static String annGenomic = "GENOMIC"; - private static String annIntron = "INTRON"; - private static String annUTR = "UTR"; - private static String annCoding = "CODING"; - private static String annUnknown = "UNKNOWN"; - - enum CallType { - NOCOVERAGE, - BADCOVERAGE, - NOEVIDENCE, - GERMLINE, - SOMATIC - }; - - private SAMRecord lastRead; - private byte[] refBases; - private ReferenceDataSource refData; - private Iterator genotypeIntervalIterator = null; - - // the current interval in the list of intervals, for which we want to do full genotyping - private GenomeLoc currentGenotypeInterval = null; - private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; - // can be 1 base before lastGenotyped start - - - // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" - - private Set getVCFHeaderInfo() { - Set headerInfo = new HashSet(); - - // first, the basic info - headerInfo.add(new VCFHeaderLine("source", "IndelGenotyperV2")); - 
headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - // FORMAT and INFO fields -// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); - - headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines()); - if ( call_somatic ) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); - } else { - } - - // all of the arguments from the argument collection - Set args = new HashSet(); - args.add(this); - args.addAll(getToolkit().getFilters()); - Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); - for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) - headerInfo.add(new VCFHeaderLine(String.format("IGv2_%s", commandLineArg.getKey()), commandLineArg.getValue())); - // also, the list of input bams - for ( String fileName : getToolkit().getArguments().samFiles ) - headerInfo.add(new VCFHeaderLine("IGv2_bam_file_used", fileName)); - - return headerInfo; - } - - - @Override - public void initialize() { - - call_somatic = (call_unpaired ? 
false : true); - normal_context = new WindowContext(0,WINDOW_SIZE); - normalSamples = new HashSet(); - - if ( bedOutput != null && output_file != null ) { - throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time"); - } - - if ( RefseqFileName != null ) { - logger.info("Using RefSeq annotations from "+RefseqFileName); - - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - - refseqIterator = new SeekableRODIterator(refseq.getHeader(), - refseq.getSequenceDictionary(), - getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - refseq.getIterator()); - } - - if ( refseqIterator == null ) logger.info("No gene annotations available"); - - int nSams = getToolkit().getArguments().samFiles.size(); - - if ( call_somatic ) { - if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified"); - tumor_context = new WindowContext(0,WINDOW_SIZE); - tumorSamples = new HashSet(); - } - - int nNorm = 0; - int nTum = 0; - for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) { - Tags tags = rid.getTags() ; - if ( tags.getPositionalTags().isEmpty() && call_somatic ) - throw new UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. 
Untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - boolean normal = false; - boolean tumor = false; - for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal = true; - nNorm++; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor = true; - nTum++ ; - } - } - if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+ - getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??"); - if ( call_somatic && !normal && ! tumor ) - throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - if ( ! call_somatic && (normal || tumor) ) - System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid) - +" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED"); - if ( call_somatic && tumor ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - tumorSamples.add(rg.getSample()); - } - } else { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - normalSamples.add(rg.getSample()); - } - } - if ( genotypeIntervalsFile != null ) { - - if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). 
- genotypeIntervalIterator = new IntervalFileMergingIterator(getToolkit().getGenomeLocParser(), - new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - } else { - // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), - IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(genotypeIntervalsFile),true), IntervalMergingRule.OVERLAPPING_ONLY); - genotypeIntervalIterator = locs.iterator(); - } - - // wrap intervals requested for genotyping inside overlapping iterator, so that we actually - // genotype only on the intersections of the requested intervals with the -L intervals - genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() ); - - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - - if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval); - - if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1; - } - - } - - location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - - normalSamples = getToolkit().getSamplesByReaders().get(0); - - try { - // we already checked that bedOutput and output_file are not set simultaneously - if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); - if ( output_file != null ) bedWriter = new FileWriter(output_file); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e); - } - try { - if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e); - } - - 
vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; - refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); - } - - - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); - - if ( DEBUG ) { - // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ - // "("+read.getCigarString()+")"); - if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); - } - - if ( AlignmentUtils.isReadUnmapped(read) || - read.getDuplicateReadFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 ) { - return 0; // we do not need those reads! - } - - if ( read.getReferenceIndex() != currentContigIndex ) { - // we just jumped onto a new contig - if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName()); - if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); - - // print remaining indels from the previous contig (if any); - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); - - currentContigIndex = read.getReferenceIndex(); - currentPosition = read.getAlignmentStart(); - refName = new String(read.getReferenceName()); - - location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop()); - contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength(); - outOfContigUserWarned = false; - - lastGenotypedPosition = -1; - - normal_context.clear(); // reset coverage window; this will also set reference position to 0 - if ( call_somatic) 
tumor_context.clear(); - - refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes(); - } - - // we have reset the window to the new contig if it was required and emitted everything we collected - // on a previous contig. At this point we are guaranteed that we are set up properly for working - // with the contig of the current read. - - // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and - // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe - - if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+ - "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition - +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" - +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); - - currentPosition = read.getAlignmentStart(); - lastRead = read; - - if ( read.getAlignmentEnd() > contigLength ) { - if ( ! 
outOfContigUserWarned ) { - System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped"); - outOfContigUserWarned = true; - } - return 0; - } - - long alignmentEnd = read.getAlignmentEnd(); - Cigar c = read.getCigar(); - int lastNonClippedElement = 0; // reverse offset to the last unclipped element - CigarOperator op = null; - // moving backwards from the end of the cigar, skip trailing S or H cigar elements: - do { - lastNonClippedElement++; - op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator(); - } while ( op == CigarOperator.H || op == CigarOperator.S ); - - // now op is the last non-S/H operator in the cigar. - - // a little trick here: we want to make sure that current read completely fits into the current - // window so that we can accumulate indel observations over the whole length of the read. - // The ::getAlignmentEnd() method returns the last position on the reference where bases from the - // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end - // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, - // in this program we assign insertions, internally, to the first base *after* the insertion position. - // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. - - if ( op == CigarOperator.I) alignmentEnd++; - - if ( alignmentEnd > normal_context.getStop()) { - - // we don't emit anything until we reach a read that does not fit into the current window. - // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to - // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions - // below the current read's start. 
- // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting - // the window to around the read's start will ensure that the read fits... - - if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+ - read.getAlignmentStart()+": trying to emit and shift" ); - if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); - else emit( read.getAlignmentStart(), false ); - - // let's double check now that the read fits after the shift - if ( read.getAlignmentEnd() > normal_context.getStop()) { - // ooops, looks like the read does not fit into the window even after the latter was shifted!! - throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ - "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ - "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); - } - } - - if ( call_somatic ) { - - Tags tags = getToolkit().getReaderIDForRead(read).getTags(); - boolean assigned = false; - for ( String s : tags.getPositionalTags() ) { - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal_context.add(read,ref.getBases()); - assigned = true; - break; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor_context.add(read,ref.getBases()); - assigned = true; - break; - } - } - if ( ! assigned ) - throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ - "has no Normal/Tumor tag associated with it"); - -// String rg = (String)read.getAttribute("RG"); -// if ( rg == null ) -// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. 
RG is required for somatic calls."); - -// if ( normalReadGroups.contains(rg) ) { -// normal_context.add(read,ref.getBases()); -// } else if ( tumorReadGroups.contains(rg) ) { -// tumor_context.add(read,ref.getBases()); -// } else { -// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg); -// } - - if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped."); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped"); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - - - } else { - normal_context.add(read, ref.getBases()); - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped"); - normal_context.shift(WINDOW_SIZE); - } - } - - return 1; - } - - /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */ - private boolean pastInterval(long p, GenomeLoc l) { - return ( location.getContigIndex() > l.getContigIndex() || - location.getContigIndex() == l.getContigIndex() && p > l.getStop() ); - } - - /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to - * pos-1, inclusive. 
- * @param contigIndex - * @param pos - * @param call - */ - /* - private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) { - - if ( contigIndex < currentGenotypeInterval.getContigIndex() || - contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return; - - if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) { - for ( long p = lastGenotypedPosition+1; p < pos; p++ ) { - - } - } - while( currentGenotypeInterval != null ) { - - while ( ) - if ( genotypeIntervalIterator.hasNext() ) { - currentGenotypeInterval = genotypeIntervalIterator.next() ; - if ( pastInterval(p,currentGenotypeInterval) ) { - // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there! - emitNoCoverageCalls(currentGenotypeInterval); - } - } else { - currentGenotypeInterval = null; - } - } - } -*/ - - /** Output indel calls up to the specified position and shift the window: after this method is executed, the - * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more - * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. 
- * - * @param position - */ - private void emit(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - return; - } - long move_to = adjustedPosition; - - for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach this point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } - -// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue; - - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - - if ( normalCall.getCoverage() < minCoverage && ! 
genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage - } - - if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); - - long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); - long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right: - if ( right > normal_context.getStop() ) right = normal_context.getStop(); - - // location = getToolkit().getGenomeLocParser().setStart(location,pos); - // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); - - boolean haveCall = normalCall.isCall(); // cache the value - - if ( haveCall || genotype) { - if ( haveCall ) normalCallsMade++; - printVCFLine(vcf_writer,normalCall); - if ( bedWriter != null ) normalCall.printBedLine(bedWriter); - if ( verboseWriter != null ) 
printVerboseLine(verboseWriter, normalCall); - lastGenotypedPosition = pos; - } - - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - /** A shortcut. Returns true if we got indels within the specified interval in single and only window context - * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls) - * - */ - private boolean indelsPresentInInterval(long start, long stop) { - if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop); - return tumor_context.hasIndelsInInterval(start,stop) || - normal_context.hasIndelsInInterval(start,stop); - } - /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken. - * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted - * shift position. If there is such an indel, - * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted - * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel - * is preserved. 
This method tries thie strategy 4 times (so that it would never walk away too far to the left), and if it fails to find - * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up, - * go back to the original requested shift position and try finding the first shift poisition that has no indel associated with it. - */ - - private long adjustPosition(long request) { - long initial_request = request; - int attempts = 0; - boolean failure = false; - while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) { - request -= NQS_WIDTH; - if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); - attempts++; - if ( attempts == 4 ) { - if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ; - failure = true; - break; - } - } - - if ( failure ) { - // we tried 4 times but did not find a good shift position that would preserve full nqs window - // around all indels. let's fall back and find any shift position as long and there's no indel at the very - // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it - request = initial_request; - attempts = 0; - while ( indelsPresentInInterval(request,request+1) ) { - request--; - if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); - attempts++; - if ( attempts == 50 ) { - System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+ - ". 
Can not find a break to shift context window to; no calls will be attempted in the current window."); - return -1; - } - } - } - if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request); - return request; - } - - /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed - * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position - * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. - * - * @param position - */ - private void emit_somatic(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - tumor_context.shift((int)(position-tumor_context.getStart())); - return; - } - long move_to = adjustedPosition; - - if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop()); - - for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. 
vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach tjis point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } -// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor - - if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos); - - IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH); - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - - if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage - } - if ( normalCall.getCoverage() < minNormalCoverage && ! 
genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage - } - - if ( DEBUG ) { - System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); - System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos); - } - - long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() ); - long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - tumor_context.shift((int)(adjustedPosition-tumor_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right - -// location = getToolkit().getGenomeLocParser().setStart(location,pos); -// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data - - boolean haveCall = tumorCall.isCall(); // 
cache the value - - if ( haveCall || genotype ) { - if ( haveCall ) tumorCallsMade++; - - printVCFLine(vcf_writer,normalCall,tumorCall); - - if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); - - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall ); - lastGenotypedPosition = pos; - } - tumor_context.indelsAt(pos).clear(); - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - tumor_context.shift((int)(move_to - tumor_context.getStart() ) ); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) { - StringBuilder fullRecord = new StringBuilder(); - if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) { - fullRecord.append(tumorCall.makeEventString()); - } else { - fullRecord.append(normalCall.makeEventString()); - } - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("N_")); - fullRecord.append('\t'); - fullRecord.append(tumorCall.makeStatsString("T_")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String makeFullRecord(IndelPrecall normalCall) { - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(normalCall.makeEventString()); - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String getAnnotationString(RODRecordList ann) { - if ( ann == null ) return annGenomic; - else { - 
StringBuilder b = new StringBuilder(); - - if ( RefSeqFeature.isExon(ann) ) { - if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence - else b.append(annUTR); // exon but not coding = UTR - } else { - if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron - else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... - } - b.append('\t'); - b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed -// while ( it.hasNext() ) { // -// t.getGeneName() -// } - return b.toString(); - } - - } - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall)); - fullRecord.append(annotationString); - if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - - } - - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? 
"" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall,tumorCall)); - - if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) { - // did not observe anything - if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); - else { - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor - else { - // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal, - // this still could be a somatic event. so either way it is 'unknown' - fullRecord.append("UNKNOWN"); - } - } - - } - - if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) { - // looks like somatic call - if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal - else { - // low coverage in normal - fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) { - // it's likely germline (with missing observation in tumor - maybe loh? - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor - else { - // low coverage in tumor, maybe we missed the event - fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal... - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) { - // events in both T/N, got to be germline! - fullRecord.append("GERMLINE"); - } - - - fullRecord.append('\t'); - fullRecord.append(annotationString); - - if ( ! 
tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - } - - public void printVCFLine(VCFWriter vcf, IndelPrecall call) { - - long start = call.getPosition()-1; - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - long stop = start; - - List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site - List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype - - - if ( call.getVariant() == null ) { - // we will need to cteate genotype with two (hom) ref alleles (below). - // we can not use 'alleles' list here, since that list is supposed to contain - // only *distinct* alleles observed at the site or VCFContext will frown upon us... - alleles.add( Allele.create(refBases[(int)start-1],true) ); - homref_alleles = new ArrayList(2); - homref_alleles.add( alleles.get(0)); - homref_alleles.add( alleles.get(0)); - } else { - // we always create alt allele when we observe anything but the ref, even if it is not a call! - // (Genotype will tell us whether it is an actual call or not!) 
- int event_length = call.getVariant().lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - fillAlleleList(alleles,call); - stop += event_length; - } - - Map genotypes = new HashMap(); - - for ( String sample : normalSamples ) { - - Map attrs = call.makeStatsAttributes(null); - - if ( call.isCall() ) // we made a call - put actual het genotype here: - genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); - else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) - genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); - - } - Set filters = null; - if ( call.getVariant() != null && ! call.isCall() ) { - filters = new HashSet(); - filters.add("NoCall"); - } - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, null); - vcf.add(vc,refBases[(int)start-1]); - } - - /** Fills l with appropriate alleles depending on whether call is insertion or deletion - * (l MUST have a variant or this method will crash). It is guaranteed that the *first* allele added - * to the list is ref, and the next one is alt. - * @param l - * @param call - */ - private void fillAlleleList(List l, IndelPrecall call) { - int event_length = call.getVariant().lengthOnRef(); - if ( event_length == 0 ) { // insertion - - l.add( Allele.create(Allele.NULL_ALLELE_STRING,true) ); - l.add( Allele.create(call.getVariant().getBases(), false )); - - } else { //deletion: - l.add( Allele.create(call.getVariant().getBases(), true )); - l.add( Allele.create(Allele.NULL_ALLELE_STRING,false) ); - } - } - - public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) { - - long start = tCall.getPosition()-1; - long stop = start; - - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. 
- // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - Map attrsNormal = nCall.makeStatsAttributes(null); - Map attrsTumor = tCall.makeStatsAttributes(null); - - Map attrs = new HashMap(); - - boolean isSomatic = false; - if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) { - isSomatic = true; - attrs.put(VCFConstants.SOMATIC_KEY,true); - } - List alleles = new ArrayList(2); // all alleles at the site - // List normal_alleles = null; // all alleles at the site - List homRefAlleles = null; - -// if ( nCall.getVariant() == null || tCall.getVariant() == null ) { - homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls -// } - boolean homRefT = ( tCall.getVariant() == null ); - boolean homRefN = ( nCall.getVariant() == null ); - if ( tCall.getVariant() == null && nCall.getVariant() == null) { - // no indel at all ; create base-representation ref/ref alleles for genotype construction - alleles.add( Allele.create(refBases[(int)start-1],true) ); - } else { - // we got indel(s) - int event_length = 0; - if ( tCall.getVariant() != null ) { - // indel in tumor - event_length = tCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, tCall); - } else { - event_length = nCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, nCall); - } - if ( event_length > 0 ) stop += event_length; - } - homRefAlleles.add( alleles.get(0)); - homRefAlleles.add( alleles.get(0)); - - Map genotypes = new HashMap(); - - for ( String sample : normalSamples ) { - genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false)); - } - - for ( String sample : tumorSamples ) { - genotypes.put(sample,new Genotype(sample, homRefT ? 
homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) ); - } - - Set filters = null; - if ( tCall.getVariant() != null && ! tCall.isCall() ) { - filters = new HashSet(); - filters.add("NoCall"); - } - if ( nCall.getCoverage() < minNormalCoverage ) { - if ( filters == null ) filters = new HashSet(); - filters.add("NCov"); - } - if ( tCall.getCoverage() < minCoverage ) { - if ( filters == null ) filters = new HashSet(); - filters.add("TCov"); - } - - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, attrs); - vcf.add(vc,refBases[(int)start-1]); - } - - @Override - public void onTraversalDone(Integer result) { - if ( DEBUG ) { - System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop()); - } - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); // emit everything we might have left - - if ( metricsWriter != null ) { - metricsWriter.println(String.format("Normal calls made %d", normalCallsMade)); - metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade)); - metricsWriter.close(); - } - - try { - if ( bedWriter != null ) bedWriter.close(); - if ( verboseWriter != null ) verboseWriter.close(); - } catch (IOException e) { - System.out.println("Failed to close output BED file gracefully, data may be lost"); - e.printStackTrace(); - } - super.onTraversalDone(result); - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value == -1 ) { - onTraversalDone(sum); - System.exit(1); - } - sum += value; - return sum; - } - - @Override - public Integer reduceInit() { - return new Integer(0); - } - - - static class IndelVariant { - public static enum Type { I, D}; - private String bases; - private Type type; - private ArrayList fromStartOffsets = null; - private ArrayList fromEndOffsets = null; - - private Set reads = new HashSet(); // keep track of reads 
that have this indel - private Set samples = new HashSet(); // which samples had the indel described by this object - - public IndelVariant(ExpandedSAMRecord read , Type type, String bases) { - this.type = type; - this.bases = bases.toUpperCase(); - addObservation(read); - fromStartOffsets = new ArrayList(); - fromEndOffsets = new ArrayList(); - } - - /** Adds another observation for the current indel. It is assumed that the read being registered - * does contain the observation, no checks are performed. Read's sample is added to the list of samples - * this indel was observed in as well. - * @param read - */ - public void addObservation(ExpandedSAMRecord read) { - if ( reads.contains(read) ) { - //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam - // seeing the same read again can mean only one thing: the input bam file is corrupted and contains - // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector - // so this is a short-term patch: don't cry, but just ignore the duplicate record - - //throw new StingException("Attempting to add indel observation that was already registered"); - return; - } - reads.add(read); - String sample = null; - if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample(); - if ( sample != null ) samples.add(sample); - } - - - /** Returns length of the event on the reference (number of deleted bases - * for deletions, -1 for insertions. 
- * @return - */ - public int lengthOnRef() { - if ( type == Type.D ) return bases.length(); - else return 0; - } - - - public void addSample(String sample) { - if ( sample != null ) - samples.add(sample); - } - - public void addReadPositions(int fromStart, int fromEnd) { - fromStartOffsets.add(fromStart); - fromEndOffsets.add(fromEnd); - } - - public List getOffsetsFromStart() { return fromStartOffsets ; } - public List getOffsetsFromEnd() { return fromEndOffsets; } - - public String getSamples() { - StringBuffer sb = new StringBuffer(); - Iterator i = samples.iterator(); - while ( i.hasNext() ) { - sb.append(i.next()); - if ( i.hasNext() ) - sb.append(","); - } - return sb.toString(); - } - - public Set getReadSet() { return reads; } - - public int getCount() { return reads.size(); } - - public String getBases() { return bases; } - - public Type getType() { return type; } - - @Override - public boolean equals(Object o) { - if ( ! ( o instanceof IndelVariant ) ) return false; - IndelVariant that = (IndelVariant)o; - return ( this.type == that.type && this.bases.equals(that.bases) ); - } - - public boolean equals(Type type, String bases) { - return ( this.type == type && this.bases.equals(bases.toUpperCase()) ); - } - } - - /** - * Utility class that encapsulates the logic related to collecting all the stats and counts required to - * make (or discard) a call, as well as the calling heuristics that uses those data. 
- */ - class IndelPrecall { -// private boolean DEBUG = false; - private int NQS_MISMATCH_CUTOFF = 1000000; - private double AV_MISMATCHES_PER_READ = 1.5; - - private int nqs = 0; - private IndelVariant consensus_indel = null; // indel we are going to call - private long pos = -1 ; // position on the ref - private int total_coverage = 0; // total number of reads overlapping with the event - private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed - private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position - - private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel - private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size) - private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window - private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window - - private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only - private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only - private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only - private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only - - - private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only - private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only - private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of 
qualitites of bases in nqs window from consensus indel-containing reads only - private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only - - - private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel - private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position - private double all_read_total_mm = 0.0; // sum of all mismatches in all reads - - private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel - private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position - private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads - - private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int(); - - private int from_start_median = 0; - private int from_start_mad = 0; - private int from_end_median = 0; - private int from_end_mad = 0; - - /** Makes an empty call (no-call) with all stats set to 0 - * - * @param position - */ - public IndelPrecall(long position) { - this.pos = position; - } - - public IndelPrecall(WindowContext context, long position, int nqs_width) { - this.pos = position; - this.nqs = nqs_width; - total_coverage = context.coverageAt(pos,true); - List variants = context.indelsAt(pos); - findConsensus(variants); - - // pos is the first base after the event: first deleted base or first base after insertion. 
- // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion - // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion. - // we initialize the nqs window for no-event/insertion case - long left = Math.max( pos-nqs, context.getStart() ); - long right = Math.min(pos+nqs-1, context.getStop()); -//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); - Iterator read_iter = context.getReads().iterator(); - - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord rec = read_iter.next(); - SAMRecord read = rec.getSAMRecord(); - byte[] flags = rec.getExpandedMMFlags(); - byte[] quals = rec.getExpandedQuals(); - int mm = rec.getMMCount(); - - - if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; - - long local_right = right; // end of nqs window for this particular read. May need to be advanced further right - // if read has a deletion. 
The gap in the middle of nqs window will be skipped - // automatically since flags/quals are set to -1 there - - boolean read_has_a_variant = false; - boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) ); - for ( IndelVariant v : variants ) { - if ( v.getReadSet().contains(rec) ) { - read_has_a_variant = true; - local_right += v.lengthOnRef(); - break; - } - } - - if ( read_has_consensus ) { - consensus_indel_read_total_mm += mm; - consensus_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; - else consensus_indel_read_orientation_cnt.first++; - } - if ( read_has_a_variant ) { - all_indel_read_total_mm += mm; - all_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; - else all_indel_read_orientation_cnt.first++; - } - - all_read_total_mm+= mm; - all_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; - else all_read_orientation_cnt.first++; - - for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0); - pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1); - pos_in_flags++) { - - if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref - total_bases_in_nqs_window++; - if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++; - if ( read_has_a_variant ) indel_read_bases_in_nqs_window++; - - if ( quals[pos_in_flags] != -1 ) { - - total_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( flags[pos_in_flags] == 1 ) { // it's a mismatch - total_mismatches_in_nqs_window++; - 
total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - - if ( read_has_consensus ) { - consensus_indel_read_mismatches_in_nqs_window++; - consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( read_has_a_variant ) { - indel_read_mismatches_in_nqs_window++; - indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - } - } -// if ( pos == 3534096 ) { -// System.out.println(read.getReadName()); -// System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window); -// System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window); -// } - - } - - // compute median/mad for offsets from the read starts/ends - if ( consensus_indel != null ) { - from_start_median = median(consensus_indel.getOffsetsFromStart()) ; - from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median); - from_end_median = median(consensus_indel.getOffsetsFromEnd()) ; - from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median); - } - } - - /** As a side effect will sort l! - * - * @param l - * @return - */ - private int median(List l) { - Collections.sort(l); - int k = l.size()/2; - return ( l.size() % 2 == 0 ? - (l.get(k-1).intValue()+l.get(k).intValue())/2 : - l.get(k).intValue()); - } - - private int median(int[] l) { - Arrays.sort(l); - int k = l.length/2; - return ( l.length % 2 == 0 ? 
- (l[k-1]+l[k])/2 : - l[k]); - } - - private int mad(List l, int med) { - int [] diff = new int[l.size()]; - for ( int i = 0; i < l.size(); i++ ) { - diff[i] = Math.abs(l.get(i).intValue() - med); - } - return median(diff); - } - - public long getPosition() { return pos; } - - public boolean hasObservation() { return consensus_indel != null; } - - public int getCoverage() { return total_coverage; } - - public double getTotalMismatches() { return all_read_total_mm; } - public double getConsensusMismatches() { return consensus_indel_read_total_mm; } - public double getAllVariantMismatches() { return all_indel_read_total_mm; } - - /** Returns average number of mismatches per consensus indel-containing read */ - public double getAvConsensusMismatches() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */ - public double getAvRefMismatches() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 ); - } - - public PrimitivePair.Int getConsensusStrandCounts() { - return consensus_indel_read_orientation_cnt; - } - - public PrimitivePair.Int getRefStrandCounts() { - return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first, - all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second); - } - - /** Returns a sum of mapping qualities of all reads spanning the event. */ - public double getTotalMapq() { return all_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */ - public double getConsensusMapq() { return consensus_indel_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. 
*/ - public double getAllVariantMapq() { return all_indel_read_total_mapq; } - - /** Returns average mapping quality per consensus indel-containing read. */ - public double getAvConsensusMapq() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants). */ - public double getAvRefMapq() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 ); - } - - /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusMMRate() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefMMRate() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases; - } - - /** Returns average base quality in NQS window around the indel, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. 
for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusAvQual() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefAvQual() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases; - } - - public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; } - - public int getAllVariantCount() { return all_indel_count; } - public int getConsensusVariantCount() { return consensus_indel_count; } - -// public boolean failsNQSMismatch() { -// //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used! -// return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) || -// ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ ); -// } - - public IndelVariant getVariant() { return consensus_indel; } - - public boolean isCall() { - boolean ret = ( consensus_indel_count >= minIndelCount && - (double)consensus_indel_count > minFraction * total_coverage && - (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage); - if ( DEBUG && ! 
ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+ - " total_count="+all_indel_count+" cov="+total_coverage+ - " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ - " minF="+((double)consensus_indel_count)/total_coverage); - return ret; - - } - - /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed - * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) - * @param variants - * @return - */ - private void findConsensus(List variants) { - for ( IndelVariant var : variants ) { - if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); - int cnt = var.getCount(); - all_indel_count +=cnt; - if ( cnt > consensus_indel_count ) { - consensus_indel = var; - consensus_indel_count = cnt; - } - } - if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+ - " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count); - } - - - - public void printBedLine(Writer bed) { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - - StringBuffer message = new StringBuffer(); - message.append(refName+"\t"+(pos-1)+"\t"); - message.append((pos-1+event_length)+"\t"); - if ( consensus_indel != null ) { - message.append((event_length>0? 
"-":"+")+consensus_indel.getBases()); - } else { - message.append('.'); - } - message.append(":"+all_indel_count+"/"+total_coverage); - try { - bed.write(message.toString()+"\n"); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e); - } - } - - public String makeEventString() { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - StringBuffer message = new StringBuffer(); - message.append(refName); - message.append('\t'); - message.append(pos-1); - message.append('\t'); - message.append(pos-1+event_length); - message.append('\t'); - if ( consensus_indel != null ) { - message.append((event_length>0?'-':'+')); - message.append(consensus_indel.getBases()); - } else { - message.append('.'); - } - return message.toString(); - } - - public String makeStatsString(String prefix) { - StringBuilder message = new StringBuilder(); - message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage()); - message.append('\t'); - message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(), - getAvRefMismatches())); - message.append('\t'); - message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(), - getAvRefMapq())); - message.append('\t'); - message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate())); - message.append('\t'); - message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual())); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - message.append('\t'); - 
message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second); - - message.append('\t'); - message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad); - message.append('\t'); - message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad); - - return message.toString(); - - } - - /** - * Places alignment statistics into attribute map and returns the map. If attr parameter is null, - * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that - * preexisting map, and the same instance of the (updated) map is returned. - * - * @param attr - * @return - */ - public Map makeStatsAttributes(Map attr) { - if ( attr == null ) attr = new HashMap(); - - VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr); - - VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr); - - VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr); - - VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr); - - VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr); - - VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr); - - VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - - VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); - return attr; - } - } - - interface IndelListener { - public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r); - } - - class WindowContext implements IndelListener { - private Set reads; - private int start=0; // where the window starts on the ref, 1-based - private CircularArray< List< 
IndelVariant > > indels; - - private List emptyIndelList = new ArrayList(); - - - public WindowContext(int start, int length) { - this.start = start; - indels = new CircularArray< List >(length); -// reads = new LinkedList(); - reads = new HashSet(); - } - - /** Returns 1-based reference start position of the interval this object keeps context for. - * - * @return - */ - public int getStart() { return start; } - - /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for. - * - * @return - */ - public int getStop() { return start + indels.length() - 1; } - - /** Resets reference start position to 0 and clears the context. - * - */ - public void clear() { - start = 0; - reads.clear(); - indels.clear(); - } - - /** - * Returns true if any indel observations are present in the specified interval - * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the - * current context window: positions outside of the window will be ignored. - * @param begin - * @param end - */ - public boolean hasIndelsInInterval(long begin, long end) { - for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) { - if ( indelsAt(k) != emptyIndelList ) return true; - } - return false; - } - - public Set getReads() { return reads; } - - /** Returns the number of reads spanning over the specified reference position - * (regardless of whether they have a base or indel at that specific location). - * The second argument controls whether to count with indels in mind (this is relevant for insertions only, - * deletions do not require any special treatment since they occupy non-zero length on the ref and since - * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions - * to the reference position right after the actual event, and we count all events assigned to a given position. 
- * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads - * that support the ref rather than the indel. Few special cases may occur here: - * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its - * last element: we have to count that read into the "coverage" at the current position for the purposes of indel - * assessment, as the indel in that read will be counted at the current position, so the total coverage - * should be consistent with that. - */ - /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion - * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless - * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and - * read should be counted). For deletions, reads starting at the current position should always be counted (as they - * show no deletion=ref). - * @param refPos position on the reference; must be within the bounds of the window - */ - public int coverageAt(final long refPos, boolean countForIndels) { - int cov = 0; - for ( ExpandedSAMRecord read : reads ) { - if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) { - if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) { - Cigar c = read.getSAMRecord().getCigar(); - if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++; - } - continue; - } - cov++; - } - return cov; - } - - - /** Shifts current window to the right along the reference contig by the specified number of bases. - * The context will be updated accordingly (indels and reads that go out of scope will be dropped). 
- * @param offset - */ - public void shift(int offset) { - start += offset; - - indels.shiftData(offset); - if ( indels.get(0) != null && indels.get(0).size() != 0 ) { - IndelVariant indel = indels.get(0).get(0); - - System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+ - (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored"); - indels.get(0).clear(); -// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ -// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()); - } - - Iterator read_iter = reads.iterator(); - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord r = read_iter.next(); - if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope - read_iter.remove(); - } - } - } - - public void add(SAMRecord read, byte [] ref) { - - if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start - - ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this); - //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed - if ( reads.contains(er)) return; // ignore duplicate records - reads.add(er); - } - - public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) { - List indelsAtSite; - try { - indelsAtSite = indels.get(pos); - } catch (IndexOutOfBoundsException e) { - SAMRecord r = rec.getSAMRecord(); - System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+ - r.getReadName()+": "+ - "read 
length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ - r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ - "; window end="+getStop()); - throw e; - } - - if ( indelsAtSite == null ) { - indelsAtSite = new ArrayList(); - indels.set(pos, indelsAtSite); - } - - IndelVariant indel = null; - for ( IndelVariant v : indelsAtSite ) { - if ( ! v.equals(type, bases) ) continue; - - indel = v; - indel.addObservation(rec); - break; - } - - if ( indel == null ) { // not found: - indel = new IndelVariant(rec, type, bases); - indelsAtSite.add(indel); - } - indel.addReadPositions(fromStart,fromEnd); - } - - public List indelsAt( final long refPos ) { - List l = indels.get((int)( refPos - start )); - if ( l == null ) return emptyIndelList; - else return l; - } - - - } - - - class ExpandedSAMRecord { - private SAMRecord read; - private byte[] mismatch_flags; - private byte[] expanded_quals; - private int mms; - - public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) { - - read = r; - final long rStart = read.getAlignmentStart(); - final long rStop = read.getAlignmentEnd(); - final byte[] readBases = read.getReadString().toUpperCase().getBytes(); - - ref = new String(ref).toUpperCase().getBytes(); - - mismatch_flags = new byte[(int)(rStop-rStart+1)]; - expanded_quals = new byte[(int)(rStop-rStart+1)]; - - // now let's extract indels: - - Cigar c = read.getCigar(); - final int nCigarElems = c.numCigarElements(); - - - int readLength = 0; // length of the aligned part of the read NOT counting clipped bases - for ( CigarElement cel : c.getCigarElements() ) { - - switch(cel.getOperator()) { - case H: - case S: - case D: - case N: - case P: - break; // do not count gaps or clipped bases - case I: - case M: - readLength += cel.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator()); - } - } - 
- int fromStart = 0; - int posOnRead = 0; - int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: - // its start on the actual full reference contig is r.getAlignmentStart() - for ( int i = 0 ; i < nCigarElems ; i++ ) { - - final CigarElement ce = c.getCigarElement(i); - IndelVariant.Type type = null; - String indel_bases = null; - int eventPosition = posOnRef; - - switch(ce.getOperator()) { - case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element... - case I: - type = IndelVariant.Type.I; - indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength()); - // will increment position on the read below, there's no 'break' statement yet... - case S: - // here we also skip soft-clipped indel_bases on the read; according to SAM format specification, - // alignment start position on the reference points to where the actually aligned - // (not clipped) indel_bases go, so we do not need to increment reference position here - posOnRead += ce.getLength(); - break; - case D: - type = IndelVariant.Type.D; - indel_bases = new String( ref, posOnRef, ce.getLength() ); - for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1; - - break; - case M: - for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { - if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! - mms++; - mismatch_flags[posOnRef] = 1; - } - expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead]; - } - fromStart += ce.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); - } - - if ( type == null ) continue; // element was not an indel, go grab next element... - - // we got an indel if we are here... 
- if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName()); - if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName()); - - // note that here we will be assigning indels to the first deleted base or to the first - // base after insertion, not to the last base before the event! - int fromEnd = readLength - fromStart; - if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength(); - - l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this); - - if ( type == IndelVariant.Type.I ) fromStart += ce.getLength(); - - } - } - - public SAMRecord getSAMRecord() { return read; } - - public byte [] getExpandedMMFlags() { return mismatch_flags; } - - public byte [] getExpandedQuals() { return expanded_quals; } - - public int getMMCount() { return mms; } - - public boolean equals(Object o) { - if ( this == o ) return true; - if ( read == null ) return false; - if ( o instanceof SAMRecord ) return read.equals(o); - if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read); - return false; - } - - - } - -} - - -class VCFIndelAttributes { - public static String ALLELIC_DEPTH_KEY = "AD"; - public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; - - public static String MAPQ_KEY = "MQS"; - - public static String MM_KEY = "MM"; - - public static String NQS_MMRATE_KEY = "NQSMM"; - - public static String NQS_AVQ_KEY = "NQSBQ"; - - public static String STRAND_COUNT_KEY = "SC"; - public static String RSTART_OFFSET_KEY = "RStart"; - public static String REND_OFFSET_KEY = "REnd"; - - public static Set getAttributeHeaderLines() { - Set lines = new HashSet(); - - lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus indel/reference at the site")); - lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); - - lines.add(new 
VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of consensus indel-supporting reads/reference-supporting reads")); - - lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per consensus indel-supporting read/per reference-supporting read")); - - lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads")); - - lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads")); - - lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads")); - - lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); - lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); - - return lines; - } - - public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) { - attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} ); - return attrs; - } - - public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { - attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_cons, cnt_indel} ); - attrs.put(DEPTH_TOTAL_KEY, cnt_total); - return attrs; - } - - public static Map recordAvMapQ(double cons, double ref, Map attrs) { - attrs.put(MAPQ_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordAvMM(double cons, double ref, Map attrs) { - attrs.put(MM_KEY, new Float[] {(float)cons, 
(float)ref} ); - return attrs; - } - - public static Map recordNQSMMRate(double cons, double ref, Map attrs) { - attrs.put(NQS_MMRATE_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordNQSAvQ(double cons, double ref, Map attrs) { - attrs.put(NQS_AVQ_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordOffsetFromStart(int median, int mad, Map attrs) { - attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } - - public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { - attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.filters.Platform454Filter; +import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; +import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; +import org.broadinstitute.sting.utils.codecs.refseq.Transcript; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.CircularArray; +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.exceptions.StingException; +import 
org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.*; + + +/** + * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well, + * but this latter functionality is now superceded by UnifiedGenotyper. + * + *

+ * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferably MSA cleaned) sequencing + * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs + * include additional statistics such as mismatches and base qualities around the calls, read strandness (how many + * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional + * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will + * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional + * metrics for the post-processing tools to make the final decision). The calls are performed by default + * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I + * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments, + * respectively. Indels are called from the tumor sample and annotated as germline + * if even weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic + * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling + * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold + * in tumor sample, it will not be reported. + * + * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input + * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged + * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups). + * + *

Input

+ *

+ * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). + *

+ * + *

Output

+ *

+ * Indel calls with associated metrics. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SomaticIndelDetector \
+ *   -o indels.vcf \
+ *   -verbose indels.txt \
+ *   -I:normal normal.bam \
+ *   -I:tumor tumor.bam
+ * 
+ * + */ + +@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class}) +public class SomaticIndelDetectorWalker extends ReadWalker { +// @Output +// PrintStream out; + @Output(doc="File to write variants (indels) in VCF format",required=true) + protected VCFWriter vcf_writer = null; + + @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) + @Deprecated + java.io.File output_file; + + @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false) + public PrintStream metricsWriter = null; + +// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) +// boolean FORMAT_VCF = false; + + @Hidden + @Argument(fullName = "genotype_intervals", shortName = "genotype", + doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) + public String genotypeIntervalsFile = null; + + @Hidden + @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, + doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ + "if the list turns out to be unsorted, it will throw an exception. 
"+ + "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ + "to sort and keep it in memory (increases memory usage!).") + protected boolean GENOTYPE_NOT_SORTED = false; + + @Hidden + @Argument(fullName="unpaired", shortName="unpaired", + doc="Perform unpaired calls (no somatic status detection)", required=false) + boolean call_unpaired = false; + boolean call_somatic ; + + @Argument(fullName="verboseOutput", shortName="verbose", + doc="Verbose output file in text format", required=false) + java.io.File verboseOutput = null; + + @Argument(fullName="bedOutput", shortName="bed", + doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) + java.io.File bedOutput = null; + + @Argument(fullName="minCoverage", shortName="minCoverage", + doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ + "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false) + int minCoverage = 6; + + @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", + doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+ + "or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) + int minNormalCoverage = 4; + + @Argument(fullName="minFraction", shortName="minFraction", + doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ + " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) + double minFraction = 0.3; + + @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", + doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+ + "all indel observations at the site exceeds this threshold", required=false) + double minConsensusFraction = 
0.7; + + @Argument(fullName="minIndelCount", shortName="minCnt", + doc="Minimum count of reads supporting consensus indel required for making the call. "+ + " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ + "(minIndelCount not met) will not pass.", required=false) + int minIndelCount = 0; + + @Argument(fullName="refseq", shortName="refseq", + doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+ + "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) + String RefseqFileName = null; + +//@Argument(fullName="blacklistedLanes", shortName="BL", +// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ +// "by this application, so they will not contribute indels to consider and will not be counted.", required=false) +//PlatformUnitFilterHelper dummy; + + @Hidden + @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on", + required=false) Boolean DEBUG = false; + @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ + "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+ + "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+ + "than the window size. 
Reads that do not fit will be ignored, so long deletions can not be called "+ + "if window is too small",required=false) int WINDOW_SIZE = 200; + @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ + " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; + + + + private WindowContext tumor_context; + private WindowContext normal_context; + private int currentContigIndex = -1; + private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends... + private int currentPosition = -1; // position of the last read we've seen on the current contig + private String refName = null; + private java.io.Writer output = null; + private GenomeLoc location = null; + private long normalCallsMade = 0L, tumorCallsMade = 0L; + + boolean outOfContigUserWarned = false; + + private LocationAwareSeekableRODIterator refseqIterator=null; + +// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able +// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream + private Set normalSamples; // we are going to remember which samples are normal and which are tumor: + private Set tumorSamples ; // these are used only to generate genotypes for vcf output + + private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics + + private Writer bedWriter = null; + private Writer verboseWriter = null; + + + private static String annGenomic = "GENOMIC"; + private static String annIntron = "INTRON"; + private static String annUTR = "UTR"; + private static String annCoding = "CODING"; + private static String annUnknown = "UNKNOWN"; + + enum CallType { + NOCOVERAGE, + BADCOVERAGE, + NOEVIDENCE, + GERMLINE, + SOMATIC + }; + + private SAMRecord lastRead; + private byte[] refBases; + private 
ReferenceDataSource refData; + private Iterator genotypeIntervalIterator = null; + + // the current interval in the list of intervals, for which we want to do full genotyping + private GenomeLoc currentGenotypeInterval = null; + private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; + // can be 1 base before lastGenotyped start + + + // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" + + private Set getVCFHeaderInfo() { + Set headerInfo = new HashSet(); + + // first, the basic info + headerInfo.add(new VCFHeaderLine("source", "IndelGenotyperV2")); + headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + // FORMAT and INFO fields +// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); + + headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines()); + if ( call_somatic ) { + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + } else { + } + + // all of the arguments from the argument collection + Set args = new HashSet(); + args.add(this); + args.addAll(getToolkit().getFilters()); + Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); + for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) + headerInfo.add(new VCFHeaderLine(String.format("IGv2_%s", commandLineArg.getKey()), commandLineArg.getValue())); + // also, the list of input bams + for ( String fileName : getToolkit().getArguments().samFiles ) + headerInfo.add(new VCFHeaderLine("IGv2_bam_file_used", fileName)); + + return headerInfo; + } + + + @Override + public void initialize() { + + call_somatic = (call_unpaired ? 
false : true); + normal_context = new WindowContext(0,WINDOW_SIZE); + normalSamples = new HashSet(); + + if ( bedOutput != null && output_file != null ) { + throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time"); + } + + if ( RefseqFileName != null ) { + logger.info("Using RefSeq annotations from "+RefseqFileName); + + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + getToolkit().getArguments().unsafe); + RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); + + refseqIterator = new SeekableRODIterator(refseq.getHeader(), + refseq.getSequenceDictionary(), + getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + refseq.getIterator()); + } + + if ( refseqIterator == null ) logger.info("No gene annotations available"); + + int nSams = getToolkit().getArguments().samFiles.size(); + + if ( call_somatic ) { + if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified"); + tumor_context = new WindowContext(0,WINDOW_SIZE); + tumorSamples = new HashSet(); + } + + int nNorm = 0; + int nTum = 0; + for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) { + Tags tags = rid.getTags() ; + if ( tags.getPositionalTags().isEmpty() && call_somatic ) + throw new UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. 
Untagged file: "+ + getToolkit().getSourceFileForReaderID(rid)); + boolean normal = false; + boolean tumor = false; + for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on + if ( "NORMAL".equals(s.toUpperCase()) ) { + normal = true; + nNorm++; + } + if ( "TUMOR".equals(s.toUpperCase()) ) { + tumor = true; + nTum++ ; + } + } + if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+ + getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??"); + if ( call_somatic && !normal && ! tumor ) + throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+ + getToolkit().getSourceFileForReaderID(rid)); + if ( ! call_somatic && (normal || tumor) ) + System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid) + +" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED"); + if ( call_somatic && tumor ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { + tumorSamples.add(rg.getSample()); + } + } else { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { + normalSamples.add(rg.getSample()); + } + } + if ( genotypeIntervalsFile != null ) { + + if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). 
+ genotypeIntervalIterator = new IntervalFileMergingIterator(getToolkit().getGenomeLocParser(), + new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + } else { + // read in the whole list of intervals for cleaning + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), + IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(genotypeIntervalsFile),true), IntervalMergingRule.OVERLAPPING_ONLY); + genotypeIntervalIterator = locs.iterator(); + } + + // wrap intervals requested for genotyping inside overlapping iterator, so that we actually + // genotype only on the intersections of the requested intervals with the -L intervals + genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() ); + + currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; + + if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval); + + if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1; + } + + } + + location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); + + normalSamples = getToolkit().getSamplesByReaders().get(0); + + try { + // we already checked that bedOutput and output_file are not set simultaneously + if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); + if ( output_file != null ) bedWriter = new FileWriter(output_file); + } catch (java.io.IOException e) { + throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e); + } + try { + if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput); + } catch (java.io.IOException e) { + throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e); + } + + 
vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; + refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); + } + + + @Override + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + + // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); + + if ( DEBUG ) { + // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ + // "("+read.getCigarString()+")"); + if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); + } + + if ( AlignmentUtils.isReadUnmapped(read) || + read.getDuplicateReadFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getMappingQuality() == 0 ) { + return 0; // we do not need those reads! + } + + if ( read.getReferenceIndex() != currentContigIndex ) { + // we just jumped onto a new contig + if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName()); + if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); + + // print remaining indels from the previous contig (if any); + if ( call_somatic ) emit_somatic(1000000000, true); + else emit(1000000000,true); + + currentContigIndex = read.getReferenceIndex(); + currentPosition = read.getAlignmentStart(); + refName = new String(read.getReferenceName()); + + location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop()); + contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength(); + outOfContigUserWarned = false; + + lastGenotypedPosition = -1; + + normal_context.clear(); // reset coverage window; this will also set reference position to 0 + if ( call_somatic) 
tumor_context.clear(); + + refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes(); + } + + // we have reset the window to the new contig if it was required and emitted everything we collected + // on a previous contig. At this point we are guaranteed that we are set up properly for working + // with the contig of the current read. + + // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and + // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe + + if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+ + "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition + +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" + +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); + + currentPosition = read.getAlignmentStart(); + lastRead = read; + + if ( read.getAlignmentEnd() > contigLength ) { + if ( ! 
outOfContigUserWarned ) { + System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped"); + outOfContigUserWarned = true; + } + return 0; + } + + long alignmentEnd = read.getAlignmentEnd(); + Cigar c = read.getCigar(); + int lastNonClippedElement = 0; // reverse offset to the last unclipped element + CigarOperator op = null; + // moving backwards from the end of the cigar, skip trailing S or H cigar elements: + do { + lastNonClippedElement++; + op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator(); + } while ( op == CigarOperator.H || op == CigarOperator.S ); + + // now op is the last non-S/H operator in the cigar. + + // a little trick here: we want to make sure that current read completely fits into the current + // window so that we can accumulate indel observations over the whole length of the read. + // The ::getAlignmentEnd() method returns the last position on the reference where bases from the + // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end + // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, + // in this program we assign insertions, internally, to the first base *after* the insertion position. + // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. + + if ( op == CigarOperator.I) alignmentEnd++; + + if ( alignmentEnd > normal_context.getStop()) { + + // we don't emit anything until we reach a read that does not fit into the current window. + // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to + // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions + // below the current read's start. 
+ // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting + // the window to around the read's start will ensure that the read fits... + + if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+ + read.getAlignmentStart()+": trying to emit and shift" ); + if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); + else emit( read.getAlignmentStart(), false ); + + // let's double check now that the read fits after the shift + if ( read.getAlignmentEnd() > normal_context.getStop()) { + // ooops, looks like the read does not fit into the window even after the latter was shifted!! + // we used to die over such reads and require user to run with larger window size. Now we + // just print a warning and discard the read (this means that our counts can be slightly off in + // th epresence of such reads) + //throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ + // "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ + // read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ + // "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); + System.out.println("WARNING: Read "+read.getReadName()+ + " is out of coverage window bounds. 
Probably window is too small and the window_size value must be increased.\n"+ + " The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+ + " Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ + read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ + "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); + return 1; + } + } + + if ( call_somatic ) { + + Tags tags = getToolkit().getReaderIDForRead(read).getTags(); + boolean assigned = false; + for ( String s : tags.getPositionalTags() ) { + if ( "NORMAL".equals(s.toUpperCase()) ) { + normal_context.add(read,ref.getBases()); + assigned = true; + break; + } + if ( "TUMOR".equals(s.toUpperCase()) ) { + tumor_context.add(read,ref.getBases()); + assigned = true; + break; + } + } + if ( ! assigned ) + throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ + "has no Normal/Tumor tag associated with it"); + +// String rg = (String)read.getAttribute("RG"); +// if ( rg == null ) +// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls."); + +// if ( normalReadGroups.contains(rg) ) { +// normal_context.add(read,ref.getBases()); +// } else if ( tumorReadGroups.contains(rg) ) { +// tumor_context.add(read,ref.getBases()); +// } else { +// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg); +// } + + if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) { + System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ + refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. 
The whole window will be dropped."); + tumor_context.shift(WINDOW_SIZE); + normal_context.shift(WINDOW_SIZE); + } + if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { + System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ + refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped"); + tumor_context.shift(WINDOW_SIZE); + normal_context.shift(WINDOW_SIZE); + } + + + } else { + normal_context.add(read, ref.getBases()); + if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { + System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ + refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped"); + normal_context.shift(WINDOW_SIZE); + } + } + + return 1; + } + + /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */ + private boolean pastInterval(long p, GenomeLoc l) { + return ( location.getContigIndex() > l.getContigIndex() || + location.getContigIndex() == l.getContigIndex() && p > l.getStop() ); + } + + /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to + * pos-1, inclusive. 
+ * @param contigIndex + * @param pos + * @param call + */ + /* + private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) { + + if ( contigIndex < currentGenotypeInterval.getContigIndex() || + contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return; + + if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) { + for ( long p = lastGenotypedPosition+1; p < pos; p++ ) { + + } + } + while( currentGenotypeInterval != null ) { + + while ( ) + if ( genotypeIntervalIterator.hasNext() ) { + currentGenotypeInterval = genotypeIntervalIterator.next() ; + if ( pastInterval(p,currentGenotypeInterval) ) { + // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there! + emitNoCoverageCalls(currentGenotypeInterval); + } + } else { + currentGenotypeInterval = null; + } + } + } +*/ + + /** Output indel calls up to the specified position and shift the window: after this method is executed, the + * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more + * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. 
+ * + * @param position + */ + private void emit(long position, boolean force) { + + long adjustedPosition = adjustPosition(position); + + if ( adjustedPosition == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(position-normal_context.getStart())); + return; + } + long move_to = adjustedPosition; + + for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) { + + boolean genotype = false; + // first let's see if we need to genotype current position: + + final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) + + if ( pos <= lastGenotypedPosition ) continue; + + while ( currentGenotypeInterval != null ) { + + // if we did not even reach next interval yet, no genotyping at current position: + if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || + location.getContigIndex() == currentGenotypeInterval.getContigIndex() && + p < currentGenotypeInterval.getStart() ) break; + if ( pastInterval(p, currentGenotypeInterval) ) { + // we are past current genotyping interval, so we are done with it; let's load next interval: + currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; + continue; // re-enter the loop to check against the interval we just loaded + } + + // we reach this point only if p is inside current genotyping interval; set the flag and bail out: + genotype = true; + break; + } + +// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); + + if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue; + + IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); + + if ( normalCall.getCoverage() < minCoverage && ! 
genotype ) { + if ( DEBUG ) { + System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); + } + continue; // low coverage + } + + if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); + + long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); + long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1; + + if ( right >= adjustedPosition && ! force) { + // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect + + // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; + // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage + move_to = adjustPosition(left); + if ( move_to == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(adjustedPosition-normal_context.getStart())); + return; + } + if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); + break; + } + + // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right: + if ( right > normal_context.getStop() ) right = normal_context.getStop(); + + // location = getToolkit().getGenomeLocParser().setStart(location,pos); + // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data + + location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); + + boolean haveCall = normalCall.isCall(); // cache the value + + if ( haveCall || genotype) { + if ( haveCall ) normalCallsMade++; + printVCFLine(vcf_writer,normalCall); + if ( bedWriter != null ) normalCall.printBedLine(bedWriter); + if ( verboseWriter != null ) 
printVerboseLine(verboseWriter, normalCall); + lastGenotypedPosition = pos; + } + + normal_context.indelsAt(pos).clear(); + // we dealt with this indel; don't want to see it again + // (we might otherwise in the case when 1) there is another indel that follows + // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) + +// for ( IndelVariant var : variants ) { +// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); +// } + } + + if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); + normal_context.shift((int)(move_to - normal_context.getStart() ) ); + } + + /** A shortcut. Returns true if we got indels within the specified interval in single and only window context + * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls) + * + */ + private boolean indelsPresentInInterval(long start, long stop) { + if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop); + return tumor_context.hasIndelsInInterval(start,stop) || + normal_context.hasIndelsInInterval(start,stop); + } + /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken. + * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted + * shift position. If there is such an indel, + * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted + * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel + * is preserved. 
This method tries thie strategy 4 times (so that it would never walk away too far to the left), and if it fails to find + * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up, + * go back to the original requested shift position and try finding the first shift poisition that has no indel associated with it. + */ + + private long adjustPosition(long request) { + long initial_request = request; + int attempts = 0; + boolean failure = false; + while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) { + request -= NQS_WIDTH; + if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); + attempts++; + if ( attempts == 4 ) { + if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ; + failure = true; + break; + } + } + + if ( failure ) { + // we tried 4 times but did not find a good shift position that would preserve full nqs window + // around all indels. let's fall back and find any shift position as long and there's no indel at the very + // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it + request = initial_request; + attempts = 0; + while ( indelsPresentInInterval(request,request+1) ) { + request--; + if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); + attempts++; + if ( attempts == 50 ) { + System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+ + ". 
Can not find a break to shift context window to; no calls will be attempted in the current window."); + return -1; + } + } + } + if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request); + return request; + } + + /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed + * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position + * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. + * + * @param position + */ + private void emit_somatic(long position, boolean force) { + + long adjustedPosition = adjustPosition(position); + if ( adjustedPosition == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(position-normal_context.getStart())); + tumor_context.shift((int)(position-tumor_context.getStart())); + return; + } + long move_to = adjustedPosition; + + if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop()); + + for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) { + + boolean genotype = false; + // first let's see if we need to genotype current position: + + final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. 
vcf) + + if ( pos <= lastGenotypedPosition ) continue; + + while ( currentGenotypeInterval != null ) { + + // if we did not even reach next interval yet, no genotyping at current position: + if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || + location.getContigIndex() == currentGenotypeInterval.getContigIndex() && + p < currentGenotypeInterval.getStart() ) break; + if ( pastInterval(p, currentGenotypeInterval) ) { + // we are past current genotyping interval, so we are done with it; let's load next interval: + currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; + continue; // re-enter the loop to check against the interval we just loaded + } + + // we reach tjis point only if p is inside current genotyping interval; set the flag and bail out: + genotype = true; + break; + } +// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); + + if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor + + if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos); + + IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH); + IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); + + if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { + if ( DEBUG ) { + System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); + } + continue; // low coverage + } + if ( normalCall.getCoverage() < minNormalCoverage && ! 
genotype ) { + if ( DEBUG ) { + System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); + } + continue; // low coverage + } + + if ( DEBUG ) { + System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); + System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos); + } + + long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() ); + long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1; + + if ( right >= adjustedPosition && ! force) { + // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect + + // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; + // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage + move_to = adjustPosition(left); + if ( move_to == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(adjustedPosition-normal_context.getStart())); + tumor_context.shift((int)(adjustedPosition-tumor_context.getStart())); + return; + } + if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); + break; + } + + if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right + +// location = getToolkit().getGenomeLocParser().setStart(location,pos); +// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data + + location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data + + boolean haveCall = tumorCall.isCall(); // 
cache the value + + if ( haveCall || genotype ) { + if ( haveCall ) tumorCallsMade++; + + printVCFLine(vcf_writer,normalCall,tumorCall); + + if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); + + if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall ); + lastGenotypedPosition = pos; + } + tumor_context.indelsAt(pos).clear(); + normal_context.indelsAt(pos).clear(); + // we dealt with this indel; don't want to see it again + // (we might otherwise in the case when 1) there is another indel that follows + // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) + +// for ( IndelVariant var : variants ) { +// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); +// } + } + + if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); + tumor_context.shift((int)(move_to - tumor_context.getStart() ) ); + normal_context.shift((int)(move_to - normal_context.getStart() ) ); + } + + private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) { + StringBuilder fullRecord = new StringBuilder(); + if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) { + fullRecord.append(tumorCall.makeEventString()); + } else { + fullRecord.append(normalCall.makeEventString()); + } + fullRecord.append('\t'); + fullRecord.append(normalCall.makeStatsString("N_")); + fullRecord.append('\t'); + fullRecord.append(tumorCall.makeStatsString("T_")); + fullRecord.append('\t'); + return fullRecord.toString(); + } + + private String makeFullRecord(IndelPrecall normalCall) { + StringBuilder fullRecord = new StringBuilder(); + fullRecord.append(normalCall.makeEventString()); + fullRecord.append('\t'); + fullRecord.append(normalCall.makeStatsString("")); + fullRecord.append('\t'); + return fullRecord.toString(); + } + + private String getAnnotationString(RODRecordList ann) { + if ( ann == null ) return annGenomic; + else { + 
StringBuilder b = new StringBuilder(); + + if ( RefSeqFeature.isExon(ann) ) { + if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence + else b.append(annUTR); // exon but not coding = UTR + } else { + if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron + else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... + } + b.append('\t'); + b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed +// while ( it.hasNext() ) { // +// t.getGeneName() +// } + return b.toString(); + } + + } + + public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) { + RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); + String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); + + StringBuilder fullRecord = new StringBuilder(); + fullRecord.append(makeFullRecord(normalCall)); + fullRecord.append(annotationString); + if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); + try { + verboseWriter.write(fullRecord.toString()); + verboseWriter.write('\n'); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); + } + + } + + + public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) { + RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); + String annotationString = (refseqIterator == null ? 
"" : getAnnotationString(annotationList)); + + StringBuilder fullRecord = new StringBuilder(); + fullRecord.append(makeFullRecord(normalCall,tumorCall)); + + if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) { + // did not observe anything + if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); + else { + if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor + else { + // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal, + // this still could be a somatic event. so either way it is 'unknown' + fullRecord.append("UNKNOWN"); + } + } + + } + + if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) { + // looks like somatic call + if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal + else { + // low coverage in normal + fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic + } + } + + if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) { + // it's likely germline (with missing observation in tumor - maybe loh? + if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor + else { + // low coverage in tumor, maybe we missed the event + fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal... + } + } + + if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) { + // events in both T/N, got to be germline! + fullRecord.append("GERMLINE"); + } + + + fullRecord.append('\t'); + fullRecord.append(annotationString); + + if ( ! 
tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); + + try { + verboseWriter.write(fullRecord.toString()); + verboseWriter.write('\n'); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); + } + } + + public void printVCFLine(VCFWriter vcf, IndelPrecall call) { + + long start = call.getPosition()-1; + // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. + // The suggestion is instead of putting the base before the indel, to put the base after the indel. + // For now, just don't print out that site. + if ( start == 0 ) + return; + + long stop = start; + + List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site + List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype + + + if ( call.getVariant() == null ) { + // we will need to cteate genotype with two (hom) ref alleles (below). + // we can not use 'alleles' list here, since that list is supposed to contain + // only *distinct* alleles observed at the site or VCFContext will frown upon us... + alleles.add( Allele.create(refBases[(int)start-1],true) ); + homref_alleles = new ArrayList(2); + homref_alleles.add( alleles.get(0)); + homref_alleles.add( alleles.get(0)); + } else { + // we always create alt allele when we observe anything but the ref, even if it is not a call! + // (Genotype will tell us whether it is an actual call or not!) 
+            int event_length = call.getVariant().lengthOnRef();
+            if ( event_length < 0 ) event_length = 0;
+            fillAlleleList(alleles,call);
+            stop += event_length;
+        }
+
+        // The no-call branch below emits a ref/ref genotype, so it needs the hom-ref allele list even when
+        // an indel was observed (in which case it has not been built yet and would be passed as null).
+        // fillAlleleList() guarantees that the first allele in 'alleles' is the ref allele; the two-sample
+        // printVCFLine() builds its hom-ref list from alleles.get(0) in the same way.
+        if ( homref_alleles == null ) {
+            homref_alleles = new ArrayList(2);
+            homref_alleles.add( alleles.get(0));
+            homref_alleles.add( alleles.get(0));
+        }
+
+        Map genotypes = new HashMap();
+
+        for ( String sample : normalSamples ) {
+
+            Map attrs = call.makeStatsAttributes(null);
+
+            if ( call.isCall() ) // we made a call - put actual het genotype here:
+                genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false));
+            else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
+                genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false));
+
+        }
+        // an observed indel that did not pass the calling thresholds is still written, but flagged "NoCall"
+        Set filters = null;
+        if ( call.getVariant() != null && ! call.isCall() ) {
+            filters = new HashSet();
+            filters.add("NoCall");
+        }
+        VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes,
+                -1.0 /* log error */, filters, null, refBases[(int)start-1]);
+        vcf.add(vc);
+    }
+
+    /** Fills l with appropriate alleles depending on whether call is insertion or deletion
+     * (call MUST have a variant or this method will crash). It is guaranteed that the *first* allele added
+     * to the list is ref, and the next one is alt. An event with lengthOnRef() == 0 is treated as an insertion
+     * (null ref allele, inserted bases as alt); anything else as a deletion (deleted bases as ref, null alt).
+     * @param l list the two alleles are appended to
+     * @param call precall whose consensus variant (getVariant()) supplies the indel bases
+     */
+    private void fillAlleleList(List l, IndelPrecall call) {
+        int event_length = call.getVariant().lengthOnRef();
+        if ( event_length == 0 ) { // insertion
+
+            l.add( Allele.create(Allele.NULL_ALLELE_STRING,true) );
+            l.add( Allele.create(call.getVariant().getBases(), false ));
+
+        } else { //deletion:
+            l.add( Allele.create(call.getVariant().getBases(), true ));
+            l.add( Allele.create(Allele.NULL_ALLELE_STRING,false) );
+        }
+    }
+
+    /** Writes a single VCF record for the site of tCall, with genotypes for all normal and all tumor samples.
+     * Nothing is written when the record would start at position 0.
+     * @param vcf writer the record is added to
+     * @param nCall precall computed from the normal sample(s) at the site
+     * @param tCall precall computed from the tumor sample(s) at the site
+     */
+    public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) {
+
+        long start = tCall.getPosition()-1;
+        long stop = start;
+
+        // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed.
+ // The suggestion is instead of putting the base before the indel, to put the base after the indel. + // For now, just don't print out that site. + if ( start == 0 ) + return; + + Map attrsNormal = nCall.makeStatsAttributes(null); + Map attrsTumor = tCall.makeStatsAttributes(null); + + Map attrs = new HashMap(); + + boolean isSomatic = false; + if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) { + isSomatic = true; + attrs.put(VCFConstants.SOMATIC_KEY,true); + } + List alleles = new ArrayList(2); // all alleles at the site + // List normal_alleles = null; // all alleles at the site + List homRefAlleles = null; + +// if ( nCall.getVariant() == null || tCall.getVariant() == null ) { + homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls +// } + boolean homRefT = ( tCall.getVariant() == null ); + boolean homRefN = ( nCall.getVariant() == null ); + if ( tCall.getVariant() == null && nCall.getVariant() == null) { + // no indel at all ; create base-representation ref/ref alleles for genotype construction + alleles.add( Allele.create(refBases[(int)start-1],true) ); + } else { + // we got indel(s) + int event_length = 0; + if ( tCall.getVariant() != null ) { + // indel in tumor + event_length = tCall.getVariant().lengthOnRef(); + fillAlleleList(alleles, tCall); + } else { + event_length = nCall.getVariant().lengthOnRef(); + fillAlleleList(alleles, nCall); + } + if ( event_length > 0 ) stop += event_length; + } + homRefAlleles.add( alleles.get(0)); + homRefAlleles.add( alleles.get(0)); + + Map genotypes = new HashMap(); + + for ( String sample : normalSamples ) { + genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false)); + } + + for ( String sample : tumorSamples ) { + genotypes.put(sample,new Genotype(sample, homRefT ? 
homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) );
+        }
+
+        // collect site-level filters: observed-but-not-called tumor indel, low normal coverage, low tumor coverage
+        Set filters = null;
+        if ( tCall.getVariant() != null && ! tCall.isCall() ) {
+            filters = new HashSet();
+            filters.add("NoCall");
+        }
+        if ( nCall.getCoverage() < minNormalCoverage ) {
+            if ( filters == null ) filters = new HashSet();
+            filters.add("NCov");
+        }
+        if ( tCall.getCoverage() < minCoverage ) {
+            if ( filters == null ) filters = new HashSet();
+            filters.add("TCov");
+        }
+
+        VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes,
+                -1.0 /* log error */, filters, attrs, refBases[(int)start-1]);
+        vcf.add(vc);
+    }
+
+    /** Emits whatever is still held in the window (forced shift far past any real position), writes the
+     * call counts to the metrics writer if there is one, and closes the BED and verbose writers.
+     * @param result reduce result, passed on unchanged to the superclass
+     */
+    @Override
+    public void onTraversalDone(Integer result) {
+        if ( DEBUG ) {
+            System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop());
+        }
+        if ( call_somatic ) emit_somatic(1000000000, true);
+        else emit(1000000000,true); // emit everything we might have left
+
+        if ( metricsWriter != null ) {
+            metricsWriter.println(String.format("Normal calls made %d", normalCallsMade));
+            metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade));
+            metricsWriter.close();
+        }
+
+        try {
+            try {
+                if ( bedWriter != null ) bedWriter.close();
+            } finally {
+                // close the verbose writer even if closing the BED writer failed, so it is never leaked
+                if ( verboseWriter != null ) verboseWriter.close();
+            }
+        } catch (IOException e) {
+            System.out.println("Failed to close output BED file gracefully, data may be lost");
+            e.printStackTrace();
+        }
+        super.onTraversalDone(result);
+    }
+
+    /** Adds value to the running sum. A value of -1 is treated as a fatal signal: the traversal is finalized
+     * through onTraversalDone(sum) and the JVM is terminated with exit status 1.
+     * @param value result of the latest map call
+     * @param sum running total
+     * @return sum + value
+     */
+    @Override
+    public Integer reduce(Integer value, Integer sum) {
+        if ( value == -1 ) {
+            onTraversalDone(sum);
+            System.exit(1);
+        }
+        sum += value;
+        return sum;
+    }
+
+    /** @return the initial value (0) of the running sum used by reduce() */
+    @Override
+    public Integer reduceInit() {
+        return Integer.valueOf(0);
+    }
+
+
+    /** A single indel (insertion or deletion, see Type) together with the reads and samples it was observed in
+     * and the offsets of the event from the start/end of those reads.
+     */
+    static class IndelVariant {
+        public static enum Type { I, D};
+        private String bases;
+        private Type type;
+        private ArrayList fromStartOffsets = null;
+        private ArrayList fromEndOffsets = null;
+
+        private Set reads = new HashSet(); // keep track of reads
that have this indel
+        private Set samples = new HashSet(); // which samples had the indel described by this object
+
+        /** Creates a variant of the given type and bases (stored upper-cased) and registers read as its
+         * first observation.
+         * @param read first read in which this indel was observed
+         * @param type insertion (I) or deletion (D)
+         * @param bases inserted or deleted bases
+         */
+        public IndelVariant(ExpandedSAMRecord read , Type type, String bases) {
+            this.type = type;
+            this.bases = bases.toUpperCase();
+            addObservation(read);
+            fromStartOffsets = new ArrayList();
+            fromEndOffsets = new ArrayList();
+        }
+
+        /** Adds another observation for the current indel. It is assumed that the read being registered
+         * does contain the observation, no checks are performed. Read's sample is added to the list of samples
+         * this indel was observed in as well. A read that was already registered is silently ignored.
+         * @param read read in which this indel was observed
+         */
+        public void addObservation(ExpandedSAMRecord read) {
+            if ( reads.contains(read) ) {
+                //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam
+                // seeing the same read again can mean only one thing: the input bam file is corrupted and contains
+                // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector
+                // so this is a short-term patch: don't cry, but just ignore the duplicate record
+
+                //throw new StingException("Attempting to add indel observation that was already registered");
+                return;
+            }
+            reads.add(read);
+            String sample = null;
+            if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample();
+            if ( sample != null ) samples.add(sample);
+        }
+
+
+        /** Returns length of the event on the reference (number of deleted bases
+         * for deletions, 0 for insertions).
+         * @return number of reference bases spanned by the event; 0 for an insertion
+         */
+        public int lengthOnRef() {
+            if ( type == Type.D ) return bases.length();
+            else return 0;
+        }
+
+
+        /** Records sample as one in which this indel was observed; null is ignored. */
+        public void addSample(String sample) {
+            if ( sample != null )
+                samples.add(sample);
+        }
+
+        /** Records the offsets of one observation of this indel from the start and from the end of its read. */
+        public void addReadPositions(int fromStart, int fromEnd) {
+            fromStartOffsets.add(fromStart);
+            fromEndOffsets.add(fromEnd);
+        }
+
+        public List getOffsetsFromStart() { return fromStartOffsets ; }
+        public List getOffsetsFromEnd() { return fromEndOffsets; }
+
+        /** @return comma-separated names of the samples this indel was observed in (empty string if none) */
+        public String getSamples() {
+            StringBuilder sb = new StringBuilder();
+            Iterator i = samples.iterator();
+            while ( i.hasNext() ) {
+                sb.append(i.next());
+                if ( i.hasNext() )
+                    sb.append(",");
+            }
+            return sb.toString();
+        }
+
+        public Set getReadSet() { return reads; }
+
+        public int getCount() { return reads.size(); }
+
+        public String getBases() { return bases; }
+
+        public Type getType() { return type; }
+
+        /** Two variants are equal when they have the same type and the same (upper-cased) bases;
+         * the supporting reads and samples are not compared.
+         */
+        @Override
+        public boolean equals(Object o) {
+            if ( ! ( o instanceof IndelVariant ) ) return false;
+            IndelVariant that = (IndelVariant)o;
+            return ( this.type == that.type && this.bases.equals(that.bases) );
+        }
+
+        /** Hash code consistent with equals(Object): computed from type and bases only. */
+        @Override
+        public int hashCode() {
+            return 31 * ( type == null ? 0 : type.hashCode() ) + bases.hashCode();
+        }
+
+        /** Convenience comparison against a raw type/bases pair; bases is upper-cased before comparing. */
+        public boolean equals(Type type, String bases) {
+            return ( this.type == type && this.bases.equals(bases.toUpperCase()) );
+        }
+    }
+
+    /**
+     * Utility class that encapsulates the logic related to collecting all the stats and counts required to
+     * make (or discard) a call, as well as the calling heuristics that uses those data.
+ */ + class IndelPrecall { +// private boolean DEBUG = false; + private int NQS_MISMATCH_CUTOFF = 1000000; + private double AV_MISMATCHES_PER_READ = 1.5; + + private int nqs = 0; + private IndelVariant consensus_indel = null; // indel we are going to call + private long pos = -1 ; // position on the ref + private int total_coverage = 0; // total number of reads overlapping with the event + private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed + private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position + + private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel + private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size) + private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window + private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window + + private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only + private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only + private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only + private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only + + + private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only + private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only + private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of 
qualitites of bases in nqs window from consensus indel-containing reads only + private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only + + + private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel + private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position + private double all_read_total_mm = 0.0; // sum of all mismatches in all reads + + private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel + private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position + private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads + + private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int(); + private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int(); + private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int(); + + private int from_start_median = 0; + private int from_start_mad = 0; + private int from_end_median = 0; + private int from_end_mad = 0; + + /** Makes an empty call (no-call) with all stats set to 0 + * + * @param position + */ + public IndelPrecall(long position) { + this.pos = position; + } + + public IndelPrecall(WindowContext context, long position, int nqs_width) { + this.pos = position; + this.nqs = nqs_width; + total_coverage = context.coverageAt(pos,true); + List variants = context.indelsAt(pos); + findConsensus(variants); + + // pos is the first base after the event: first deleted base or first base after insertion. 
+ // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion + // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion. + // we initialize the nqs window for no-event/insertion case + long left = Math.max( pos-nqs, context.getStart() ); + long right = Math.min(pos+nqs-1, context.getStop()); +//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); + Iterator read_iter = context.getReads().iterator(); + + + while ( read_iter.hasNext() ) { + ExpandedSAMRecord rec = read_iter.next(); + SAMRecord read = rec.getSAMRecord(); + byte[] flags = rec.getExpandedMMFlags(); + byte[] quals = rec.getExpandedQuals(); + int mm = rec.getMMCount(); + + + if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; + + long local_right = right; // end of nqs window for this particular read. May need to be advanced further right + // if read has a deletion. 
The gap in the middle of nqs window will be skipped + // automatically since flags/quals are set to -1 there + + boolean read_has_a_variant = false; + boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) ); + for ( IndelVariant v : variants ) { + if ( v.getReadSet().contains(rec) ) { + read_has_a_variant = true; + local_right += v.lengthOnRef(); + break; + } + } + + if ( read_has_consensus ) { + consensus_indel_read_total_mm += mm; + consensus_indel_read_total_mapq += read.getMappingQuality(); + if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; + else consensus_indel_read_orientation_cnt.first++; + } + if ( read_has_a_variant ) { + all_indel_read_total_mm += mm; + all_indel_read_total_mapq += read.getMappingQuality(); + if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; + else all_indel_read_orientation_cnt.first++; + } + + all_read_total_mm+= mm; + all_read_total_mapq += read.getMappingQuality(); + if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; + else all_read_orientation_cnt.first++; + + for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0); + pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1); + pos_in_flags++) { + + if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref + total_bases_in_nqs_window++; + if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++; + if ( read_has_a_variant ) indel_read_bases_in_nqs_window++; + + if ( quals[pos_in_flags] != -1 ) { + + total_base_qual_in_nqs_window += quals[pos_in_flags]; + if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; + if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; + } + + if ( flags[pos_in_flags] == 1 ) { // it's a mismatch + total_mismatches_in_nqs_window++; + 
total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags];
+
+                        if ( read_has_consensus ) {
+                            consensus_indel_read_mismatches_in_nqs_window++;
+                            consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags];
+                        }
+
+                        if ( read_has_a_variant ) {
+                            indel_read_mismatches_in_nqs_window++;
+                            indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags];
+                        }
+                    }
+                }
+//                if ( pos == 3534096 ) {
+//                    System.out.println(read.getReadName());
+//                    System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window);
+//                    System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window);
+//                }
+
+            }
+
+            // compute median/mad for offsets from the read starts/ends
+            if ( consensus_indel != null ) {
+                from_start_median = median(consensus_indel.getOffsetsFromStart()) ;
+                from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median);
+                from_end_median = median(consensus_indel.getOffsetsFromEnd()) ;
+                from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median);
+            }
+        }
+
+        /** Returns the median of l. As a side effect will sort l!
+         * For an even number of elements the result is the integer (truncated) mean of the two middle values.
+         *
+         * @param l list of offsets; sorted in place
+         * @return the median, or 0 if l is empty (same as the default of the median/mad fields)
+         */
+        private int median(List l) {
+            if ( l.isEmpty() ) return 0; // no offsets recorded: avoid indexing element -1 below
+            Collections.sort(l);
+            int k = l.size()/2;
+            return ( l.size() % 2 == 0 ?
+                    (l.get(k-1).intValue()+l.get(k).intValue())/2 :
+                    l.get(k).intValue());
+        }
+
+        /** Array flavor of median(List): sorts l in place and returns its median, or 0 if l is empty. */
+        private int median(int[] l) {
+            if ( l.length == 0 ) return 0; // avoid indexing element -1 below
+            Arrays.sort(l);
+            int k = l.length/2;
+            return ( l.length % 2 == 0 ?
+ (l[k-1]+l[k])/2 : + l[k]); + } + + private int mad(List l, int med) { + int [] diff = new int[l.size()]; + for ( int i = 0; i < l.size(); i++ ) { + diff[i] = Math.abs(l.get(i).intValue() - med); + } + return median(diff); + } + + public long getPosition() { return pos; } + + public boolean hasObservation() { return consensus_indel != null; } + + public int getCoverage() { return total_coverage; } + + public double getTotalMismatches() { return all_read_total_mm; } + public double getConsensusMismatches() { return consensus_indel_read_total_mm; } + public double getAllVariantMismatches() { return all_indel_read_total_mm; } + + /** Returns average number of mismatches per consensus indel-containing read */ + public double getAvConsensusMismatches() { + return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 ); + } + + /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */ + public double getAvRefMismatches() { + int coverage_ref = total_coverage-all_indel_count; + return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 ); + } + + public PrimitivePair.Int getConsensusStrandCounts() { + return consensus_indel_read_orientation_cnt; + } + + public PrimitivePair.Int getRefStrandCounts() { + return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first, + all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second); + } + + /** Returns a sum of mapping qualities of all reads spanning the event. */ + public double getTotalMapq() { return all_read_total_mapq; } + + /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */ + public double getConsensusMapq() { return consensus_indel_read_total_mapq; } + + /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. 
*/
+        public double getAllVariantMapq() { return all_indel_read_total_mapq; }
+
+        /** Returns average mapping quality per consensus indel-containing read (0.0 if there are no such reads). */
+        public double getAvConsensusMapq() {
+            return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 );
+        }
+
+        /** Returns average mapping quality per read across all reads matching the ref (not containing any indel
+         * variants), or 0.0 if there are no such reads. */
+        public double getAvRefMapq() {
+            int coverage_ref = total_coverage-all_indel_count;
+            return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 );
+        }
+
+        /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads,
+         * in which consensus indel is observed (0 if no such bases were counted). NOTE: NQS window for indel
+         * containing reads is defined around the indel itself (e.g. for a 10-base deletion spanning [X,X+9],
+         * the 5-NQS window is {[X-5,X-1],[X+10,X+14]})
+         * */
+        public double getNQSConsensusMMRate() {
+            if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0;
+            return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window;
+        }
+
+        /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads
+         * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref
+         * reads is defined around the event start position, NOT around the actual consensus indel.
+         * Returns 0 if no ref-read bases were counted in the window.
+         * */
+        public double getNQSRefMMRate() {
+            int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window;
+            if ( num_ref_bases == 0 ) return 0;
+            return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases;
+        }
+
+        /** Returns average base quality in NQS window around the indel, across all reads,
+         * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around
+         * the indel itself (e.g.
for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+14]})
+         * */
+        public double getNQSConsensusAvQual() {
+            if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0;
+            return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window;
+        }
+
+        /** Returns average base quality in NQS window around the indel start position, across all reads
+         * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref
+         * reads is defined around the event start position, NOT around the actual consensus indel.
+         * Returns 0 if no ref-read bases were counted in the window.
+         * */
+        public double getNQSRefAvQual() {
+            int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window;
+            if ( num_ref_bases == 0 ) return 0;
+            return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases;
+        }
+
+        /** Returns the number of mismatches in the NQS window, counted over all reads. */
+        public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; }
+
+        /** Returns the number of reads with any indel at this position. */
+        public int getAllVariantCount() { return all_indel_count; }
+        /** Returns the number of reads in which the consensus indel was observed. */
+        public int getConsensusVariantCount() { return consensus_indel_count; }
+
+//        public boolean failsNQSMismatch() {
+//            //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used!
+//            return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) ||
+//                    ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ );
+//        }
+
+        /** Returns the consensus indel at this position, or null if no indel was observed. */
+        public IndelVariant getVariant() { return consensus_indel; }
+
+        /** Decides whether the consensus indel qualifies as a call. All of the following must hold:
+         * at least minIndelCount consensus observations; consensus count strictly above minFraction of the total
+         * coverage; consensus count strictly above minConsensusFraction of all indel observations at the site;
+         * and total coverage of at least minCoverage.
+         */
+        public boolean isCall() {
+            boolean ret = ( consensus_indel_count >= minIndelCount &&
+                    (double)consensus_indel_count > minFraction * total_coverage &&
+                    (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage);
+            if ( DEBUG && !
ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+ + " total_count="+all_indel_count+" cov="+total_coverage+ + " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ + " minF="+((double)consensus_indel_count)/total_coverage); + return ret; + + } + + /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed + * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) + * @param variants + * @return + */ + private void findConsensus(List variants) { + for ( IndelVariant var : variants ) { + if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); + int cnt = var.getCount(); + all_indel_count +=cnt; + if ( cnt > consensus_indel_count ) { + consensus_indel = var; + consensus_indel_count = cnt; + } + } + if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+ + " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count); + } + + + + public void printBedLine(Writer bed) { + int event_length; + if ( consensus_indel == null ) event_length = 0; + else { + event_length = consensus_indel.lengthOnRef(); + if ( event_length < 0 ) event_length = 0; + } + + StringBuffer message = new StringBuffer(); + message.append(refName+"\t"+(pos-1)+"\t"); + message.append((pos-1+event_length)+"\t"); + if ( consensus_indel != null ) { + message.append((event_length>0? 
"-":"+")+consensus_indel.getBases()); + } else { + message.append('.'); + } + message.append(":"+all_indel_count+"/"+total_coverage); + try { + bed.write(message.toString()+"\n"); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e); + } + } + + public String makeEventString() { + int event_length; + if ( consensus_indel == null ) event_length = 0; + else { + event_length = consensus_indel.lengthOnRef(); + if ( event_length < 0 ) event_length = 0; + } + StringBuffer message = new StringBuffer(); + message.append(refName); + message.append('\t'); + message.append(pos-1); + message.append('\t'); + message.append(pos-1+event_length); + message.append('\t'); + if ( consensus_indel != null ) { + message.append((event_length>0?'-':'+')); + message.append(consensus_indel.getBases()); + } else { + message.append('.'); + } + return message.toString(); + } + + public String makeStatsString(String prefix) { + StringBuilder message = new StringBuilder(); + message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage()); + message.append('\t'); + message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(), + getAvRefMismatches())); + message.append('\t'); + message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(), + getAvRefMapq())); + message.append('\t'); + message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate())); + message.append('\t'); + message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual())); + + PrimitivePair.Int strand_cons = getConsensusStrandCounts(); + PrimitivePair.Int strand_ref = getRefStrandCounts(); + message.append('\t'); + 
message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second); + + message.append('\t'); + message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad); + message.append('\t'); + message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad); + + return message.toString(); + + } + + /** + * Places alignment statistics into attribute map and returns the map. If attr parameter is null, + * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that + * preexisting map, and the same instance of the (updated) map is returned. + * + * @param attr + * @return + */ + public Map makeStatsAttributes(Map attr) { + if ( attr == null ) attr = new HashMap(); + + VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr); + + VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr); + + VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr); + + VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr); + + VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr); + + VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr); + + VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr); + + PrimitivePair.Int strand_cons = getConsensusStrandCounts(); + PrimitivePair.Int strand_ref = getRefStrandCounts(); + + VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); + return attr; + } + } + + interface IndelListener { + public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r); + } + + class WindowContext implements IndelListener { + private Set reads; + private int start=0; // where the window starts on the ref, 1-based + private CircularArray< List< 
IndelVariant > > indels; + + private List emptyIndelList = new ArrayList(); + + + public WindowContext(int start, int length) { + this.start = start; + indels = new CircularArray< List >(length); +// reads = new LinkedList(); + reads = new HashSet(); + } + + /** Returns 1-based reference start position of the interval this object keeps context for. + * + * @return + */ + public int getStart() { return start; } + + /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for. + * + * @return + */ + public int getStop() { return start + indels.length() - 1; } + + /** Resets reference start position to 0 and clears the context. + * + */ + public void clear() { + start = 0; + reads.clear(); + indels.clear(); + } + + /** + * Returns true if any indel observations are present in the specified interval + * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the + * current context window: positions outside of the window will be ignored. + * @param begin + * @param end + */ + public boolean hasIndelsInInterval(long begin, long end) { + for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) { + if ( indelsAt(k) != emptyIndelList ) return true; + } + return false; + } + + public Set getReads() { return reads; } + + /** Returns the number of reads spanning over the specified reference position + * (regardless of whether they have a base or indel at that specific location). + * The second argument controls whether to count with indels in mind (this is relevant for insertions only, + * deletions do not require any special treatment since they occupy non-zero length on the ref and since + * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions + * to the reference position right after the actual event, and we count all events assigned to a given position. 
+ * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads + * that support the ref rather than the indel. Few special cases may occur here: + * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its + * last element: we have to count that read into the "coverage" at the current position for the purposes of indel + * assessment, as the indel in that read will be counted at the current position, so the total coverage + * should be consistent with that. + */ + /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion + * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless + * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and + * read should be counted). For deletions, reads starting at the current position should always be counted (as they + * show no deletion=ref). + * @param refPos position on the reference; must be within the bounds of the window + */ + public int coverageAt(final long refPos, boolean countForIndels) { + int cov = 0; + for ( ExpandedSAMRecord read : reads ) { + if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) { + if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) { + Cigar c = read.getSAMRecord().getCigar(); + if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++; + } + continue; + } + cov++; + } + return cov; + } + + + /** Shifts current window to the right along the reference contig by the specified number of bases. + * The context will be updated accordingly (indels and reads that go out of scope will be dropped). 
+ * @param offset + */ + public void shift(int offset) { + start += offset; + + indels.shiftData(offset); + if ( indels.get(0) != null && indels.get(0).size() != 0 ) { + IndelVariant indel = indels.get(0).get(0); + + System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+ + (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored"); + indels.get(0).clear(); +// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ +// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()); + } + + Iterator read_iter = reads.iterator(); + + while ( read_iter.hasNext() ) { + ExpandedSAMRecord r = read_iter.next(); + if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope + read_iter.remove(); + } + } + } + + public void add(SAMRecord read, byte [] ref) { + + if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start + + ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this); + //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed + if ( reads.contains(er)) return; // ignore duplicate records + reads.add(er); + } + + public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) { + List indelsAtSite; + try { + indelsAtSite = indels.get(pos); + } catch (IndexOutOfBoundsException e) { + SAMRecord r = rec.getSAMRecord(); + System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+ + r.getReadName()+": "+ + "read 
length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ + r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ + "; window end="+getStop()); + throw e; + } + + if ( indelsAtSite == null ) { + indelsAtSite = new ArrayList(); + indels.set(pos, indelsAtSite); + } + + IndelVariant indel = null; + for ( IndelVariant v : indelsAtSite ) { + if ( ! v.equals(type, bases) ) continue; + + indel = v; + indel.addObservation(rec); + break; + } + + if ( indel == null ) { // not found: + indel = new IndelVariant(rec, type, bases); + indelsAtSite.add(indel); + } + indel.addReadPositions(fromStart,fromEnd); + } + + public List indelsAt( final long refPos ) { + List l = indels.get((int)( refPos - start )); + if ( l == null ) return emptyIndelList; + else return l; + } + + + } + + + class ExpandedSAMRecord { + private SAMRecord read; + private byte[] mismatch_flags; + private byte[] expanded_quals; + private int mms; + + public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) { + + read = r; + final long rStart = read.getAlignmentStart(); + final long rStop = read.getAlignmentEnd(); + final byte[] readBases = read.getReadString().toUpperCase().getBytes(); + + ref = new String(ref).toUpperCase().getBytes(); + + mismatch_flags = new byte[(int)(rStop-rStart+1)]; + expanded_quals = new byte[(int)(rStop-rStart+1)]; + + // now let's extract indels: + + Cigar c = read.getCigar(); + final int nCigarElems = c.numCigarElements(); + + + int readLength = 0; // length of the aligned part of the read NOT counting clipped bases + for ( CigarElement cel : c.getCigarElements() ) { + + switch(cel.getOperator()) { + case H: + case S: + case D: + case N: + case P: + break; // do not count gaps or clipped bases + case I: + case M: + readLength += cel.getLength(); + break; // advance along the gapless block in the alignment + default : + throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator()); + } + } + 
+ int fromStart = 0; + int posOnRead = 0; + int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: + // its start on the actual full reference contig is r.getAlignmentStart() + for ( int i = 0 ; i < nCigarElems ; i++ ) { + + final CigarElement ce = c.getCigarElement(i); + IndelVariant.Type type = null; + String indel_bases = null; + int eventPosition = posOnRef; + + switch(ce.getOperator()) { + case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element... + case I: + type = IndelVariant.Type.I; + indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength()); + // will increment position on the read below, there's no 'break' statement yet... + case S: + // here we also skip soft-clipped indel_bases on the read; according to SAM format specification, + // alignment start position on the reference points to where the actually aligned + // (not clipped) indel_bases go, so we do not need to increment reference position here + posOnRead += ce.getLength(); + break; + case D: + type = IndelVariant.Type.D; + indel_bases = new String( ref, posOnRef, ce.getLength() ); + for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1; + + break; + case M: + for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { + if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! + mms++; + mismatch_flags[posOnRef] = 1; + } + expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead]; + } + fromStart += ce.getLength(); + break; // advance along the gapless block in the alignment + default : + throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); + } + + if ( type == null ) continue; // element was not an indel, go grab next element... + + // we got an indel if we are here... 
+ if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName()); + if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName()); + + // note that here we will be assigning indels to the first deleted base or to the first + // base after insertion, not to the last base before the event! + int fromEnd = readLength - fromStart; + if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength(); + + l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this); + + if ( type == IndelVariant.Type.I ) fromStart += ce.getLength(); + + } + } + + public SAMRecord getSAMRecord() { return read; } + + public byte [] getExpandedMMFlags() { return mismatch_flags; } + + public byte [] getExpandedQuals() { return expanded_quals; } + + public int getMMCount() { return mms; } + + public boolean equals(Object o) { + if ( this == o ) return true; + if ( read == null ) return false; + if ( o instanceof SAMRecord ) return read.equals(o); + if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read); + return false; + } + + + } + +} + + +class VCFIndelAttributes { + public static String ALLELIC_DEPTH_KEY = "AD"; + public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; + + public static String MAPQ_KEY = "MQS"; + + public static String MM_KEY = "MM"; + + public static String NQS_MMRATE_KEY = "NQSMM"; + + public static String NQS_AVQ_KEY = "NQSBQ"; + + public static String STRAND_COUNT_KEY = "SC"; + public static String RSTART_OFFSET_KEY = "RStart"; + public static String REND_OFFSET_KEY = "REnd"; + + public static Set getAttributeHeaderLines() { + Set lines = new HashSet(); + + lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus indel/reference at the site")); + lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); + + lines.add(new 
VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of consensus indel-supporting reads/reference-supporting reads")); + + lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per consensus indel-supporting read/per reference-supporting read")); + + lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads")); + + lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads")); + + lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads")); + + lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); + lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); + + return lines; + } + + public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) { + attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} ); + return attrs; + } + + public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { + attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_cons, cnt_indel} ); + attrs.put(DEPTH_TOTAL_KEY, cnt_total); + return attrs; + } + + public static Map recordAvMapQ(double cons, double ref, Map attrs) { + attrs.put(MAPQ_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordAvMM(double cons, double ref, Map attrs) { + attrs.put(MM_KEY, new Float[] {(float)cons, 
(float)ref} ); + return attrs; + } + + public static Map recordNQSMMRate(double cons, double ref, Map attrs) { + attrs.put(NQS_MMRATE_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordNQSAvQ(double cons, double ref, Map attrs) { + attrs.put(NQS_AVQ_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordOffsetFromStart(int median, int mad, Map attrs) { + attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); + return attrs; + } + + public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { + attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); + return attrs; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java deleted file mode 100755 index 9aa370d3f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java +++ /dev/null @@ -1,890 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - - -/** - * Walks along all variant ROD loci, and dynamically annotates alleles at MNP records. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -public class AnnotateMNPsWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private ManualSortingVCFWriter sortingWriter = null; - - @Argument(fullName = "emitOnlyMNPs", shortName = "emitOnlyMNPs", doc = "Only output MNP records; [default:false]", required = false) - protected boolean emitOnlyMNPs = false; - - private LinkedList rodNames = null; - private GenomeLocParser locParser = null; - private TreeMap> MNPstartToStops = null; // Must be TreeMap sorted by START sites! - - public final static String REFSEQ_ROD_NAME = "refseq"; - public final static String VARIANT_ROD_NAME = "variant"; - - private LocusToFeatures locusToRefSeqFeatures = null; - - - protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq."; - - protected final static String REFSEQ_NAME = "name"; - protected final static String REFSEQ_NAME2 = "name2"; - - protected final static String REFSEQ_POSITION_TYPE = "positionType"; - protected final static String REFSEQ_CDS = "CDS"; - - protected final static String REFSEQ_STRAND = "transcriptStrand"; - protected final static String REFSEQ_POS_STRAND = "+"; - protected final static String REFSEQ_NEG_STRAND = "-"; - - protected final static String REFSEQ_CODON_COORD = "codonCoord"; - protected final static String REFSEQ_CODING_FRAME = "frame"; - - protected final static String REFSEQ_REF_CODON = "referenceCodon"; - protected final static String REFSEQ_REF_AA = "referenceAA"; - - protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate"; - - protected final static String REFSEQ_VARIANT_CODON = "variantCodon"; - protected final static String REFSEQ_VARIANT_AA 
= "variantAA"; - protected final static String REFSEQ_CHANGES_AA = "changesAA"; - protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass"; - protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr"; - - protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants"; - protected final static String REFSEQ_NUM_AA_CHANGES = "numAAchanges"; - protected final static String REFSEQ_HAS_MULT_AA_CHANGES = "alleleHasMultAAchanges"; - - public void initialize() { - rodNames = new LinkedList(); - rodNames.add(VARIANT_ROD_NAME); - - locParser = getToolkit().getGenomeLocParser(); - MNPstartToStops = new TreeMap>(); // sorted by start sites - - initializeVcfWriter(); - - locusToRefSeqFeatures = new LocusToFeatures(); - } - - private void initializeVcfWriter() { - sortingWriter = new ManualSortingVCFWriter(writer); - writer = sortingWriter; - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - writer.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site of interest, annotate it if it's a MNP. 
- * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return count of MNPs observed - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - int numMNPsObserved = 0; - GenomeLoc curLocus = ref.getLocus(); - clearOldLocusFeatures(curLocus); - - boolean requireStartHere = false; // see EVERY site of the MNP - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { - GenomeLoc vcLoc = VariantContextUtils.getLocation(locParser, vc); - boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart(); - boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop(); - - if (vc.isMNP()) { - logger.debug("Observed MNP at " + vcLoc); - - if (isChrM(vc)) { - if (atStartOfVc) { - logger.warn("Skipping mitochondrial MNP at " + vcLoc + " due to complexity of coding table [need to know if first codon, etc.]..."); - writeVCF(vc); - } - continue; - } - - GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStop()); - final List refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME); - for (Object refSeqObject : refSeqRODs) { - AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject; - locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc); - } - - if (atStartOfVc) { // MNP is starting here, so register that we're waiting for it - Set stopLocs = MNPstartToStops.get(curLocus); - if (stopLocs == null) { - stopLocs = new HashSet(); - MNPstartToStops.put(curLocus, stopLocs); - } - stopLocs.add(stopLoc); - } - - if (atEndOfVc) { - numMNPsObserved++; // only count a MNP at its stop site - logger.debug("Observed end of MNP at " + curLocus); - logger.debug("Current list of per-locus features\n" + 
locusToRefSeqFeatures); - - Map MNPannotations = annotateMNP(vc); - MNPannotations.putAll(RefSeqDataParser.removeRefSeqAttributes(vc.getAttributes())); // remove any RefSeq INFO, since adding it in more thoroughly here - vc = VariantContext.modifyAttributes(vc, MNPannotations); - writeVCF(vc); - - GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStart()); - Set stopLocs = MNPstartToStops.get(startLoc); - if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop) - stopLocs.remove(stopLoc); - if (stopLocs.isEmpty()) // no longer waiting for startLoc - MNPstartToStops.remove(startLoc); - } - } - } - else if (atStartOfVc && !emitOnlyMNPs) {// only want to write other VariantContexts records once (where they start): - writeVCF(vc); - } - } - - Integer mostUpstreamWritableLoc = null; - if (!MNPstartToStops.isEmpty()) { - GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey(); - mostUpstreamWritableLoc = waitingForLoc.getStart() - 1; - } - sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc); - - return numMNPsObserved; - } - - private static boolean isChrM(final VariantContext vc) { - return vc.getChr().equals("chrM") || vc.getChr().equals("MT"); - } - - private Map annotateMNP(VariantContext vc) { - Map annotations = new HashMap(); - - RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc); - MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures); - - for (Map.Entry nameToFeatureEntry : nameToPositionalFeatures.entrySet()) { - String featureName = nameToFeatureEntry.getKey(); - RefSeqFeatureList feature = nameToFeatureEntry.getValue(); - CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature); - - annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.getCodonAnnotationsString()); - 
annotations.put(kb.getKey(REFSEQ_NUM_AA_CHANGES), codonAnnotationsForAlleles.getNumAAchangesString()); - annotations.put(kb.getKey(REFSEQ_HAS_MULT_AA_CHANGES), codonAnnotationsForAlleles.hasAlleleWithMultipleAAchanges); - annotations.put(kb.getKey(REFSEQ_NAME), featureName); - annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2); - annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS); - annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND)); - annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString()); - - kb.incrementFeatureIndex(); - } - - return annotations; - } - - private static class MNPannotationKeyBuilder { - private int featureIndex; - private boolean multipleEntries; - - public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) { - this.featureIndex = 1; - this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1; - } - - public void incrementFeatureIndex() { - featureIndex++; - } - - public String getKey(String type) { - String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type; - if (multipleEntries) - annotationKey += "_" + featureIndex; - return annotationKey; - } - } - - private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) { - byte[] primArray = new byte[nonNullArray.length]; - - for (int i = 0; i < nonNullArray.length; i++) { - if (nonNullArray[i] == null) - throw new ReviewedStingException("nonNullArray[i] == null"); - primArray[i] = nonNullArray[i]; - } - - return primArray; - } - - private void clearOldLocusFeatures(GenomeLoc curLoc) { - Iterator> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator(); - while (locusFeaturesIt.hasNext()) { - Map.Entry locusFeaturesEntry = locusFeaturesIt.next(); - if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures())) - locusFeaturesIt.remove(); - } - } - - public Integer reduce(Integer count, Integer total) { - if (count != null) - total = total + 
count; - - return total; - } - - /** - * @param result the number of MNPs processed. - */ - public void onTraversalDone(Integer result) { - System.out.println("Number of MNPs observed: " + result); - writer.close(); - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, writer, logger); - } - - /* - Inner classes: - */ - - // Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP: - - private class RefSeqNameToFeatures { - private Map nameToFeatures; - - public RefSeqNameToFeatures(VariantContext vc) { - this.nameToFeatures = new HashMap(); - - int MNPstart = vc.getStart(); - int MNPstop = vc.getEnd(); - int MNPlength = MNPstop - MNPstart + 1; - - for (int i = 0; i < MNPlength; i++) { - int genomicPosition = MNPstart + i; - GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition); - - PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc); - if (locFeatures == null) // no features for posLoc - continue; - - for (Map.Entry nameToFeatureEntry : locFeatures.entrySet()) { - String name = nameToFeatureEntry.getKey(); - PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue(); - - RefSeqFeatureList featureList = nameToFeatures.get(name); - if (featureList == null) { - featureList = new RefSeqFeatureList(MNPlength); - nameToFeatures.put(name, featureList); - } - featureList.updateFeatureAtPosition(i, posFeature); - } - } - } - - public Set> entrySet() { - return nameToFeatures.entrySet(); - } - } - - // For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP - - private static class RefSeqFeatureList { - private final static String CODON_FRAME_START = "("; - private final static String CODON_FRAME_END = ")"; - private final static String CODON_DELIM = "|"; - - private CodingRefSeqFeature[] refSeqFeatures; - private String name2; - private Boolean positiveStrand; - - private Map> codonToIndices; // Map of: codon 
index -> MNP indices that refer to codon - - public RefSeqFeatureList(int MNPlength) { - this.refSeqFeatures = new CodingRefSeqFeature[MNPlength]; - for (int i = 0; i < MNPlength; i++) - this.refSeqFeatures[i] = null; - - this.name2 = null; - this.positiveStrand = null; - this.codonToIndices = new TreeMap>(); - } - - public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) { - if (name2 == null) { - name2 = feature.name2; - positiveStrand = feature.positiveStrand; - } - else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) { - throw new UserException("Inconsistency between previous RefSeq entry and: " + feature); - } - - CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature); - refSeqFeatures[index] = crsf; - - List indicesWithCodon = codonToIndices.get(crsf.codonCoord); - if (indicesWithCodon == null) { - indicesWithCodon = new LinkedList(); - codonToIndices.put(crsf.codonCoord, indicesWithCodon); - } - indicesWithCodon.add(index); - } - - public Set>> codonIndicesEntrySet() { - return codonToIndices.entrySet(); - } - - public String getCodonCoordString() { - StringBuilder sb = new StringBuilder(); - - for (int i = 0; i < refSeqFeatures.length; i++) { - CodingRefSeqFeature crsf = refSeqFeatures[i]; - if (crsf != null) - sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END); - if (i < refSeqFeatures.length - 1) - sb.append(CODON_DELIM); - } - - return sb.toString(); - } - } - - private static class CodingRefSeqFeature { - protected int codonCoord; - protected int codingFrame; - protected String referenceCodon; - protected String referenceAA; - - public CodingRefSeqFeature(PositionalRefSeqFeature feature) { - this.codonCoord = feature.codonCoord; - this.codingFrame = feature.codingFrame; - this.referenceCodon = feature.referenceCodon.toUpperCase(); - this.referenceAA = feature.referenceAA; - } - } - - private static class CodonAnnotationsForAltAlleles { - 
protected final static int MIN_CODON_INDEX = 0; - protected final static int NUM_CODON_INDICES = 3; - private final static String CODON_ANNOTATION_DELIM = ","; - - private List alleleAnnotations; - private int[] alleleToNumAAchanges; - private boolean hasAlleleWithMultipleAAchanges; - - public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) { - this.alleleAnnotations = new LinkedList(); - - Set altAlleles = vc.getAlternateAlleles(); - int numAltAlleles = altAlleles.size(); - this.alleleToNumAAchanges = new int[numAltAlleles]; - for (int i = 0; i < numAltAlleles; i++) - this.alleleToNumAAchanges[i] = 0; - - int MNPstart = vc.getStart(); - int MNPstop = vc.getEnd(); - int MNPlength = MNPstop - MNPstart + 1; - - for (Map.Entry> codonToIndicesEntry : feature.codonIndicesEntrySet()) { - int codonIndex = codonToIndicesEntry.getKey(); - List indices = codonToIndicesEntry.getValue(); - if (indices.isEmpty()) - throw new ReviewedStingException("indices should not exist if it's empty!"); - - for (int index : indices) { - int frame = feature.refSeqFeatures[index].codingFrame; - if (feature.refSeqFeatures[index].codonCoord != codonIndex) - throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex"); - if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES) - throw new UserException("RefSeq codon frame not one of {0,1,2}"); - } - CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)]; - String refCodon = firstFeatureForCodon.referenceCodon; - - SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, altAlleles, MNPlength, refCodon, firstFeatureForCodon, indices, feature); - alleleAnnotations.add(codonAnnotation); - - // From a single codon, summarize the data for ALL alleles: - for (int i = 0; i < numAltAlleles; i++) { - if (codonAnnotation.annotationsForAlleles[i].codonFunc.changesAA) { - alleleToNumAAchanges[i]++; - if 
(alleleToNumAAchanges[i] > 1) - this.hasAlleleWithMultipleAAchanges = true; - } - } - } - } - - public String getCodonAnnotationsString() { - StringBuilder sb = new StringBuilder(); - - int index = 0; - for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) { - sb.append(codonToAlleles); - if (index < alleleAnnotations.size() - 1) - sb.append(CODON_ANNOTATION_DELIM); - index++; - } - - return sb.toString(); - } - - public String getNumAAchangesString() { - StringBuilder sb = new StringBuilder(); - - for (int index = 0; index < alleleToNumAAchanges.length; index++) { - sb.append(alleleToNumAAchanges[index]); - if (index < alleleToNumAAchanges.length - 1) - sb.append(SingleCodonAnnotationsForAlleles.ALLELE_ANNOTATION_DELIM); - } - - return sb.toString(); - } - } - - private static class SingleCodonAnnotationsForAlleles { - private final static String CODON_MAP_SYMBOL = "->"; - private final static String CODON_ANNOTATION_START = "["; - private final static String CODON_ANNOTATION_END = "]"; - private final static String REF_CODON_INFO_DELIM = "|"; - private final static String ALLELE_ANNOTATION_DELIM = ","; - private final static String ASSIGNMENT = ":"; - - private int codonIndex; - private String refCodon; - private String refAA; - - private SingleCodonAnnotationsForAllele[] annotationsForAlleles; - - public SingleCodonAnnotationsForAlleles(int codonIndex, Collection altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List indices, RefSeqFeatureList feature) { - if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES) - throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES); - - AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon); - if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA)) - throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + 
" = reference AA"); - - this.codonIndex = codonIndex; - this.refCodon = refCodon; - this.refAA = refAA.getCode(); - this.annotationsForAlleles = new SingleCodonAnnotationsForAllele[altAlleles.size()]; - - int altInd = 0; - for (Allele altAllele : altAlleles) { - if (altAllele.length() != MNPlength) - throw new ReviewedStingException("length(altAllele) != length(MNP)"); - byte[] altBases = altAllele.getBases(); - - Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES]; - for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++) - variantCodonArr[i] = null; - - for (int index : indices) { - int frame = feature.refSeqFeatures[index].codingFrame; - if (variantCodonArr[frame] != null) - throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame); - - byte base = altBases[index]; - if (!feature.positiveStrand) // negative strand codon - base = BaseUtils.simpleComplement(base); - - variantCodonArr[frame] = base; - } - - /* For missing frames, there MUST exist AT LEAST one index that refers to this codon, - so use it to derive the missing bases [ALREADY complemented if on the negative strand]: - */ - for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) { - if (variantCodonArr[frame] == null) - variantCodonArr[frame] = (byte) refCodon.charAt(frame); - } - String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase(); - - SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex); - annotationsForAlleles[altInd] = alleleAnnotation; - altInd++; - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START); - 
sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM); - sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM); - - int index = 0; - for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) { - sb.append(annotation); - if (index < annotationsForAlleles.length - 1) - sb.append(ALLELE_ANNOTATION_DELIM); - index++; - } - sb.append(CODON_ANNOTATION_END); - - return sb.toString(); - } - } - - private static class SingleCodonAnnotationsForAllele { - private final static String ALLELE_START = "{"; - private final static String ALLELE_END = "}"; - private final static String CODON_INFO_DELIM = "|"; - private final static String ASSIGNMENT = ":"; - private final static String MNP_DEPENDENT_AA = "MNPdependentAA"; - - private CodonFunction codonFunc; - private String proteinCoordStr; - private boolean MNPdependentAA; - private String originalAA; - - public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) { - this.codonFunc = new CodonFunction(variantCodon, refCodon, refAA); - this.proteinCoordStr = "p." 
+ refAA.getLetter() + codonIndex + codonFunc.variantAA.getLetter(); - - int refCodonLength = refCodon.length(); - if (codonFunc.variantCodon.length() != refCodonLength) - throw new ReviewedStingException("codonFunc.variantCodon.length() != refCodonLength, but ALREADY checked that they're both 3"); - - this.MNPdependentAA = true; - this.originalAA = "("; - for (int i = 0; i < refCodonLength; i++) { - // Take [0,i-1] and [i+1, end] from refCodon, and i from variantCodon: - String singleBaseChangeCodon = refCodon.substring(0, i) + variantCodon.substring(i, i+1) + refCodon.substring(i+1, refCodonLength); - CodonFunction singleBaseChangeCodonFunc = new CodonFunction(singleBaseChangeCodon, refCodon, refAA); - if (singleBaseChangeCodonFunc.variantAA.equals(codonFunc.variantAA)) { - this.MNPdependentAA = false; - this.originalAA = ""; - break; - } - - this.originalAA = this.originalAA + "" + singleBaseChangeCodonFunc.variantAA.getLetter(); - if (i < refCodonLength - 1) - this.originalAA = this.originalAA + ","; - } - - if (this.MNPdependentAA) - this.originalAA = this.originalAA + ")"; - } - - private static class CodonFunction { - private String variantCodon; - private AminoAcid variantAA; - private boolean changesAA; - private String functionalClass; - - public CodonFunction(String variantCodon, String refCodon, AminoAcid refAA) { - this.variantCodon = variantCodon; - this.variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon); - this.changesAA = !refAA.equals(variantAA); - - if (!this.variantCodon.equals(refCodon)) { - if (changesAA) { - if (variantAA.isStop()) { - functionalClass = "nonsense"; - } - else if (refAA.isStop()) { - functionalClass = "readthrough"; - } - else { - functionalClass = "missense"; - } - } - else { // the same aa: - functionalClass = "silent"; - } - } - else { // the same codon: - functionalClass = "no_change"; - } - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(ALLELE_START); - 
sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(codonFunc.variantCodon).append(CODON_INFO_DELIM); - sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(codonFunc.variantAA.getCode()).append(CODON_INFO_DELIM); - sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(codonFunc.changesAA).append(CODON_INFO_DELIM); - sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(codonFunc.functionalClass).append(CODON_INFO_DELIM); - sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr).append(CODON_INFO_DELIM); - sb.append(MNP_DEPENDENT_AA).append(ASSIGNMENT).append(MNPdependentAA).append(originalAA); - sb.append(ALLELE_END); - - return sb.toString(); - } - } -} - - -// External classes: - -class LocusToFeatures { - private Map locusToFeatures; - - public LocusToFeatures() { - this.locusToFeatures = new TreeMap(); - } - - public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) { - return locusToFeatures.get(loc); - } - - public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) { - PositionalRefSeqFeatures locFeatures = locusToFeatures.get(loc); - if (locFeatures == null) { - locFeatures = new PositionalRefSeqFeatures(locusUsingThis); - locusToFeatures.put(loc, locFeatures); - } - locFeatures.putFeature(refSeqAnnotation, locusUsingThis); - } - - public Set> entrySet() { - return locusToFeatures.entrySet(); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - for (Map.Entry locFeatures : entrySet()) { - GenomeLoc loc = locFeatures.getKey(); - PositionalRefSeqFeatures features = locFeatures.getValue(); - sb.append("Locus: ").append(loc).append("\n").append(features); - } - - return sb.toString(); - } -} - -class PositionalRefSeqFeatures { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE}; - - private Map nameToFeature; - private GenomeLoc 
furthestLocusUsingFeatures; - - public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) { - this.nameToFeature = new HashMap(); - this.furthestLocusUsingFeatures = locusUsingThis; - } - - public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - - if (locusUsingThis.isPast(furthestLocusUsingFeatures)) - furthestLocusUsingFeatures = locusUsingThis; - - String posType = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE); - if (!posType.equals(AnnotateMNPsWalker.REFSEQ_CDS)) // only interested in coding sequence annotations - return; - - PositionalRefSeqFeature newLocusFeature = new PositionalRefSeqFeature(refSeqAnnotation); - - String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME); - PositionalRefSeqFeature locusFeature = nameToFeature.get(refSeqName); - if (locusFeature == null) { - locusFeature = newLocusFeature; - nameToFeature.put(refSeqName, locusFeature); - } - else if (!locusFeature.equals(newLocusFeature)) { - throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation); - } - - locusFeature.updateFeature(refSeqAnnotation); - } - - public GenomeLoc getFurthestLocusUsingFeatures() { - return furthestLocusUsingFeatures; - } - - public Set> entrySet() { - return nameToFeature.entrySet(); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - for (Map.Entry nameFeatureEntry : entrySet()) { - String name = nameFeatureEntry.getKey(); - PositionalRefSeqFeature feature = nameFeatureEntry.getValue(); - sb.append(name).append(" -> [").append(feature).append("]\n"); - } - - return sb.toString(); - } -} - -class PositionalRefSeqFeature { - private final static String[] REQUIRE_COLUMNS = - 
{AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND, - AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME, - AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA}; - - protected String name2; - protected boolean positiveStrand; - protected int codonCoord; - protected int codingFrame; - protected String referenceCodon; - protected String referenceAA; - - private Map baseToAnnotations; - - public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2); - this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND)); - this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD)); - this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME)); - this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON); - this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA); - - this.baseToAnnotations = new HashMap(); - } - - public boolean equals(PositionalRefSeqFeature that) { - return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame - && this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA); - } - - public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) { - if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE); - String 
base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE); - - baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation)); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - sb.append("name2= ").append(name2); - sb.append(", positiveStrand= ").append(positiveStrand); - sb.append(", codonCoord= ").append(codonCoord); - sb.append(", codingFrame= ").append(codingFrame); - sb.append(", referenceCodon= ").append(referenceCodon); - sb.append(", referenceAA= ").append(referenceAA); - - sb.append(", baseAnnotations= {"); - for (Map.Entry baseToAnnotationsEntry : baseToAnnotations.entrySet()) { - String base = baseToAnnotationsEntry.getKey(); - BaseAnnotations annotations = baseToAnnotationsEntry.getValue(); - sb.append(" ").append(base).append(" -> {").append(annotations).append("}"); - } - sb.append(" }"); - - return sb.toString(); - } -} - -class BaseAnnotations { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA, - AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS, - AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION}; - - protected String variantCodon; - protected String variantAA; - protected boolean changesAA; - protected String functionalClass; - protected String proteinCoordStr; - - public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON); - this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA); - this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA)); - this.functionalClass = 
refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS); - this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION); - } - - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - sb.append("variantCodon= ").append(variantCodon); - sb.append(", variantAA= ").append(variantAA); - sb.append(", changesAA= ").append(changesAA); - sb.append(", functionalClass= ").append(functionalClass); - sb.append(", proteinCoordStr= ").append(proteinCoordStr); - - return sb.toString(); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java index 298d8d6c8..306509d0c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.phasing; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -24,6 +26,12 @@ public class MergeAndMatchHaplotypes extends RodWalker { @Output protected VCFWriter vcfWriter = null; + @Input(fullName="pbt", shortName = "pbt", doc="Input VCF truth file", required=true) + public RodBinding pbtTrack; + + @Input(fullName="rbp", shortName = "rbp", doc="Input VCF truth file", required=true) + public RodBinding rbpTrack; + private Map pbtCache = new HashMap(); private Map rbpCache = new HashMap(); @@ -31,7 +39,7 @@ public class MergeAndMatchHaplotypes extends RodWalker { public void 
initialize() { ArrayList rodNames = new ArrayList(); - rodNames.add("pbt"); + rodNames.add(pbtTrack.getName()); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); @@ -44,8 +52,8 @@ public class MergeAndMatchHaplotypes extends RodWalker { @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (tracker != null) { - Collection pbts = tracker.getVariantContexts(ref, "pbt", null, ref.getLocus(), true, true); - Collection rbps = tracker.getVariantContexts(ref, "rbp", null, ref.getLocus(), true, true); + Collection pbts = tracker.getValues(pbtTrack, ref.getLocus()); + Collection rbps = tracker.getValues(rbpTrack, ref.getLocus()); VariantContext pbt = pbts.iterator().hasNext() ? pbts.iterator().next() : null; VariantContext rbp = rbps.iterator().hasNext() ? rbps.iterator().next() : null; @@ -91,7 +99,7 @@ public class MergeAndMatchHaplotypes extends RodWalker { } VariantContext newvc = new VariantContext(SOURCE_NAME, pbt.getChr(), pbt.getStart(), pbt.getStart(), pbt.getAlleles(), genotypes, pbt.getNegLog10PError(), pbt.getFilters(), pbt.getAttributes()); - vcfWriter.add(newvc, ref.getBase()); + vcfWriter.add(newvc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java index 5bd438605..809772c05 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsWalker.java @@ -25,11 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; 
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; @@ -46,7 +47,7 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr * Walks along all variant ROD loci, and merges consecutive sites if they segregate in all samples in the ROD. */ @Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) +@Requires(value = {DataSource.REFERENCE}) @By(DataSource.REFERENCE_ORDERED_DATA) public class MergeMNPsWalker extends RodWalker { @@ -58,12 +59,10 @@ public class MergeMNPsWalker extends RodWalker { @Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false) protected int maxGenomicDistanceForMNP = 1; - private LinkedList rodNames = null; + @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true) + public RodBinding variants; public void initialize() { - rodNames = new LinkedList(); - rodNames.add("variant"); - initializeVcfWriter(); } @@ -77,8 +76,8 @@ public class MergeMNPsWalker extends RodWalker { hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); + 
Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName())); + vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(variants.getName()).getGenotypeSamples()))); } public boolean generateExtendedEvents() { @@ -101,9 +100,7 @@ public class MergeMNPsWalker extends RodWalker { if (tracker == null) return null; - boolean requireStartHere = true; // only see each VariantContext once - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) + for (VariantContext vc : tracker.getValues(variants, context.getLocation())) writeVCF(vc); return 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index b0491a281..53cfaa3a9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -118,7 +118,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { innerWriter.close(); } - public void add(VariantContext vc, byte refBase) { + public void add(VariantContext vc) { if (useSingleSample != null) { // only want to output context for one sample Genotype sampGt = vc.getGenotype(useSingleSample); if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP! 
@@ -138,11 +138,11 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (curVcIsNotFiltered) { // still need to wait before can release vc logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(genomeLocParser, vc)); - vcfrWaitingToMerge = new VCFRecord(vc, refBase, false); + vcfrWaitingToMerge = new VCFRecord(vc, false); } else if (!emitOnlyMergedRecords) { // filtered records are never merged logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(genomeLocParser, vc)); - innerWriter.add(vc, refBase); + innerWriter.add(vc); } } else { // waiting to merge vcfrWaitingToMerge @@ -151,7 +151,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (!curVcIsNotFiltered) { if (!emitOnlyMergedRecords) { // filtered records are never merged logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(genomeLocParser, vc)); - filteredVcfrList.add(new VCFRecord(vc, refBase, false)); + filteredVcfrList.add(new VCFRecord(vc, false)); } } else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them: @@ -188,14 +188,14 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { addedAttribs.putAll(mergedVc.getAttributes()); mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs); - vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true); + vcfrWaitingToMerge = new VCFRecord(mergedVc, true); numMergedRecords++; } } if (!mergedRecords) { stopWaitingToMerge(); - vcfrWaitingToMerge = new VCFRecord(vc, refBase, false); + vcfrWaitingToMerge = new VCFRecord(vc, false); } logger.debug("Merged? 
= " + mergedRecords); } @@ -210,11 +210,11 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { } if (!emitOnlyMergedRecords || vcfrWaitingToMerge.resultedFromMerge) - innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase); + innerWriter.add(vcfrWaitingToMerge.vc); vcfrWaitingToMerge = null; for (VCFRecord vcfr : filteredVcfrList) - innerWriter.add(vcfr.vc, vcfr.refBase); + innerWriter.add(vcfr.vc); filteredVcfrList.clear(); } @@ -257,12 +257,10 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private static class VCFRecord { public VariantContext vc; - public byte refBase; public boolean resultedFromMerge; - public VCFRecord(VariantContext vc, byte refBase, boolean resultedFromMerge) { + public VCFRecord(VariantContext vc, boolean resultedFromMerge) { this.vc = vc; - this.refBase = refBase; this.resultedFromMerge = resultedFromMerge; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java index be15d4541..96d5c471f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesWalker.java @@ -24,13 +24,10 @@ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import 
org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; @@ -49,7 +46,7 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr * Walks along all variant ROD loci, and merges consecutive sites if some sample has segregating alt alleles in the ROD. */ @Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) +@Requires(value = {DataSource.REFERENCE}) @By(DataSource.REFERENCE_ORDERED_DATA) public class MergeSegregatingAlternateAllelesWalker extends RodWalker { @@ -81,12 +78,10 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker rodNames = null; + @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true) + public RodBinding variants; public void initialize() { - rodNames = new LinkedList(); - rodNames.add("variant"); - initializeVcfWriter(); } @@ -114,8 +109,8 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); + Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(variants.getName())); + vcMergerWriter.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(variants.getName()).getGenotypeSamples()))); } public boolean generateExtendedEvents() { @@ -138,9 +133,7 @@ public class MergeSegregatingAlternateAllelesWalker extends RodWalker { - @Argument(shortName="f", fullName="familyPattern", required=true, doc="Pattern for the family structure (usage: mom+dad=child)") - public String familyStr = null; - @Argument(shortName="nofilters", fullName="disableFilters", required=false, doc="Disable filters for sites where the phase can't be determined, 
where the parental origin of the alleles is ambiguous (i.e. everyone is heterozygous), or Mendelian violations") - public Boolean noFilters = false; + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.") + public ArrayList familySpecs = null; @Output protected VCFWriter vcfWriter = null; - private String SAMPLE_NAME_MOM; - private String SAMPLE_NAME_DAD; - private String SAMPLE_NAME_CHILD; - - private final String ROD_NAME = "variant"; - private final String AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME = "AmbiguousAlleleOrigin"; - private final String INSUFFICIENT_DATA_FILTER_NAME = "InsufficientInformation"; - private final String MENDELIAN_VIOLATION_FILTER_NAME = "MendelianViolation"; private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + private class Trio { + private String mother; + private String father; + private String child; + + public Trio(String mother, String father, String child) { + this.mother = mother; + this.father = father; + this.child = child; + } + + public Trio(String familySpec) { + String[] pieces = familySpec.split("[\\+\\=]"); + + this.mother = pieces[0]; + this.father = pieces[1]; + this.child = pieces[2]; + } + + public String getMother() { return mother; } + public String getFather() { return father; } + public String getChild() { return child; } + } + + private ArrayList trios = new ArrayList(); + + public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { + if (familySpecs != null) { + // Let's first go through the list and see if we were given any files. 
We'll add every entry in the file to our + // spec list set, and treat the entries as if they had been specified on the command line. + ArrayList specs = new ArrayList(); + for (String familySpec : familySpecs) { + File specFile = new File(familySpec); + + try { + XReadLines reader = new XReadLines(specFile); + + List lines = reader.readLines(); + for (String line : lines) { + specs.add(new Trio(line)); + } + } catch (FileNotFoundException e) { + specs.add(new Trio(familySpec)); // not a file, so must be a family spec + } + } + + return specs; + } + + return new ArrayList(); + } + /** * Parse the familial relationship specification, and initialize VCF writer */ public void initialize() { - String[] pieces = familyStr.split("[\\+\\=]"); - - SAMPLE_NAME_MOM = pieces[0]; - SAMPLE_NAME_DAD = pieces[1]; - SAMPLE_NAME_CHILD = pieces[2]; + trios = getFamilySpecsFromCommandLineInput(familySpecs); ArrayList rodNames = new ArrayList(); - rodNames.add(ROD_NAME); + rodNames.add(variantCollection.variants.getName()); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - if (vcfSamples.size() != 3) { - throw new UserException("File to phase by transmission contains more than three samples. This walker only" + - "accepts VCFs with three samples, so that the meaning of the applied filters is" + - "unambiguous."); - } - - if (!vcfSamples.contains(SAMPLE_NAME_MOM) || !vcfSamples.contains(SAMPLE_NAME_DAD) || !vcfSamples.contains(SAMPLE_NAME_CHILD)) { - throw new UserException("One or more of the samples specified in the familyPattern argument is not present" + - "in this file. 
Please supply a VCF file that contains only three samples: the" + - "mother, the father, and the child"); - } - - Set samples = new TreeSet(); - samples.add(SAMPLE_NAME_MOM); - samples.add(SAMPLE_NAME_DAD); - samples.add(SAMPLE_NAME_CHILD); - Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - - if (!noFilters) { - headerLines.add(new VCFFilterHeaderLine(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME, "The parental origin of each of the child's allele cannot be determined (ie everyone is heterozygous)")); - headerLines.add(new VCFFilterHeaderLine(INSUFFICIENT_DATA_FILTER_NAME, "The phase of the child's genotype cannot be determined (ie someone is a no-call)")); - headerLines.add(new VCFFilterHeaderLine(MENDELIAN_VIOLATION_FILTER_NAME, "No combination of the parents' alleles can yield the child's genotype (ie a possible Mendelian violation)")); - } - - headerLines.add(new VCFInfoHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); } private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { @@ -211,68 +233,54 @@ public class PhaseByTransmission extends RodWalker { return finalGenotypes; } - private VariantContext phaseTrioGenotypes(VariantContext vc) { - Genotype mom = vc.getGenotype(SAMPLE_NAME_MOM); - Genotype dad = vc.getGenotype(SAMPLE_NAME_DAD); - Genotype child = vc.getGenotype(SAMPLE_NAME_CHILD); - - Set filters = new HashSet(); - filters.addAll(vc.getFilters()); - - Map attributes = new HashMap(); - 
attributes.putAll(vc.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, 0.0); - + private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mom); - finalGenotypes.add(dad); + finalGenotypes.add(mother); + finalGenotypes.add(father); finalGenotypes.add(child); - if (!mom.isCalled() || !dad.isCalled() || !child.isCalled()) { - filters.add(INSUFFICIENT_DATA_FILTER_NAME); - } else { - ArrayList possibleMomGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), mom); - ArrayList possibleDadGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), dad); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), child); + if (mother.isCalled() && father.isCalled() && child.isCalled()) { + ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); + ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); + ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); double bestConfigurationLikelihood = 0.0; double bestPrior = 0.0; - Genotype bestMomGenotype = mom; - Genotype bestDadGenotype = dad; + Genotype bestMotherGenotype = mother; + Genotype bestFatherGenotype = father; Genotype bestChildGenotype = child; double norm = 0.0; - for (Genotype momGenotype : possibleMomGenotypes) { - for (Genotype dadGenotype : possibleDadGenotypes) { + for (Genotype motherGenotype : possibleMotherGenotypes) { + for (Genotype fatherGenotype : possibleFatherGenotypes) { for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(vc.getReference(), vc.getAlternateAllele(0), momGenotype, dadGenotype, childGenotype) ? 
MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(momGenotype, dadGenotype, childGenotype); + double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; + double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); norm += prior*configurationLikelihood; if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { bestConfigurationLikelihood = configurationLikelihood; bestPrior = prior; - bestMomGenotype = momGenotype; - bestDadGenotype = dadGenotype; + bestMotherGenotype = motherGenotype; + bestFatherGenotype = fatherGenotype; bestChildGenotype = childGenotype; } } } } - if (isMendelianViolation(vc.getReference(), vc.getAlternateAllele(0), bestMomGenotype, bestDadGenotype, bestChildGenotype)) { - filters.add(MENDELIAN_VIOLATION_FILTER_NAME); - } else if (bestMomGenotype.isHet() && bestDadGenotype.isHet() && bestChildGenotype.isHet()) { - filters.add(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME); - } else { - finalGenotypes = getPhasedGenotypes(bestMomGenotype, bestDadGenotype, bestChildGenotype); - + if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { + Map attributes = new HashMap(); + attributes.putAll(bestChildGenotype.getAttributes()); attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); + bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + + finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); } } - return new VariantContext(SOURCE_NAME, vc.getChr(), vc.getStart(), vc.getStart(), vc.getAlleles(), finalGenotypes, vc.getNegLog10PError(), noFilters ? 
vc.getFilters() : filters, attributes); + return finalGenotypes; } /** @@ -286,11 +294,29 @@ public class PhaseByTransmission extends RodWalker { @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (tracker != null) { - Collection vcs = tracker.getVariantContexts(ref, ROD_NAME, null, context.getLocation(), true, true); + VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); - for (VariantContext vc : vcs) { - vcfWriter.add(phaseTrioGenotypes(vc), ref.getBase()); + Map genotypeMap = vc.getGenotypes(); + + for (Trio trio : trios) { + Genotype mother = vc.getGenotype(trio.getMother()); + Genotype father = vc.getGenotype(trio.getFather()); + Genotype child = vc.getGenotype(trio.getChild()); + + ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + + Genotype phasedMother = trioGenotypes.get(0); + Genotype phasedFather = trioGenotypes.get(1); + Genotype phasedChild = trioGenotypes.get(2); + + genotypeMap.put(phasedMother.getSampleName(), phasedMother); + genotypeMap.put(phasedFather.getSampleName(), phasedFather); + genotypeMap.put(phasedChild.getSampleName(), phasedChild); } + + VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); + + vcfWriter.add(newvc); } return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 9702fd18c..17a6e20f1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -24,14 +24,15 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; 
import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.DisjointSet; @@ -51,30 +52,67 @@ import java.util.*; import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - /** * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * + *

+ * Performs physical phasing of SNP calls, based on sequencing reads. + *

+ * + *

Input

+ *

+ * VCF file of SNP calls, BAM file of sequence reads. + *

+ * + *

Output

+ *

+ * Phased VCF file. + *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadBackedPhasing
+ *      -R reference.fasta
+ *      -I reads.bam
+ *      --variant:vcf SNPs.vcf
+ *      -BTI variant
+ *      -BTIMR INTERSECTION
+ *      -o phased_SNPs.vcf
+ *      --phaseQualityThresh 20.0
+ * 
+ * + * @author Menachem Fromer + * @since July 2010 */ @Allows(value = {DataSource.READS, DataSource.REFERENCE}) -@Requires(value = {DataSource.READS, DataSource.REFERENCE}, referenceMetaData = @RMD(name = "variant", type = ReferenceOrderedDatum.class)) +@Requires(value = {DataSource.READS, DataSource.REFERENCE}) @By(DataSource.READS) -@ReadFilters({MappingQualityZeroReadFilter.class}) // Filter out all reads with zero mapping quality +@ReadFilters({MappingQualityZeroFilter.class}) public class ReadBackedPhasingWalker extends RodWalker { private static final boolean DEBUG = false; + /** + * The VCF file we are phasing variants from. + * + * All heterozygous variants found in this VCF file will be phased, where possible + */ + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc = "File to which variants should be written", required = true) protected VCFWriter writer = null; - @Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads; [default:20000]", required = false) + @Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false) protected Integer cacheWindow = 20000; - @Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm; [default:10]", required = false) + @Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false) protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes - @Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to 
output phasing; [default:10.0]", required = false) + @Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false) protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 @Hidden @@ -82,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker rodNames = null; - public static final String PQ_KEY = "PQ"; // In order to detect phase inconsistencies: @@ -108,10 +144,10 @@ public class ReadBackedPhasingWalker extends RodWalker(); - rodNames.add("variant"); - /* Since we cap each base quality (BQ) by its read's mapping quality (MQ) [in Read.updateBaseAndQuality()], then: if minBQ > minMQ, then we require that MQ be >= minBQ as well. @@ -175,8 +208,9 @@ public class ReadBackedPhasingWalker extends RodWalker rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - Set samples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples() : samplesToPhase); + String trackName = variantCollection.variants.getName(); + Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); + Set samples = new TreeSet(samplesToPhase == null ? 
rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); writer.writeHeader(new VCFHeader(hInfo, samples)); } @@ -207,9 +241,7 @@ public class ReadBackedPhasingWalker extends RodWalker unprocessedList = new LinkedList(); - boolean requireStartHere = true; // only see each VariantContext once - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { + for (VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) { if (samplesToPhase != null) vc = reduceVCToSamples(vc, samplesToPhase); if (ReadBackedPhasingWalker.processVariantInPhasing(vc)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java index 2851ace0d..c10eaa2da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java @@ -25,20 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; public class WriteVCF { public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) { - byte refBase; - if (!vc.isIndel()) { - Allele refAllele = vc.getReference(); - refBase = SNPallelePair.getSingleBase(refAllele); - } - else { - refBase = vc.getReferenceBaseForIndel(); - } - - writer.add(vc, refBase); + writer.add(vc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java index 2bdd4558f..29b649afe 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -1,16 +1,19 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; +import java.util.Collections; import java.util.List; /** @@ -22,6 +25,9 @@ public class CountIntervals extends RefWalker { @Output PrintStream out; + @Input(fullName="check", shortName = "check", doc="Any number of RODs", required=false) + public List> features = Collections.emptyList(); + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) int numOverlaps = 2; @@ -36,7 +42,7 @@ public class CountIntervals extends RefWalker { return null; } - List checkIntervals = tracker.getGATKFeatureMetaData("check",false); + List checkIntervals = tracker.getValues(features); return (long) checkIntervals.size(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java index 0d68c8493..09113704a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java 
@@ -11,7 +11,31 @@ import java.io.PrintStream; /** * Walks over the input data set, calculating the total number of covered loci for diagnostic purposes. + * + *

* Simplest example of a locus walker. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of loci traversed. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountLoci \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * */ public class CountLociWalker extends LocusWalker implements TreeReducible { @Output(doc="Write count to this file instead of STDOUT") diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java index df89efe6d..e770418c1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java @@ -39,8 +39,27 @@ import java.util.List; * query name order. Breaks counts down by total pairs and number * of paired reads. * + * + *

Input

+ *

+ * One or more bam files. + *

+ * + *

Output

+ *

+ * Number of pairs seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountPairs \
+ *   -o output.txt \
+ *   -I input.bam
+ * 
+ * * @author mhanna - * @version 0.1 */ public class CountPairsWalker extends ReadPairWalker { @Output diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java similarity index 62% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java index d1545f159..7c7d6417a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java @@ -25,7 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; +import java.util.Collections; +import java.util.List; + /** - * Prints out counts of the number of reference ordered data objects are - * each locus for debugging RefWalkers. + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One or more rod files. + *

+ * + *

Output

+ *

+ * Number of rods seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountRODsByRef \
+ *   -o output.txt \
+ *   --rod input.vcf
+ * 
+ * */ -public class CountRodByRefWalker extends RefWalker, Long>> { - @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) +public class CountRODsByRefWalker extends RefWalker, Long>> { + + /** + * One or more input rod files + */ + @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false) + public List> rods = Collections.emptyList(); + + @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; - @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false) public boolean showSkipped = false; - CountRodWalker crw = new CountRodWalker(); + CountRODsWalker crw = new CountRODsWalker(); public void initialize() { crw.verbose = verbose; crw.showSkipped = showSkipped; } - public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return crw.map(tracker, ref, context); } @@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker, Long> reduce(CountRodWalker.Datum point, Pair, Long> sum) { + public Pair, Long> reduce(CountRODsWalker.Datum point, Pair, Long> sum) { return crw.reduce(point, sum); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java rename to 
public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java index 8a03dea44..edbd5ff75 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * Prints out counts of the number of reference ordered data objects are - * each locus for debugging RodWalkers. + * Prints out counts of the number of reference ordered data objects encountered. + * + * + *

Input

+ *

+ * One or more rod files. + *

+ * + *

Output

+ *

+ * Number of rods seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountRODs \
+ *   -o output.txt \
+ *   --rod input.vcf
+ * 
+ * */ -public class CountRodWalker extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODsWalker extends RodWalker, Long>> implements TreeReducible, Long>> { @Output public PrintStream out; - @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false) + /** + * One or more input rod files + */ + @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false) + public List> rods = Collections.emptyList(); + + @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) public boolean verbose = false; - @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false) + @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false) public boolean showSkipped = false; @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java index 87c0409b9..9ce9c4eec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java @@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires; /** * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * + *

* Can also count the number of reads matching a given criterion using read filters (see the * --read-filter command line argument). Simplest example of a read-backed analysis. + * + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads seen. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CountReads \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ * */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadsWalker extends ReadWalker { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java new file mode 100644 index 000000000..933e24784 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.*; + +/** + * Summary test + * + *

Body test

+ */ +public class DocumentationTest extends RodWalker { + // the docs for the arguments are in the collection + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants. + * Other sets can be specified with the -knownName (--known_names) argument. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + + /** + * detailed documentation about the argument goes here. + */ + @Input(fullName="listofRodBinding", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false) + private List> listOfRodBinding = Collections.emptyList(); + + @Input(fullName="optionalRodBinding", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false) + private RodBinding concordanceTrack; + + @Input(fullName="optionalRodBindingWithoutDefault", shortName = "optionalRodBindingWithoutDefault", doc="Output variants that were also called in this Feature comparison track", required=false) + private RodBinding noDefaultOptionalRodBinding; + + @Input(fullName="optionalRodBindingWithoutDefaultNull", shortName = "shortTest", doc="Output variants that were also called in this Feature comparison track", required=false) + private RodBinding noDefaultOptionalRodBindingNull = null; + + @Input(fullName="featureArg", shortName = "featureArg", doc="A RodBinding of feature", required=false) + private RodBinding featureArg = null; + + @Output(doc="VCFWriter",required=true) + protected VCFWriter vcfWriter = null; + + @Advanced + @Argument(fullName="setString", shortName="sn", doc="Sample name to be included in the analysis. 
Can be specified multiple times.", required=false) + public Set sampleNames; + + @Argument(fullName="setStringInitialized", shortName="setStringInitialized", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set setStringInitialized = new HashSet(); + + @Argument(shortName="optionalArgWithMissinglessDefault", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false) + public ArrayList SELECT_EXPRESSIONS = new ArrayList(); + + @Argument(shortName="AAAAA", fullName = "AAAAA", doc="Should be the first argument", required=false) + public boolean FIRST_ARG = false; + + @Advanced + @Argument(fullName="booleanArg", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + private boolean EXCLUDE_NON_VARIANTS = false; + + @Advanced + @Argument(fullName="booleanArray", shortName="booleanArray", doc="x", required=false) + private boolean[] boolArray = null; + + @Argument(fullName="enumTest", shortName="enumTest", doc="Test enum", required=false) + private TestEnum TestEnumArg = TestEnum.ENUM2; + public enum TestEnum { + /** Docs for enum1 */ + ENUM1, + /** Docs for enum2 */ + ENUM2 + } + + @Hidden + @Argument(fullName="hiddenArg", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + private boolean KEEP_AF_SPECTRUM = false; + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; } + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + public void onTraversalDone(Integer result) { } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java index 170630b77..1c24f3879 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java @@ -1,12 +1,39 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; @@ -29,6 +56,9 @@ public class RodSystemValidationWalker extends RodWalker { // the divider to use in some of the text output private static final String DIVIDER = ","; + @Input(fullName="eval", shortName = "eval", doc="Input VCF eval file", required=true) + public List> eval; + @Output public PrintStream out; @@ -73,18 +103,17 @@ public class RodSystemValidationWalker extends RodWalker { @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { int ret = 0; - if (tracker != null && tracker.getAllRods().size() > 0) { + if (tracker != null && tracker.getNTracksWithBoundFeatures() > 0) { out.print(context.getLocation() + DIVIDER); - Collection features = tracker.getAllRods(); - for (GATKFeature feat : features) - out.print(feat.getName() + DIVIDER); + for (RODRecordList rod: tracker.getBoundRodTracks()) + out.print(rod.getName() + DIVIDER); out.println(";"); ret++; } // if the argument was set, check for equivalence if (allRecordsVariantContextEquivalent && tracker != null) { - Collection col = tracker.getAllVariantContexts(ref); + Collection col = tracker.getValues(eval); VariantContext con = 
null; for (VariantContext contextInList : col) if (con == null) con = contextInList; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java index e1e6c4b69..ca30d875b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java @@ -26,11 +26,13 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature; +import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -45,8 +47,11 @@ import java.util.Arrays; * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data * should be specified using the command-line argument '-B pileup,SAMPileup,'. 
*/ -@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=SAMPileupFeature.class)) +@Requires(value={DataSource.READS,DataSource.REFERENCE}) public class ValidatingPileupWalker extends LocusWalker implements TreeReducible { + @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true) + RodBinding pileup; + @Output private PrintStream out; @@ -130,17 +135,17 @@ public class ValidatingPileupWalker extends LocusWalker + * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating + * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative + * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, + * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical + * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. + * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score). + *

+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. + * + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration + * + *

Input

+ *

+ * The input read data whose base quality scores need to be assessed. + *

+ * A database of known polymorphic sites to skip over. + *

+ * + *

Output

+ *

+ * A recalibration table file in CSV format that is used by the TableRecalibration walker. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * + * The first 20 lines of such a file are shown below. + * * The file begins with a series of comment lines describing: + * ** The number of counted loci + * ** The number of counted bases + * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. + *

+ * # Counted Sites    19451059
+ * # Counted Bases    56582018
+ * # Skipped Sites    82666
+ * # Fraction Skipped 1 / 235 bp
+ * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+ * SRR006446,11,65,CA,9,1,10
+ * SRR006446,11,48,TA,10,0,40
+ * SRR006446,11,67,AA,27,0,40
+ * SRR006446,11,61,GA,11,1,10
+ * SRR006446,12,34,CA,47,1,17
+ * SRR006446,12,30,GA,52,1,17
+ * SRR006446,12,36,AA,352,1,25
+ * SRR006446,12,17,TA,182,11,12
+ * SRR006446,11,48,TG,2,0,40
+ * SRR006446,11,67,AG,1,0,40
+ * SRR006446,12,34,CG,9,0,40
+ * SRR006446,12,30,GG,43,0,40
+ * ERR001876,4,31,AG,1,0,40
+ * ERR001876,4,31,AT,2,2,1
+ * ERR001876,4,31,CA,1,0,40
+ * 
+ *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ *   -knownSites another/optional/setOfSitesToMask.vcf \
+ *   -I my_reads.bam \
+ *   -T CountCovariates \
+ *   -cov ReadGroupCovariate \
+ *   -cov QualityScoreCovariate \
+ *   -cov CycleCovariate \
+ *   -cov DinucCovariate \
+ *   -recalFile my_reads.recal_data.csv
+ * 
* - * @author rpoplin - * @since Nov 3, 2009 - * @help.summary First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). */ @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {MappingQualityZeroReadFilter.class, MappingQualityUnavailableReadFilter.class} ) // Filter out all reads with zero or unavailable mapping quality +@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { @@ -94,18 +150,32 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); + + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ + @Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file") @Gather(CountCovariatesGatherer.class) public PrintStream RECAL_FILE; @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) private boolean LIST_ONLY = false; + + /** + * See the -list argument to view available covariates. 
+ */ @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) private String[] COVARIATES = null; @Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false) @@ -116,6 +186,10 @@ public class CountCovariatesWalker extends LocusWalker requestedCovariates = new ArrayList(); // A list to hold the covariate objects that were requested - private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) - private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) + private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) + private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) public static class CountedData { private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file @@ -136,7 +210,7 @@ public class CountCovariatesWalker extends LocusWalker covClass : covariateClasses ) { - out.println( covClass.getSimpleName() ); + logger.info( covClass.getSimpleName() ); } - out.println(); + logger.info(""); System.exit( 0 ); // Early exit here because user requested it } // Warn the user if no dbSNP file or other variant mask was specified - boolean foundDBSNP = false; - for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) { - if( rod != null ) { - if( rod.getType().equals(DbSNPCodec.class) || - 
rod.getType().equals(VCFCodec.class) || - rod.getType().equals(VCF3Codec.class) || - rod.getType().equals(BEDCodec.class) ) { - foundDBSNP = true; - break; - } - } - } - if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) { - throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation."); + if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) { + throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."); } // Initialize the requested covariates by parsing the -cov argument @@ -266,12 +328,6 @@ public class CountCovariatesWalker extends LocusWalker flag which governs how the recalibrator handles the + * reads which have had the reference inserted because of color space inconsistencies. + */ @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; + + /** + * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles + * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in + * their color space tag can not be recalibrated. + */ @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. 
Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false) public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 0277fda0d..174e810c2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -29,10 +29,8 @@ import net.sf.samtools.*; import net.sf.samtools.util.SequenceUtil; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; @@ -54,18 +52,40 @@ import java.util.ResourceBundle; import java.util.regex.Pattern; /** - * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. - - * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) - * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. - * This walker then outputs a new bam file with these updated (recalibrated) reads. 
+ * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. * - * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. - * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. + *

+ * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each + * base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, + * cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score + * and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads. + * + *

+ * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration + * + *

Input

+ *

+ * The input read data whose base quality scores need to be recalibrated. + *

+ * The recalibration table file in CSV format that was generated by the CountCovariates walker. + *

+ * + *

Output

+ *

+ * A bam file in which the quality scores in each read have been recalibrated. + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -I my_reads.bam \
+ *   -T TableRecalibration \
+ *   -o my_reads.recal.bam \
+ *   -recalFile my_reads.recal_data.csv
+ * 
* - * @author rpoplin - * @since Nov 3, 2009 - * @help.summary Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @@ -80,25 +100,54 @@ public class TableRecalibrationWalker extends ReadWalker flag that instructs TableRecalibration to not modify + * quality scores less than but rather just write them out unmodified in the recalibrated BAM file. This is useful + * because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself, + * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, + * your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases + * are unmodified during recalibration, so they don't get inappropriately evaluated. + */ + @Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) private int PRESERVE_QSCORES_LESS_THAN = 5; - @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1") + + /** + * By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical + * quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). 
TableRecalibration accepts a --smoothing / -sm + * argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example, + * --smoothing 15 for a large amount of smoothing. + */ + @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") private int SMOOTHING = 1; - @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50") + + /** + * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation + * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later. + */ + @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores") private int MAX_QUALITY_SCORE = 50; + + /** + * By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun + * the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag. + */ @Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read") private boolean DO_NOT_WRITE_OQ = false; @@ -155,17 +204,6 @@ public class TableRecalibrationWalker extends ReadWalker + * Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs + * and Indels given a secondary (validation) data source. The data sources are BAM or VCF + * files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF + * to validate calls on a BAM). + *

+ * + *

+ * The simplest scenario is when you have a VCF of hand annotated SNPs and Indels, and you + * want to know how well a particular technology performs calling these SNPs. With a + * dataset (BAM file) generated by the technology in test, and the hand annotated VCF, you + * can run GenotypeAndValidate to assess the accuracy of the calls with the new technology's + * dataset. + *

+ * + *

+ * Another option is to validate the calls on a VCF file, using a deep coverage BAM file + * that you trust the calls on. The GenotypeAndValidate walker will make calls using the + * reads in the BAM file and take them as truth, then compare to the calls in the VCF file + * and produce a truth table. + *

+ * + * + *

Input

+ *

+ * A BAM file to make calls on and a VCF file to use as the truth validation dataset. + * + * You also have the option to invert the roles of the files using the command line options listed below. + *

+ * + *

Output

+ *

+ * GenotypeAndValidate has two outputs: the truth table and the optional VCF file. The truth table is a + * 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true + * positive or a false positive). The table should look like this: + *

+ *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
ALTREFPredictive Value
called altTrue Positive (TP)False Positive (FP)Positive PV
called refFalse Negative (FN)True Negative (TN)Negative PV
+ *
+ * + *

+ * The positive predictive value (PPV) is the proportion of subjects with positive test results + * who are correctly diagnosed. + *

+ *

+ * The negative predictive value (NPV) is the proportion of subjects with a negative test result + * who are correctly diagnosed. + *

+ *

+ * The VCF file will contain only the variants that were called or not called, excluding the ones that + * were uncovered or didn't pass the filters. This file is useful if you are trying to compare + * the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to + * apples). + *

+ * + *

+ * Here is an example of an annotated VCF file (info field clipped for clarity) + * + *

+ * #CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+ * 1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+ * 1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+ * 13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+ * 1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+ * 
+ * + *

+ * + *

Additional Details

+ *
    + *
  • + * You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file. + * This speeds up the process a lot. + *
  • + *
  • + * The total number of visited bases may be greater than the number of variants in the original + * VCF file because of extended indels, as they trigger one call per new insertion or deletion. + * (e.g. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF). + *
  • + *
+ * + *

Examples

+ *
    + *
  1. + * Genotypes BAM file from new technology using the VCF as a truth dataset: + *
  2. + * + *
    + *  java
    + *      -jar /GenomeAnalysisTK.jar
    + *      -T  GenotypeAndValidate
    + *      -R human_g1k_v37.fasta
    + *      -I myNewTechReads.bam
    + *      -alleles handAnnotatedVCF.vcf
    + *      -BTI alleles
    + * 
    + * + *
  3. + * Using a BAM file as the truth dataset: + *
  4. + * + *
    + *  java
    + *      -jar /GenomeAnalysisTK.jar
    + *      -T  GenotypeAndValidate
    + *      -R human_g1k_v37.fasta
    + *      -I myTruthDataset.bam
    + *      -alleles callsToValidate.vcf
    + *      -BTI alleles
    + *      -bt
    + *      -o gav.vcf
    + * 
    + * + * @author Mauricio Carneiro + * @since ${DATE} + */ + +@Requires(value={DataSource.READS, DataSource.REFERENCE}) +@Allows(value={DataSource.READS, DataSource.REFERENCE}) + +@By(DataSource.REFERENCE) +@Reference(window=@Window(start=-200,stop=200)) + + +public class GenotypeAndValidateWalker extends RodWalker implements TreeReducible { + + /** + * The optional output file that will have all the variants used in the Genotype and Validation assay. + */ + @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false) + protected VCFWriter vcfWriter = null; + + /** + * The callset to be used as truth (default) or validated (if BAM file is set to truth). + */ + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true) + public RodBinding alleles; + + /** + * Makes the Unified Genotyper calls to the BAM file the truth dataset and validates the alleles ROD binding callset. + */ + @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false) + private boolean bamIsTruth = false; + + /** + * The minimum base quality score necessary for a base to be considered when calling a genotype. This argument is passed to the Unified Genotyper. + */ + @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false) + private int mbq = -1; + + /** + * The maximum deletion fraction allowed in a site for calling a genotype. This argument is passed to the Unified Genotyper. + * When left negative (the default), initialize() sets the fraction to 1.0. 
+ */ + @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false) + private double deletions = -1; + + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. This argument is passed to the Unified Genotyper. + */ + @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false) + private double callConf = -1; + + /** + * The minimum phred-scaled Qscore threshold to emit low confidence calls. This argument is passed to the Unified Genotyper. + */ + @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false) + private double emitConf = -1; + + /** + * Only validate sites that have at least the given read depth; shallower sites are counted as not covered. + */ + @Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false) + private int minDepth = -1; + + /** + * If your VCF or BAM file has more than one sample and you only want to validate one, use this parameter to choose it. + */ + @Hidden + @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false) + private String sample = ""; + + /** + * Print out discordant sites to standard out. 
+ */ + @Hidden + @Argument(fullName ="print_interesting_sites", shortName ="print_interesting", doc="Print out interesting sites to standard out", required=false) + private boolean printInterestingSites; + + private UnifiedGenotyperEngine snpEngine; + private UnifiedGenotyperEngine indelEngine; + + /** + * Truth-table tallies. Field names read n{Truth}Called{Call}: e.g. nRefCalledAlt counts sites whose truth is REF but where ALT was called. + * nNotConfidentCalls and nUncovered count the sites that were excluded from the table. + */ + public static class CountedData { + private long nAltCalledAlt = 0L; + private long nAltCalledRef = 0L; + private long nRefCalledAlt = 0L; + private long nRefCalledRef = 0L; + private long nNotConfidentCalls = 0L; + private long nUncovered = 0L; + + /** + * Adds the values of other to this, in place (nothing is returned) + * @param other the other object + */ + public void add(CountedData other) { + nAltCalledAlt += other.nAltCalledAlt; + nAltCalledRef += other.nAltCalledRef; + nRefCalledAlt += other.nRefCalledAlt; + nRefCalledRef += other.nRefCalledRef; + nUncovered += other.nUncovered; + nNotConfidentCalls += other.nNotConfidentCalls; + } + } + + + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Writes the VCF header (only when an output writer was supplied) and builds the SNP and indel genotyping engines. 
+ */ + public void initialize() { + + // Initialize VCF header + if (vcfWriter != null) { + Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName()); + Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger); + headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + } + + // Filling in SNP calling arguments for UG + UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); + uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; + uac.alleles = alleles; + + // TODO -- if we change this tool to actually validate against the called allele, then this if 
statement is needed; + // TODO -- for now, though, we need to be able to validate the right allele (because we only test isVariant below) [EB] + //if (!bamIsTruth) + uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + + // Negative values (the defaults) mean "not set": the UG default is kept, except for the deletion fraction which falls back to 1.0 + if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq; + if (deletions >= 0) + uac.MAX_DELETION_FRACTION = deletions; + else + uac.MAX_DELETION_FRACTION = 1.0; + if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf; + if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf; + + // NOTE(review): both engines receive the same uac instance, with GLmodel reassigned in between -- confirm the engine does not read GLmodel from it after construction + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // Adding the INDEL calling arguments for UG + uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL; + indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac); + + // make sure we have callConf set to the threshold set by the UAC so we can use it later. + callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Evaluates the site under the alleles binding and returns a CountedData with at most one counter set to 1. + * A null tracker, no record in the alleles binding, or a site inside an extended indel yields an all-zero result. 
+ */ + public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + final CountedData counter = new CountedData(); + + // For some reason RodWalkers get map calls with null trackers + if( tracker == null ) + return counter; + + VariantContext vcComp = tracker.getFirstValue(alleles); + if( vcComp == null ) + return counter; + + //todo - not sure I want this, may be misleading to filter extended indel events. 
+ if (isInsideExtendedIndel(vcComp, ref)) + return counter; + + // Do not operate on variants that are not covered to the optional minimum depth + if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { + counter.nUncovered = 1L; + return counter; + } + + VariantCallContext call; + if ( vcComp.isSNP() ) { + call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + } else if ( vcComp.isIndel() ) { + call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + } else if ( bamIsTruth ) { + // assume it's a SNP if no variation is present; this is necessary so that we can test supposed monomorphic sites against the truth bam + call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context); + } else { + logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles()); + return counter; + } + + + // NOTE(review): call is dereferenced below without a null check -- confirm the engine cannot return null for these sites + boolean writeVariant = true; + + if (bamIsTruth) { + if (call.confidentlyCalled) { + // If truth is a confident ALT call + if (call.isVariant()) { + if (vcComp.isVariant()) + counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same? 
+ else { + counter.nAltCalledRef = 1L; + if ( printInterestingSites ) + System.out.println("Truth=ALT Call=REF at " + call.getChr() + ":" + call.getStart()); + } + } + // If truth is a confident REF call + else { + if (vcComp.isVariant()) { + counter.nRefCalledAlt = 1L; + if ( printInterestingSites ) + System.out.println("Truth=REF Call=ALT at " + call.getChr() + ":" + call.getStart()); + } else + counter.nRefCalledRef = 1L; + } + } + else { + counter.nNotConfidentCalls = 1L; + if ( printInterestingSites ) + System.out.println("Truth is not confident at " + call.getChr() + ":" + call.getStart()); + writeVariant = false; + } + } + else { + if (!vcComp.hasAttribute("GV")) + throw new UserException.BadInput("Variant has no GV annotation in the INFO field. 
" + vcComp.getChr() + ":" + vcComp.getStart()); + + + + // The VCF is the truth here: GV equal to "T" marks a true variant, any other GV value is treated as a reference site + if (call.isCalledAlt(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltCalledAlt = 1L; + else { + counter.nRefCalledAlt = 1L; + if ( printInterestingSites ) + System.out.println("Truth=REF Call=ALT at " + call.getChr() + ":" + call.getStart()); + } + } + else if (call.isCalledRef(callConf)) { + if (vcComp.getAttribute("GV").equals("T")) { + counter.nAltCalledRef = 1L; + if ( printInterestingSites ) + System.out.println("Truth=ALT Call=REF at " + call.getChr() + ":" + call.getStart()); + } else + counter.nRefCalledRef = 1L; + } + else { + // NOTE(review): in this branch it is the call (not the VCF truth) that is neither confidently ALT nor REF; the message text is left unchanged + counter.nNotConfidentCalls = 1L; + if ( printInterestingSites ) + System.out.println("Truth is not confident at " + call.getChr() + ":" + call.getStart()); + writeVariant = false; + } + } + + // Emit the record, tagging it with callStatus unless it already carries that attribute + if (vcfWriter != null && writeVariant) { + if (!vcComp.hasAttribute("callStatus")) { + MutableVariantContext mvc = new MutableVariantContext(vcComp); + mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? 
"ALT" : "REF" ); + vcfWriter.add(mvc); + } + else + vcfWriter.add(vcComp); + } + return counter; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + public CountedData reduceInit() { + return new CountedData(); + } + + // Merges sum1 into sum2 in place and returns sum2 + public CountedData treeReduce( final CountedData sum1, final CountedData sum2) { + sum2.add(sum1); + return sum2; + } + + // Folds one map() result into the running total in place and returns the total + public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) { + reduceSum.add(mapValue); + return reduceSum; + } + + /** + * Logs the truth table together with PPV, NPV, sensitivity and specificity (as percentages) and the not-confident / not-covered counts. + * NOTE(review): only specificity is guarded against an empty denominator; ppv, npv and sensitivity evaluate to NaN when theirs is zero. + */ + public void onTraversalDone( CountedData reduceSum ) { + double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt)); + double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef)); + double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef)); + double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 
100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100; + logger.info(String.format("Resulting Truth Table Output\n\n" + + "---------------------------------------------------\n" + + "\t\t|\tALT\t|\tREF\t\n" + + "---------------------------------------------------\n" + + "called alt\t|\t%d\t|\t%d\n" + + "called ref\t|\t%d\t|\t%d\n" + + "---------------------------------------------------\n" + + "positive predictive value: %f%%\n" + + "negative predictive value: %f%%\n" + + "---------------------------------------------------\n" + + "sensitivity: %f%%\n" + + "specificity: %f%%\n" + + "---------------------------------------------------\n" + + "not confident: %d\n" + + "not covered: %d\n" + + "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index cb03d4c61..48cba6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -8,13 +8,14 @@ import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.BaseUtils; @@ -29,19 +30,93 @@ import java.util.LinkedList; import java.util.List; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/13/11 - * Time: 2:12 PM - * To change this template use File | Settings | File Templates. + * Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation + * + *

    + * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe + * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within + * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies + * reasons why the site may fail validation (nearby variation, for example). + *

    + * + *

    Input

    + *

    + * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an + * interval list defining the size of the amplicons around the sites to be validated + *

    + * + *

    Output

    + *

    + * Output is a FASTA-formatted file with some modifications at probe sites. For instance: + *

    + * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
    + * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
    + * >20:792122 Valid 20_792122
    + * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
    + * >20:994145 Valid 20_994145
    + * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
    + * >20:1074230 SITE_IS_FILTERED=1, 20_1074230
    + * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
    + * >20:1084330 DELETION=1, 20_1084330
    + * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
    + *
    + * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: + *
    + * Valid                     // amplicon is valid
    + * SITE_IS_FILTERED=1        // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
    + * VARIANT_TOO_NEAR_PROBE=1  // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
    + * MULTIPLE_PROBES=1,        // multiple variants to be validated found inside the same amplicon
    + * DELETION=6,INSERTION=5,   // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
    + * DELETION=1,               // deletion found inside the amplicon region, could shift mass-spec peak
    + * START_TOO_CLOSE,          // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer
    + * END_TOO_CLOSE,            // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
    + * NO_VARIANTS_FOUND,        // no variants found within the amplicon region
    + * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or following, or a deletion that spans the site itself)
    + * 

    + * + *

    Examples

    + *
    + *    java
    + *      -jar GenomeAnalysisTK.jar
    + *      -T ValidationAmplicons
    + *      -R /humgen/1kg/reference/human_g1k_v37.fasta
    + *      -BTI ProbeIntervals
    + *      -ProbeIntervals:table interval_table.table
    + *      -ValidateAlleles:vcf sites_to_validate.vcf
    + *      -MaskAlleles:vcf mask_sites.vcf
    + *      --virtualPrimerSize 30
    + *      -o probes.fasta
    + * 
    + * + * @author chartl + * @since July 2011 */ -@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class), -@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)}) +@Requires(value={DataSource.REFERENCE}) public class ValidationAmplicons extends RodWalker { + /** + * A Table-formatted file listing amplicon contig, start, stop, and a name for the amplicon (or probe) + */ + @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+ + "intervals surrounding the probe sites amplicons should be designed for", required=true) + RodBinding probeIntervals; + /** + * A VCF file containing the bi-allelic sites for validation. Filtered records will prompt a warning, and will be flagged as filtered in the output fastq. + */ + @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true) + RodBinding validateAlleles; + /** + * A VCF file containing variants to be masked. A mask variant overlapping a validation site will be ignored at the validation site. + */ + @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true) + RodBinding maskAlleles; @Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false) boolean lowerCaseSNPs = false; + /** + * BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased. + * This changes the size of the k-mer used for alignment. 
+ */ @Argument(doc="Size of the virtual primer to use for lower-casing regions with low specificity",fullName="virtualPrimerSize",required=false) int virtualPrimerSize = 20; @@ -99,9 +174,10 @@ public class ValidationAmplicons extends RodWalker { } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; } + if ( tracker == null || ! tracker.hasValues(probeIntervals)) { return null; } - GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation(); + TableFeature feature = tracker.getFirstValue(probeIntervals); + GenomeLoc interval = feature.getLocation(); //logger.debug(interval); if ( prevInterval == null || ! interval.equals(prevInterval) ) { // we're in a new interval, we should: @@ -129,16 +205,16 @@ public class ValidationAmplicons extends RodWalker { rawSequence = new StringBuilder(); sequenceInvalid = false; invReason = new LinkedList(); - logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues())); - probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1); + logger.debug(Utils.join("\t",feature.getAllValues())); + probeName = feature.getValue(1); indelCounter = 0; } // step 3 (or 1 if not new): // build up the sequence - VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus()); - VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus()); + VariantContext mask = tracker.getFirstValue(maskAlleles, ref.getLocus()); + VariantContext validate = tracker.getFirstValue(validateAlleles,ref.getLocus()); if ( mask == null && validate == null ) { if ( indelCounter > 0 ) { @@ -184,17 +260,17 @@ public class ValidationAmplicons extends RodWalker { } else /* (mask != null && validate == null ) */ { if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! 
filterMonomorphic || ! mask.isMonomorphic() )) { logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); - logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles()))); + logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles()))); sequenceInvalid = true; - invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION"); + invReason.add(mask.isSimpleInsertion() ? "INSERTION" : "DELETION"); // note: indelCounter could be > 0 (could have small deletion within larger one). This always selects // the larger event. - int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart(); + int indelCounterNew = mask.isSimpleInsertion() ? 2 : mask.getEnd()-mask.getStart(); if ( indelCounterNew > indelCounter ) { indelCounter = indelCounterNew; } //sequence.append((char) ref.getBase()); - //sequence.append(mask.isInsertion() ? 'I' : 'D'); + //sequence.append(mask.isSimpleInsertion() ? 
'I' : 'D'); sequence.append("N"); indelCounter--; rawSequence.append(Character.toUpperCase((char) ref.getBase())); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index fe3173506..28f4f2a56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -3,13 +3,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.Reference; @@ -17,17 +15,18 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator; +import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.JexlExpression; import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier; import 
org.broadinstitute.sting.gatk.walkers.varianteval.util.*; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -39,15 +38,82 @@ import java.util.*; /** * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more) + * + *

    + * Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of + * raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls + * to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics + * by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid + * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in + * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation + * and stratification modules. + * + *

    Input

    + *

    + * One or more variant sets to evaluate plus any number of comparison sets. + *

    + * + *

    Output

    + *

    + * Evaluation tables detailing the results of the eval modules which were applied. + * For example: + *

    + * output.eval.gatkreport:
    + * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
    + * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
    + * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
    + * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
    + * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
    + * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
    + * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
    + * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
    + * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
    + * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
    + * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
    + * ...
    + * 
    + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantEval \
    + *   -o output.eval.gatkreport \
    + *   --eval:set1 set1.vcf \
    + *   --eval:set2 set2.vcf \
    + *   [--comp comp.vcf]
    + * 
    + * */ @Reference(window=@Window(start=-50, stop=50)) public class VariantEvalWalker extends RodWalker implements TreeReducible { - // Output arguments + @Output protected PrintStream out; + /** + * The variant file(s) to evaluate. + */ + @Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true) + public List> evals; + + /** + * The variant file(s) to compare against. + */ + @Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false) + public List> compsProvided = Collections.emptyList(); + private List> comps = new ArrayList>(); + + /** + * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants. + * Other sets can be specified with the -knownName (--known_names) argument. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + // Help arguments - @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit") + @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false; // Partitioning the data arguments @@ -60,24 +126,27 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false) protected Set SAMPLE_EXPRESSIONS; + /** + * List of rod tracks to be used for specifying "known" variants other than dbSNP. 
+ */ @Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false) - protected String[] KNOWN_NAMES = {DbSNPHelper.STANDARD_DBSNP_TRACK_NAME}; + protected HashSet KNOWN_NAMES = new HashSet(); + List> knowns = new ArrayList>(); // Stratification arguments @Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false) protected String[] STRATIFICATIONS_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)") + @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)", required=false) protected Boolean NO_STANDARD_STRATIFICATIONS = false; - @Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false) - protected Set typesToUse = null; - - // Evaluator arguments + /** + * See the -list argument to view available modules. 
+ */ @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) protected String[] MODULES_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)") + @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false) protected Boolean NO_STANDARD_MODULES = false; // Other arguments @@ -87,23 +156,23 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) protected double MIN_PHASE_QUALITY = 10.0; - @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + /** + * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined. 
+ */ + @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false) protected String FAMILY_STRUCTURE; @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; - @Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false) - private String TRANCHE_FILENAME = null; - @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; + @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false) + private boolean requireStrictAlleleMatch = false; + // Variables private Set jexlExpressions = new TreeSet(); - private Set compNames = new TreeSet(); - private Set knownNames = new TreeSet(); - private Set evalNames = new TreeSet(); private Set sampleNamesForEvaluation = new TreeSet(); private Set sampleNamesForStratification = new TreeSet(); @@ -115,6 +184,10 @@ public class VariantEvalWalker extends RodWalker implements Tr // The set of all possible evaluation contexts private HashMap evaluationContexts = null; + // important stratifications + private boolean byFilterIsEnabled = false; + private boolean perSampleIsEnabled = false; + // Output report private GATKReport report = null; @@ -134,29 +207,27 @@ public class VariantEvalWalker extends RodWalker implements Tr // Just list the modules, and exit quickly. if (LIST) { variantEvalUtils.listModulesAndExit(); } - // Categorize each rod as an eval or a comp rod. 
- for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { - if ( d.getName().startsWith("eval") ) { - evalNames.add(d.getName()); - } else if ( d.getName().startsWith("comp") || d.getName().startsWith(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - compNames.add(d.getName()); - } else { - logger.info(String.format("Not evaluating ROD binding '%s' because the name did not start with %s, comp, or eval", d.getName(), Utils.join(", ", KNOWN_NAMES))); - } - } - - // Barf if we don't have any eval tracks. - if (evalNames.size() == 0) { - throw new UserException("No evaluation tracks were specified. Please bind one or more callsets to evaluate using the -B argument with a trackname that starts with the word 'eval'."); + // maintain the full list of comps + comps.addAll(compsProvided); + if ( dbsnp.dbsnp.isBound() ) { + comps.add(dbsnp.dbsnp); + knowns.add(dbsnp.dbsnp); } // Add a dummy comp track if none exists - if (compNames.size() == 0) { - compNames.add("none"); + if ( comps.size() == 0 ) + comps.add(new RodBinding(VariantContext.class, "none", "UNBOUND", "", new Tags())); + + // Set up set of additional knowns + for ( RodBinding compRod : comps ) { + if ( KNOWN_NAMES.contains(compRod.getName()) ) + knowns.add(compRod); } - // Set up set of known names - knownNames.addAll(Arrays.asList(KNOWN_NAMES)); + // Collect the eval rod names + Set evalNames = new TreeSet(); + for ( RodBinding evalRod : evals ) + evalNames.add(evalRod.getName()); // Now that we have all the rods categorized, determine the sample list from the eval rods. 
Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames); @@ -177,19 +248,15 @@ public class VariantEvalWalker extends RodWalker implements Tr jexlExpressions.add(sjexl); } - // Add select expressions for anything in the tranches file - if ( TRANCHE_FILENAME != null ) { - // we are going to build a few select names automatically from the tranches file - for ( Tranche t : Tranche.readTranches(new File(TRANCHE_FILENAME)) ) { - logger.info("Adding select for all variant above the pCut of : " + t); - SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod)); - SELECT_NAMES.add(String.format("TS-%.2f", t.ts)); - } - } - // Initialize the set of stratifications and evaluations to use stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE); Set> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE); + for ( VariantStratifier vs : getStratificationObjects() ) { + if ( vs.getClass().getSimpleName().equals("Filter") ) + byFilterIsEnabled = true; + else if ( vs.getClass().getSimpleName().equals("Sample") ) + perSampleIsEnabled = true; + } // Initialize the evaluation contexts evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null); @@ -221,45 +288,67 @@ public class VariantEvalWalker extends RodWalker implements Tr if (tracker != null) { String aastr = (ancestralAlignments == null) ? 
null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); - // track sample vc - HashMap> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null); + // --------- track --------- sample - VariantContexts - + HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled); + HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false); - for ( String compName : compNames ) { - VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null; + // for each eval track + for ( final RodBinding evalRod : evals ) { + final HashMap> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap>(0); - for ( String evalName : evalNames ) { - for ( String sampleName : sampleNamesForStratification ) { - VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null; + // for each sample stratifier + for ( final String sampleName : sampleNamesForStratification ) { + Set evalSetBySample = evalSet.get(sampleName); + if ( evalSetBySample == null ) { + evalSetBySample = new HashSet(1); + evalSetBySample.add(null); + } - if ( typesToUse != null ) { - if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null; - if ( comp != null && ! 
typesToUse.contains(comp.getType()) ) comp = null; -// if ( eval != null ) logger.info("Keeping " + eval); - } - - if (eval != null && aastr != null) { + // for each eval in the track + for ( VariantContext eval : evalSetBySample ) { + // deal with ancestral alleles if requested + if ( eval != null && aastr != null ) { HashMap newAts = new HashMap(eval.getAttributes()); newAts.put("ANCESTRALALLELE", aastr); - eval = VariantContext.modifyAttributes(eval, newAts); } - HashMap> stateMap = new HashMap>(); - for ( VariantStratifier vs : stratificationObjects ) { - ArrayList states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName); - stateMap.put(vs, states); - } + // for each comp track + for ( final RodBinding compRod : comps ) { + // no sample stratification for comps + final HashMap> compSetHash = compVCs.get(compRod); + final Set compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet(0) : compVCs.get(compRod).values().iterator().next(); - ArrayList stateKeys = new ArrayList(); - variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); + // find the comp + final VariantContext comp = findMatchingComp(eval, compSet); - HashSet stateKeysHash = new HashSet(stateKeys); + HashMap> stateMap = new HashMap>(); + for ( VariantStratifier vs : stratificationObjects ) { + List states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); + stateMap.put(vs, states); + } - for ( StateKey stateKey : stateKeysHash ) { - NewEvaluationContext nec = evaluationContexts.get(stateKey); + ArrayList stateKeys = new ArrayList(); + variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); - synchronized (nec) { - nec.apply(tracker, ref, context, comp, eval); + HashSet stateKeysHash = new HashSet(stateKeys); + + for ( StateKey stateKey : stateKeysHash ) { + NewEvaluationContext nec = evaluationContexts.get(stateKey); + + // eval against the comp + synchronized (nec) { + 
nec.apply(tracker, ref, context, comp, eval); + } + + // eval=null against all comps of different type + for ( VariantContext otherComp : compSet ) { + if ( otherComp != comp ) { + synchronized (nec) { + nec.apply(tracker, ref, context, otherComp, null); + } + } + } } } } @@ -270,6 +359,38 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } + private VariantContext findMatchingComp(final VariantContext eval, final Set comps) { + // if no comps, return null + if ( comps == null || comps.isEmpty() ) + return null; + + // if no eval, return any comp + if ( eval == null ) + return comps.iterator().next(); + + // find all of the matching comps + List matchingComps = new ArrayList(comps.size()); + for ( VariantContext comp : comps ) { + if ( comp.getType() == eval.getType() ) + matchingComps.add(comp); + } + + // if no matching comp, return null + if ( matchingComps.size() == 0 ) + return null; + + // find the comp which matches both the reference allele and alternate allele from eval + Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0); + for ( VariantContext comp : matchingComps ) { + Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0); + if ( (altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())) ) + return comp; + } + + // if none match, just return the first one unless we require a strict match + return (requireStrictAlleleMatch ? 
null : matchingComps.get(0)); + } + public Integer treeReduce(Integer lhs, Integer rhs) { return null; } @Override @@ -395,15 +516,15 @@ public class VariantEvalWalker extends RodWalker implements Tr public static String getAllSampleName() { return ALL_SAMPLE_NAME; } - public Set getKnownNames() { return knownNames; } + public List> getKnowns() { return knowns; } - public Set getEvalNames() { return evalNames; } + public List> getEvals() { return evals; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } - public Set getCompNames() { return compNames; } + public List> getComps() { return comps; } public Set getJexlExpressions() { return jexlExpressions; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index 255a54737..9facb11b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -22,9 +22,6 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { @DataPoint(description = "number of eval SNP sites") long nEvalVariants = 0; - @DataPoint(description = "number of comp SNP sites") - long nCompVariants = 0; - @DataPoint(description = "number of eval sites outside of comp sites") long novelSites = 0; @@ -75,12 +72,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { } public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean evalIsGood = eval != null && eval.isVariant(); - boolean expectingIndels = eval != null && eval.isIndel(); + boolean evalIsGood = eval != null && eval.isPolymorphic(); + boolean compIsGood 
= comp != null && comp.isNotFiltered(); - boolean compIsGood = expectingIndels ? comp != null && comp.isNotFiltered() && comp.isIndel() : comp != null && comp.isNotFiltered() && comp.isSNP() ; - - if (compIsGood) nCompVariants++; // count the number of comp events if (evalIsGood) nEvalVariants++; // count the number of eval events if (compIsGood && evalIsGood) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 8c281b2f8..2913c97a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -39,8 +39,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nInsertions = 0; @DataPoint(description = "Number of deletions") public long nDeletions = 0; - @DataPoint(description = "Number of complex loci") + @DataPoint(description = "Number of complex indels") public long nComplex = 0; + @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)") + public long nMixed = 0; @DataPoint(description = "Number of no calls loci") @@ -93,28 +95,44 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { nCalledLoci++; - if (vc1.isVariant()) nVariantLoci++; - switch (vc1.getType()) { - case NO_VARIATION: - nRefLoci++; - break; - case SNP: - nSNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; - break; - case MNP: - nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; - break; - case INDEL: - if (vc1.isInsertion()) nInsertions++; - else nDeletions++; - break; - case MIXED: - nComplex++; - break; - default: - 
throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); + // Note from Eric: + // This is really not correct. What we really want here is a polymorphic vs. monomorphic count (i.e. on the Genotypes). + // So in order to maintain consistency with the previous implementation (and the intention of the original author), I've + // added in a proxy check for monomorphic status here. + // Protect against case when vc only as no-calls too - can happen if we strafity by sample and sample as a single no-call. + if ( vc1.isMonomorphic() ) { + nRefLoci++; + } else { + switch (vc1.getType()) { + case NO_VARIATION: + // shouldn't get here + break; + case SNP: + nVariantLoci++; + nSNPs++; + if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + break; + case MNP: + nVariantLoci++; + nMNPs++; + if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + break; + case INDEL: + nVariantLoci++; + if (vc1.isSimpleInsertion()) + nInsertions++; + else if (vc1.isSimpleDeletion()) + nDeletions++; + else + nComplex++; + break; + case MIXED: + nVariantLoci++; + nMixed++; + break; + default: + throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); + } } String refStr = vc1.getReference().getBaseString().toUpperCase(); @@ -173,8 +191,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { heterozygosity = perLocusRate(nHets); heterozygosityPerBp = perLocusRInverseRate(nHets); hetHomRatio = ratio(nHets, nHomVar); - indelRate = perLocusRate(nDeletions + nInsertions); - indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions); + indelRate = perLocusRate(nDeletions + nInsertions + nComplex); + indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions + nComplex); deletionInsertionRatio = ratio(nDeletions, nInsertions); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java index 77def0f30..ffe7c185f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -90,18 +90,19 @@ public class IndelLengthHistogram extends VariantEvaluator { public int getComparisonOrder() { return 1; } // need only the evals public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( ! vc1.isBiallelic() && vc1.isIndel() ) { - //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); - return vc1.toString(); // biallelic sites are output - } - if ( vc1.isIndel() ) { - if ( vc1.isInsertion() ) { + if ( vc1.isIndel() && vc1.isPolymorphic() ) { + + if ( ! vc1.isBiallelic() ) { + //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); + return vc1.toString(); // biallelic sites are output + } + + // only count simple insertions/deletions, not complex indels + if ( vc1.isSimpleInsertion() ) { indelHistogram.update(vc1.getAlternateAllele(0).length()); - } else if ( vc1.isDeletion() ) { + } else if ( vc1.isSimpleDeletion() ) { indelHistogram.update(-vc1.getReference().length()); - } else { - throw new ReviewedStingException("Indel type that is not insertion or deletion."); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java deleted file mode 100755 index 6e1b76acd..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java +++ /dev/null @@ -1,221 +0,0 @@ -package 
org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -/** - * @author delangel - * @since Apr 11, 2010 - */ - -@Analysis(name = "Indel Metrics by allele count", description = "Shows various stats binned by allele count") -public class IndelMetricsByAC extends VariantEvaluator { - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(description = "Indel Metrics by allele count") - IndelMetricsByAc metrics = null; - - int numSamples = 0; - - public void initialize(VariantEvalWalker walker) { - numSamples = walker.getNumSamples(); - } - - //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") - //AlleleCountStats alleleCountStats = null; - private static final int INDEL_SIZE_LIMIT = 100; - private static final int NUM_SCALAR_COLUMNS = 6; - static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT; - } - - static int index2len(int ind) { - return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; - } - - protected final static String[] METRIC_COLUMNS; - static { - METRIC_COLUMNS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; - METRIC_COLUMNS[0] = "AC"; - METRIC_COLUMNS[1] = "nIns"; - METRIC_COLUMNS[2] = "nDels"; - METRIC_COLUMNS[3] = "n"; - METRIC_COLUMNS[4] = "nComplex"; - METRIC_COLUMNS[5] = "nLong"; - - for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) - METRIC_COLUMNS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); - } - - class IndelMetricsAtAC { - public int ac = -1, nIns =0, nDel = 0, nComplex = 0, nLong; - public int sizeCount[] = new int[2*INDEL_SIZE_LIMIT+1]; - - public IndelMetricsAtAC(int ac) { this.ac = ac; } - - public void update(VariantContext eval) { - int eventLength = 0; - if ( eval.isInsertion() ) { - eventLength = eval.getAlternateAllele(0).length(); - nIns++; - } else if ( eval.isDeletion() ) { - eventLength = -eval.getReference().length(); - nDel++; - } - else { - nComplex++; - } - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) - sizeCount[len2Index(eventLength)]++; - else - 
nLong++; - - - - } - - // corresponding to METRIC_COLUMNS - public String getColumn(int i) { - if (i >= NUM_SCALAR_COLUMNS && i <=NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT) - return String.valueOf(sizeCount[i-NUM_SCALAR_COLUMNS]); - - switch (i) { - case 0: return String.valueOf(ac); - case 1: return String.valueOf(nIns); - case 2: return String.valueOf(nDel); - case 3: return String.valueOf(nIns + nDel); - case 4: return String.valueOf(nComplex); - case 5: return String.valueOf(nLong); - - default: - throw new ReviewedStingException("Unexpected column " + i); - } - } - } - - class IndelMetricsByAc implements TableType { - ArrayList metrics = new ArrayList(); - Object[] rows = null; - - public IndelMetricsByAc( int nchromosomes ) { - rows = new Object[nchromosomes+1]; - metrics = new ArrayList(nchromosomes+1); - for ( int i = 0; i < nchromosomes + 1; i++ ) { - metrics.add(new IndelMetricsAtAC(i)); - rows[i] = "ac" + i; - } - } - - public Object[] getRowKeys() { - return rows; - } - - public Object[] getColumnKeys() { - return METRIC_COLUMNS; - } - - public String getName() { - return "IndelMetricsByAc"; - } - - // - public String getCell(int ac, int y) { - return metrics.get(ac).getColumn(y); - } - - public String toString() { - return ""; - } - - public void incrValue( VariantContext eval ) { - int ac = -1; - - if ( eval.hasGenotypes() ) - ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); - else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); - } - - if ( ac != -1 ) - metrics.get(ac).update(eval); - } - } - - //public IndelMetricsByAC(VariantEvalWalker parent) { - //super(parent); - // don't do anything - //} - - public String getName() { - return "IndelMetricsByAC"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, 
RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String interesting = null; - - if (eval != null ) { - if ( metrics == null ) { - int nSamples = numSamples; - //int nSamples = 2; - if ( nSamples != -1 ) - metrics = new IndelMetricsByAc(2 * nSamples); - } - - if ( eval.isIndel() && eval.isBiallelic() && - metrics != null ) { - metrics.incrValue(eval); - } - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - //public void finalizeEvaluation() { - // - //} -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java index d99196ecf..f70e6c2de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java @@ -44,7 +44,7 @@ public class IndelStatistics extends VariantEvaluator { @DataPoint(description = "Indel Statistics") IndelStats indelStats = null; - @DataPoint(description = "Indel Classification") + // @DataPoint(description = "Indel Classification") IndelClasses indelClasses = null; int numSamples = 0; @@ -57,13 +57,13 @@ public class IndelStatistics extends VariantEvaluator { private static final int IND_HET = 0; private static final int IND_INS = 1; private static final int IND_DEL = 2; - private static final int IND_AT_CG_RATIO = 3; + private static final int IND_COMPLEX = 3; private static final int IND_HET_INS = 4; private static final int IND_HOM_INS = 5; private static final int IND_HET_DEL = 6; private static final int IND_HOM_DEL = 7; private static final int IND_HOM_REF = 8; - private static final int IND_COMPLEX = 9; + private static final int IND_MIXED = 9; private static final int IND_LONG = 10; private static final int IND_AT_EXP = 11; private static final int 
IND_CG_EXP = 12; @@ -79,15 +79,14 @@ public class IndelStatistics extends VariantEvaluator { } static class IndelStats implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS; + protected final static String[] COLUMN_KEYS; static { COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; COLUMN_KEYS[0] = "heterozygosity"; COLUMN_KEYS[1] = "insertions"; COLUMN_KEYS[2] = "deletions"; - COLUMN_KEYS[3] = "AT_CG_expansion_ratio"; + COLUMN_KEYS[3] = "complex"; COLUMN_KEYS[4] = "het_insertions"; COLUMN_KEYS[5] = "homozygous_insertions"; COLUMN_KEYS[6] = "het_deletions"; @@ -104,13 +103,10 @@ public class IndelStatistics extends VariantEvaluator { } // map of sample to statistics - protected final HashMap indelSummary = new HashMap(); + protected final int[] indelSummary; public IndelStats(final VariantContext vc) { - indelSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - indelSummary.put(sample, new int[COLUMN_KEYS.length]); - } + indelSummary = new int[COLUMN_KEYS.length]; } /** @@ -118,19 +114,10 @@ public class IndelStatistics extends VariantEvaluator { * @return one row per sample */ public Object[] getRowKeys() { - return indelSummary.keySet().toArray(new String[indelSummary.size()]); + return new String[]{"all"}; } public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - if (y == IND_AT_CG_RATIO) { - - int at = indelSummary.get(rowKeys[x])[IND_AT_EXP]; - int cg = indelSummary.get(rowKeys[x])[IND_CG_EXP]; - return String.format("%4.2f",((double)at) / (Math.max(cg, 1))); - } - else - return String.format("%d",indelSummary.get(rowKeys[x])[y]); - + return String.format("%d",indelSummary[y]); } /** @@ -160,96 +147,49 @@ public class IndelStatistics extends VariantEvaluator { int eventLength = 0; boolean isInsertion = false, isDeletion = false; - if ( vc.isInsertion() ) { + if ( 
vc.isSimpleInsertion() ) { eventLength = vc.getAlternateAllele(0).length(); - indelSummary.get(ALL_SAMPLES_KEY)[IND_INS]++; + indelSummary[IND_INS]++; isInsertion = true; - } else if ( vc.isDeletion() ) { - indelSummary.get(ALL_SAMPLES_KEY)[IND_DEL]++; + } else if ( vc.isSimpleDeletion() ) { + indelSummary[IND_DEL]++; eventLength = -vc.getReference().length(); isDeletion = true; } - else { - indelSummary.get(ALL_SAMPLES_KEY)[IND_COMPLEX]++; + else if (vc.isComplexIndel()) { + indelSummary[IND_COMPLEX]++; } + else if (vc.isMixed()) + indelSummary[IND_MIXED]++; + if (IndelUtils.isATExpansion(vc,ref)) - indelSummary.get(ALL_SAMPLES_KEY)[IND_AT_EXP]++; + indelSummary[IND_AT_EXP]++; if (IndelUtils.isCGExpansion(vc,ref)) - indelSummary.get(ALL_SAMPLES_KEY)[IND_CG_EXP]++; + indelSummary[IND_CG_EXP]++; // make sure event doesn't overstep array boundaries - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { - indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++; - if (eventLength % 3 != 0) - indelSummary.get(ALL_SAMPLES_KEY)[IND_FRAMESHIFT]++; - } - else - indelSummary.get(ALL_SAMPLES_KEY)[IND_LONG]++; - - - for( final String sample : vc.getGenotypes().keySet() ) { - if ( indelSummary.containsKey(sample) ) { - Genotype g = vc.getGenotype(sample); - boolean isVariant = (g.isCalled() && !g.isHomRef()); - if (isVariant) { - // update ins/del count - if (isInsertion) { - indelSummary.get(sample)[IND_INS]++; - } - else if (isDeletion) - indelSummary.get(sample)[IND_DEL]++; - else - indelSummary.get(sample)[IND_COMPLEX]++; - - // update histogram - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { - indelSummary.get(sample)[len2Index(eventLength)]++; - if (eventLength % 3 != 0) - indelSummary.get(sample)[IND_FRAMESHIFT]++; - } - else - indelSummary.get(sample)[IND_LONG]++; - - if (g.isHet()) - if (isInsertion) - indelSummary.get(sample)[IND_HET_INS]++; - else if (isDeletion) - indelSummary.get(sample)[IND_HET_DEL]++; - else - if (isInsertion) - 
indelSummary.get(sample)[IND_HOM_INS]++; - else if (isDeletion) - indelSummary.get(sample)[IND_HOM_DEL]++; - - if (IndelUtils.isATExpansion(vc,ref)) - indelSummary.get(sample)[IND_AT_EXP]++; - if (IndelUtils.isCGExpansion(vc,ref)) - indelSummary.get(sample)[IND_CG_EXP]++; - - - } - else - indelSummary.get(sample)[IND_HOM_REF]++; + if (vc.isSimpleDeletion() || vc.isSimpleInsertion()) { + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) { + indelSummary[len2Index(eventLength)]++; + if (eventLength % 3 != 0) + indelSummary[IND_FRAMESHIFT]++; } + else + indelSummary[IND_LONG]++; } - } } static class IndelClasses implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; protected final static String[] columnNames = IndelUtils.getIndelClassificationNames(); // map of sample to statistics - protected final HashMap indelClassSummary = new HashMap(); + protected final int[] indelClassSummary; public IndelClasses(final VariantContext vc) { - indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - indelClassSummary.put(sample, new int[columnNames.length]); - } + indelClassSummary = new int[columnNames.length]; } /** @@ -257,11 +197,10 @@ public class IndelStatistics extends VariantEvaluator { * @return one row per sample */ public Object[] getRowKeys() { - return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]); + return new String[]{"all"}; } public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - return String.format("%d",indelClassSummary.get(rowKeys[x])[y]); + return String.format("%d",indelClassSummary[y]); } /** @@ -285,18 +224,7 @@ public class IndelStatistics extends VariantEvaluator { } private void incrementSampleStat(VariantContext vc, int index) { - indelClassSummary.get(ALL_SAMPLES_KEY)[index]++; - for( final String sample : vc.getGenotypes().keySet() ) { - if ( indelClassSummary.containsKey(sample) ) { - Genotype g 
= vc.getGenotype(sample); - boolean isVariant = (g.isCalled() && !g.isHomRef()); - if (isVariant) - // update count - indelClassSummary.get(sample)[index]++; - - } - } - + indelClassSummary[index]++; } /* * increment the specified value @@ -342,18 +270,15 @@ public class IndelStatistics extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (eval != null ) { + if (eval != null && eval.isPolymorphic()) { if ( indelStats == null ) { - int nSamples = numSamples; - - if ( nSamples != -1 ) - indelStats = new IndelStats(eval); + indelStats = new IndelStats(eval); } if ( indelClasses == null ) { indelClasses = new IndelClasses(eval); } - if ( eval.isIndel() && eval.isBiallelic() ) { + if ( eval.isIndel() || eval.isMixed() ) { if (indelStats != null ) indelStats.incrValue(eval, ref); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java index d466645ea..203c15a85 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java @@ -166,7 +166,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval } } - if ( eval.isSNP() && eval.isBiallelic() && metrics != null ) { + if ( eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() && metrics != null ) { metrics.incrValue(eval); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index ec43cbd55..e51623c3c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java 
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -37,77 +37,74 @@ public class ThetaVariantEvaluator extends VariantEvaluator { } public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) { + if (vc == null || !vc.isSNP() || !vc.hasGenotypes() || vc.isMonomorphic()) { return null; //no interesting sites } - if (vc.hasGenotypes()) { + //this maps allele to a count + ConcurrentMap alleleCounts = new ConcurrentHashMap(); - //this maps allele to a count - ConcurrentMap alleleCounts = new ConcurrentHashMap(); + int numHetsHere = 0; + float numGenosHere = 0; + int numIndsHere = 0; - int numHetsHere = 0; - float numGenosHere = 0; - int numIndsHere = 0; + for (Genotype genotype : vc.getGenotypes().values()) { + numIndsHere++; + if (!genotype.isNoCall()) { + //increment stats for heterozygosity + if (genotype.isHet()) { + numHetsHere++; + } - for (Genotype genotype : vc.getGenotypes().values()) { - numIndsHere++; - if (!genotype.isNoCall()) { - //increment stats for heterozygosity - if (genotype.isHet()) { - numHetsHere++; - } + numGenosHere++; + //increment stats for pairwise mismatches - numGenosHere++; - //increment stats for pairwise mismatches - - for (Allele allele : genotype.getAlleles()) { - if (allele.isNonNull() && allele.isCalled()) { - String alleleString = allele.toString(); - alleleCounts.putIfAbsent(alleleString, 0); - alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); - } + for (Allele allele : genotype.getAlleles()) { + if (allele.isNonNull() && allele.isCalled()) { + String alleleString = allele.toString(); + alleleCounts.putIfAbsent(alleleString, 0); + alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); } } } - if (numGenosHere > 0) { - //only if have one called genotype at least - this.numSites++; + } + if (numGenosHere > 0) { + //only if have 
one called genotype at least + this.numSites++; - this.totalHet += numHetsHere / numGenosHere; + this.totalHet += numHetsHere / numGenosHere; - //compute based on num sites - float harmonicFactor = 0; - for (int i = 1; i <= numIndsHere; i++) { - harmonicFactor += 1.0 / i; - } - this.thetaRegionNumSites += 1.0 / harmonicFactor; + //compute based on num sites + float harmonicFactor = 0; + for (int i = 1; i <= numIndsHere; i++) { + harmonicFactor += 1.0 / i; + } + this.thetaRegionNumSites += 1.0 / harmonicFactor; - //now compute pairwise mismatches - float numPairwise = 0; - float numDiffs = 0; - for (String allele1 : alleleCounts.keySet()) { - int allele1Count = alleleCounts.get(allele1); + //now compute pairwise mismatches + float numPairwise = 0; + float numDiffs = 0; + for (String allele1 : alleleCounts.keySet()) { + int allele1Count = alleleCounts.get(allele1); - for (String allele2 : alleleCounts.keySet()) { - if (allele1.compareTo(allele2) < 0) { - continue; - } - if (allele1 .compareTo(allele2) == 0) { - numPairwise += allele1Count * (allele1Count - 1) * .5; + for (String allele2 : alleleCounts.keySet()) { + if (allele1.compareTo(allele2) < 0) { + continue; + } + if (allele1 .compareTo(allele2) == 0) { + numPairwise += allele1Count * (allele1Count - 1) * .5; - } - else { - int allele2Count = alleleCounts.get(allele2); - numPairwise += allele1Count * allele2Count; - numDiffs += allele1Count * allele2Count; - } + } + else { + int allele2Count = alleleCounts.get(allele2); + numPairwise += allele1Count * allele2Count; + numDiffs += allele1Count * allele2Count; } } + } - if (numPairwise > 0) { - this.totalAvgDiffs += numDiffs / numPairwise; - } + if (numPairwise > 0) { + this.totalAvgDiffs += numDiffs / numPairwise; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index be957abd7..1feb37e01 
100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -40,7 +40,7 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } public void updateTiTv(VariantContext vc, boolean updateStandard) { - if (vc != null && vc.isSNP() && vc.isBiallelic()) { + if (vc != null && vc.isSNP() && vc.isBiallelic() && vc.isPolymorphic()) { if (VariantContextUtils.isTransition(vc)) { if (updateStandard) nTiInComp++; else nTi++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 9c331b577..307b4f684 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -117,7 +117,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { public SiteStatus calcSiteStatus(VariantContext vc) { if ( vc == null ) return SiteStatus.NO_CALL; if ( vc.isFiltered() ) return SiteStatus.FILTERED; - if ( ! vc.isVariant() ) return SiteStatus.MONO; + if ( vc.isMonomorphic() ) return SiteStatus.MONO; + if ( vc.hasGenotypes() ) return SiteStatus.POLY; // must be polymorphic if isMonomorphic was false and there are genotypes if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { int ac = 0; @@ -132,8 +133,6 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { else ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY); return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO; - } else if ( vc.hasGenotypes() ) { - return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO; } else { return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? 
SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do //return SiteStatus.NO_CALL; // we can't figure out what to do diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index e29e7ed50..83a1c2f3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -8,6 +8,8 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.NewEvaluationConte import org.broadinstitute.sting.gatk.walkers.varianteval.util.StateKey; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.Collection; + public abstract class VariantEvaluator { public void initialize(VariantEvalWalker walker) {} @@ -17,25 +19,18 @@ public abstract class VariantEvaluator { public abstract int getComparisonOrder(); // called at all sites, regardless of eval context itself; useful for counting processed bases - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + } + + public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return null; } - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, NewEvaluationContext group) { - return update1(vc1, tracker, ref, context); - } - - - public String update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public String 
update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return null; } - public String update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, NewEvaluationContext group) { - return update2(vc1, vc2, tracker, ref, context); - } - public void finalizeEvaluation() {} protected double rate(long n, long d) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index b6ad55b18..263227938 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -232,7 +232,7 @@ public class VariantQualityScore extends VariantEvaluator { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final String interesting = null; - if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) + if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) if( titvStats == null ) { titvStats = new TiTvStats(); } titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 411493d4f..3cc22cc52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -1,23 +1,28 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * Stratifies the eval RODs by the allele count of the alternate allele + * + * Looks at the AC value in the INFO field, and uses that value if present. If absent, + * computes the AC from the genotypes themselves. For no AC can be computed, 0 is used. + */ public class AlleleCount extends VariantStratifier { - // needs to know the variant context - private ArrayList states = new ArrayList(); - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { + List> evals = getVariantEvalWalker().getEvals(); + // we can only work with a single eval VCF, and it must have genotypes - if ( evalNames.size() != 1 ) + if ( evals.size() != 1 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); // There are 2 x n sample chromosomes for diploids @@ -33,11 +38,7 @@ public class AlleleCount extends VariantStratifier { getVariantEvalWalker().getLogger().info("AlleleCount using " + nchrom + " chromosomes"); } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String 
compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(1); if (eval != null) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index 2ffc7716c..3d2dda651 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -2,30 +2,28 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * Stratifies the eval RODs by the allele frequency of the alternate allele + * + * Uses a constant 0.005 frequency grid, and projects the AF INFO field value. 
Requires + * that AF be present in every ROD, otherwise this stratification throws an exception + */ public class AlleleFrequency extends VariantStratifier { - // needs to know the variant context - private ArrayList states; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); for( double a = 0.000; a <= 1.005; a += 0.005 ) { states.add(String.format("%.3f", a)); } } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); if (eval != null) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java index c6975808f..1f31ebfa7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java @@ -1,31 +1,26 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; + +/** + * Required stratification grouping output 
by each comp ROD + */ public class CompRod extends VariantStratifier implements RequiredStratification { - // Needs to know the comp rods - private Set compNames; - private ArrayList states; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.compNames = compNames; - - states = new ArrayList(); - states.addAll(compNames); + public void initialize() { + for ( RodBinding rod : getVariantEvalWalker().getComps() ) + states.add(rod.getName()); } - public ArrayList getAllStates() { - return states; - } - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add(compName); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java index c14355035..c45a73231 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java @@ -2,28 +2,22 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * Stratifies the evaluation by each contig in the reference sequence 
+ */ public class Contig extends VariantStratifier { - // needs to know the variant context - private ArrayList states; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - states = new ArrayList(); - states.addAll(contigNames); + public void initialize() { + states.addAll(getVariantEvalWalker().getContigNames()); states.add("all"); } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); if (eval != null) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java index 3e8a6ed17..539cd21ef 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java @@ -2,33 +2,37 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * CpG is a stratification module for VariantEval that divides the input data by within/not within a CpG site + * + *

    + * It is a three-state stratification:
    + * <ul>
    + *     <li>The locus is a CpG site ("CpG")
    + *     <li>The locus is not a CpG site ("non_CpG")
    + *     <li>The locus is either a CpG or not a CpG site ("all")
    + * </ul>
    + * A CpG site is defined as a site where the reference base at a locus is a C and the adjacent reference base in the 3' direction is a G. + */ public class CpG extends VariantStratifier { - private ArrayList states; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - states = new ArrayList(); + public void initialize() { states.add("all"); states.add("CpG"); states.add("non_CpG"); } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { boolean isCpG = false; if (ref != null && ref.getBases() != null) { String fwRefBases = new String(ref.getBases()); - String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1); + //String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1); String rightFlank = fwRefBases.substring((fwRefBases.length()/2), (fwRefBases.length()/2) + 2); //if (leftFlank.equalsIgnoreCase("CG") || leftFlank.equalsIgnoreCase("GC") || rightFlank.equalsIgnoreCase("CG") || rightFlank.equalsIgnoreCase("GC")) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index 155a66186..3223626c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -2,21 +2,21 @@ package 
org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.Set; +import java.util.List; +/** + * Experimental stratification by the degeneracy of an amino acid, according to VCF annotation. Not safe + */ public class Degeneracy extends VariantStratifier { - private ArrayList states; - private HashMap> degeneracies; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { + public void initialize() { states = new ArrayList(); states.add("1-fold"); states.add("2-fold"); @@ -79,11 +79,7 @@ public class Degeneracy extends VariantStratifier { } } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add("all"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java index 40f952fd2..e276adc32 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java @@ -1,31 +1,25 @@ package 
org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * Required stratification grouping output by each eval ROD + */ public class EvalRod extends VariantStratifier implements RequiredStratification { - // needs to know the eval rods - private Set evalNames; - private ArrayList states; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.evalNames = evalNames; - + public void initialize() { states = new ArrayList(); - states.addAll(evalNames); + for ( RodBinding rod : getVariantEvalWalker().getEvals() ) + states.add(rod.getName()); } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add(evalName); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java index 3b7a419f2..aacfae993 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Filter.java @@ -2,29 +2,23 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * Stratifies by the FILTER status (PASS, FAIL) of the eval records + */ public class Filter extends VariantStratifier { - // needs to know the variant context - private ArrayList states; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - states = new ArrayList(); + public void initialize() { states.add("called"); states.add("filtered"); states.add("raw"); } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add("raw"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index c6c094f8e..88ffcaaeb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,39 +2,44 @@ package 
org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.List; +/** + * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation. + */ public class FunctionalClass extends VariantStratifier { - // needs to know the variant context - private ArrayList states; + + public enum FunctionalType { + silent, + missense, + nonsense + } + @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - states = new ArrayList(); + public void initialize() { states.add("all"); - states.add("silent"); - states.add("missense"); - states.add("nonsense"); + for ( FunctionalType type : FunctionalType.values() ) + states.add(type.name()); } - public ArrayList getAllStates() { - return states; - } - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add("all"); if (eval != null && eval.isVariant()) { - String type = null; + FunctionalType type = null; if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); + try { + type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass")); + } catch ( Exception 
e ) {} // don't error out if the type isn't supported } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -42,24 +47,33 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); - - if ( newtype != null && !newtype.equalsIgnoreCase("null") && - ( type == null || - ( type.equals("silent") && !newtype.equals("silent") ) || - ( type.equals("missense") && newtype.equals("nonsense") ) ) - ) { - type = newtype; + String newtypeStr = eval.getAttributeAsString(key); + if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) { + try { + FunctionalType newType = FunctionalType.valueOf(newtypeStr); + if ( type == null || + ( type == FunctionalType.silent && newType != FunctionalType.silent ) || + ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) { + type = newType; + } + } catch ( Exception e ) {} // don't error out if the type isn't supported } annotationId++; } while (eval.hasAttribute(key)); + + } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) { + SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString()); + if ( snpEffType == SnpEff.EffectType.STOP_GAINED ) + type = FunctionalType.nonsense; + else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING ) + type = FunctionalType.missense; + else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING ) + type = FunctionalType.silent; } - if (type != null) { - if (type.equals("silent")) { relevantStates.add("silent"); } - else if (type.equals("missense")) { relevantStates.add("missense"); } - else if (type.equals("nonsense")) { relevantStates.add("nonsense"); } + if ( type != null ) { + relevantStates.add(type.name()); } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java index 76efedbf4..c0cab4534 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java @@ -6,30 +6,30 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatc import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import java.util.List; import java.util.ArrayList; import java.util.Set; +/** + * Stratifies the eval RODs by user-supplied JEXL expressions + * + * See http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions for more details + */ public class JexlExpression extends VariantStratifier implements StandardStratification { // needs to know the jexl expressions private Set jexlExpressions; - private ArrayList states; @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.jexlExpressions = jexlExpressions; + public void initialize() { + jexlExpressions = getVariantEvalWalker().getJexlExpressions(); - states = new ArrayList(); states.add("none"); for ( SortableJexlVCMatchExp jexlExpression : jexlExpressions ) { states.add(jexlExpression.name); } } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList 
relevantStates = new ArrayList(); relevantStates.add("none"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java index a0973a088..77d98d33b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java @@ -1,58 +1,37 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.Collection; -import java.util.EnumSet; -import java.util.Set; +import java.util.*; +/** + * Stratifies by whether a site in in the list of known RODs (e.g., dbsnp by default) + */ public class Novelty extends VariantStratifier implements StandardStratification { // needs the variant contexts and known names - private Set knownNames; - private ArrayList states; + private List> knowns; + @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - this.knownNames = knownNames; - - states = new ArrayList(); - states.add("all"); - states.add("known"); - states.add("novel"); + public void initialize() { + states = new ArrayList(Arrays.asList("all", "known", "novel")); + knowns = getVariantEvalWalker().getKnowns(); } - public ArrayList getAllStates() { - return states; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String 
evalName, String sampleName) { - boolean isNovel = true; - - if (tracker != null) { - for (String knownName : knownNames) { - if (tracker.hasROD(knownName)) { - EnumSet allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION); - if (eval != null) { - allowableTypes.add(eval.getType()); - } - - Collection knownComps = tracker.getVariantContexts(ref, knownName, allowableTypes, ref.getLocus(), true, true); - - isNovel = knownComps.size() == 0; - - break; + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + if (tracker != null && eval != null) { + final Collection knownComps = tracker.getValues(knowns, ref.getLocus()); + for ( final VariantContext c : knownComps ) { + // loop over sites, looking for something that matches the type eval + if ( eval.getType() == c.getType() ) { + return Arrays.asList("all", "known"); } } } - ArrayList relevantStates = new ArrayList(); - relevantStates.add("all"); - relevantStates.add(isNovel ? 
"novel" : "known"); - - return relevantStates; + return Arrays.asList("all", "novel"); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java index a2a3eb3fb..c697b5b7a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java @@ -2,30 +2,25 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.Set; +import java.util.Arrays; +import java.util.List; +/** + * Stratifies the eval RODs by each sample in the eval ROD. + * + * This allows the system to analyze each sample separately. 
Since many evaluations + * only consider non-reference sites, stratifying by sample results in meaningful + * calculations for CompOverlap + */ public class Sample extends VariantStratifier { - // needs the sample names - private ArrayList samples; - @Override - public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) { - samples = new ArrayList(); - samples.addAll(sampleNames); + public void initialize() { + states.addAll(getVariantEvalWalker().getSampleNamesForStratification()); } - public ArrayList getAllStates() { - return samples; - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { - ArrayList relevantStates = new ArrayList(); - relevantStates.add(sampleName); - - return relevantStates; + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + return Arrays.asList(sampleName); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java index 2c4b8bc46..5cae2fb15 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantStratifier.java @@ -3,14 +3,15 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp; import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; -import java.util.Set; +import java.util.Arrays; +import java.util.List; public abstract class VariantStratifier implements Comparable { private VariantEvalWalker variantEvalWalker; + protected ArrayList states = new ArrayList(); /** * @return a reference to the parent VariantEvalWalker running this stratification @@ -27,17 +28,17 @@ public abstract class VariantStratifier implements Comparable { this.variantEvalWalker = variantEvalWalker; } - public abstract void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames); + public abstract void initialize(); - public ArrayList getAllStates() { - return new ArrayList(); - } - - public ArrayList getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { return null; } public int compareTo(Object o1) { return this.getClass().getSimpleName().compareTo(o1.getClass().getSimpleName()); } + + public ArrayList getAllStates() { + return states; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java new file mode 100644 index 000000000..7d25498a5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/VariantType.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including 
without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Stratifies the eval variants by their type (SNP, INDEL, ETC) + */ +public class VariantType extends VariantStratifier { + @Override + public void initialize() { + for ( VariantContext.Type t : VariantContext.Type.values() ) { + states.add(t.toString()); + } + } + + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + return eval == null ? 
Collections.emptyList() : Arrays.asList(eval.getType().toString()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 0a915db37..92e7c6554 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; @@ -56,8 +57,9 @@ public class VariantEvalUtils { /** * Initialize required, standard and user-specified stratification objects * - * @param noStandardStrats don't use the standard stratifications - * @param modulesToUse the list of stratification modules to use + * @param variantEvalWalker the parent walker + * @param noStandardStrats don't use the standard stratifications + * @param modulesToUse the list of stratification modules to use * @return set of stratifications to use */ public TreeSet initializeStratificationObjects(VariantEvalWalker variantEvalWalker, boolean noStandardStrats, String[] modulesToUse) { @@ -101,7 +103,7 @@ public class VariantEvalUtils { try { VariantStratifier vs = c.newInstance(); vs.setVariantEvalWalker(variantEvalWalker); - vs.initialize(variantEvalWalker.getJexlExpressions(), variantEvalWalker.getCompNames(), variantEvalWalker.getKnownNames(), variantEvalWalker.getEvalNames(), variantEvalWalker.getSampleNamesForStratification(), variantEvalWalker.getContigNames()); + vs.initialize(); strats.add(vs); } catch (InstantiationException e) { @@ -256,46 +258,6 @@ public class VariantEvalUtils { 
return report; } - /** - * Figure out what the allowable variation types are based on the eval context - * - * @param tracker the reference metadata tracker - * @param ref the reference context - * @param compNames the comp track names - * @param evalNames the evaluation track names - * @return the set of allowable variation types - */ - public EnumSet getAllowableVariationTypes(RefMetaDataTracker tracker, - ReferenceContext ref, - Set compNames, - Set evalNames, - boolean dynamicSelectTypes ) { - if ( dynamicSelectTypes ) { // todo -- this code is really conceptually broken - EnumSet allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION); - - if (tracker != null) { - Collection evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false); - - for (VariantContext vc : evalvcs) { - allowableTypes.add(vc.getType()); - } - - if (allowableTypes.size() == 1) { - // We didn't find any variation in the eval track, so now let's look at the comp track for allowable types - Collection compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false); - - for (VariantContext vc : compvcs) { - allowableTypes.add(vc.getType()); - } - } - } - - return allowableTypes; - } else { - return EnumSet.allOf(VariantContext.Type.class); - } - } - /** * Subset a VariantContext to a single sample * @@ -304,10 +266,7 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested sample */ public VariantContext getSubsetOfVariantContext(VariantContext vc, String sampleName) { - ArrayList sampleNames = new ArrayList(); - sampleNames.add(sampleName); - - return getSubsetOfVariantContext(vc, sampleNames); + return getSubsetOfVariantContext(vc, Arrays.asList(sampleName)); } /** @@ -318,7 +277,7 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { - VariantContext vcsub = 
vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values()); + VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values(), vc.getAlleles()); HashMap newAts = new HashMap(vcsub.getAttributes()); @@ -344,86 +303,59 @@ public class VariantEvalUtils { * * @param tracker the metadata tracker * @param ref the reference context - * @param trackNames the list of track names to process - * @param allowableTypes a set of allowable variation types + * @param tracks the list of tracks to process * @param byFilter if false, only accept PASSing VariantContexts. Otherwise, accept both PASSing and filtered * sites * @param subsetBySample if false, do not separate the track into per-sample VCs * @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need * to do this) - * @return a mapping of track names to a list of VariantContext objects + * + * @return the mapping of track to VC list that should be populated */ - public HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, EnumSet allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { - HashMap> bindings = new HashMap>(); + public HashMap, HashMap>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) { + if ( tracker == null ) + return null; - for (String trackName : trackNames) { - HashMap vcs = new HashMap(); + HashMap, HashMap>> bindings = new HashMap, HashMap>>(); - Collection contexts = tracker == null ? null : tracker.getVariantContexts(ref, trackName, allowableTypes, ref.getLocus(), true, true); - VariantContext vc = contexts != null && contexts.size() == 1 ? 
contexts.iterator().next() : null; + for ( RodBinding track : tracks ) { + HashMap> mapping = new HashMap>(); - // First, filter the VariantContext to represent only the samples for evaluation - if (vc != null) { + for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) { + + // First, filter the VariantContext to represent only the samples for evaluation VariantContext vcsub = vc; - if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) { + if ( subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation()) ) { vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); } - if ((byFilter || !vcsub.isFiltered())) { - vcs.put(VariantEvalWalker.getAllSampleName(), vcsub); + if ( (byFilter || !vcsub.isFiltered()) ) { + addMapping(mapping, VariantEvalWalker.getAllSampleName(), vcsub); } // Now, if stratifying, split the subsetted vc per sample and add each as a new context - if (vc.hasGenotypes() && trackPerSample) { - for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) { + if ( vc.hasGenotypes() && trackPerSample ) { + for ( String sampleName : variantEvalWalker.getSampleNamesForEvaluation() ) { VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName); - if ((byFilter || !samplevc.isFiltered())) { - vcs.put(sampleName, samplevc); + if ( byFilter || !samplevc.isFiltered() ) { + addMapping(mapping, sampleName, samplevc); } } } - - bindings.put(trackName, vcs); } + + bindings.put(track, mapping); } return bindings; } - /** - * Maps track names to sample name to VariantContext objects. For eval tracks, VariantContexts per specified sample - * are also included. 
- * - * @param tracker the metadata tracker - * @param ref the reference context - * @param compNames the list of comp names to process - * @param evalNames the list of eval names to process - * @return a mapping of track names to a list of VariantContext objects - */ - public HashMap> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set compNames, Set evalNames, boolean dynamicSelectTypes) { - HashMap> vcs = new HashMap>(); - - EnumSet allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes); - - boolean byFilter = false; - boolean perSampleIsEnabled = false; - for (VariantStratifier vs : variantEvalWalker.getStratificationObjects()) { - if (vs.getClass().getSimpleName().equals("Filter")) { - byFilter = true; - } else if (vs.getClass().getSimpleName().equals("Sample")) { - perSampleIsEnabled = true; - } - } - - HashMap> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled); - HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false); - - vcs.putAll(compBindings); - vcs.putAll(evalBindings); - - return vcs; + private void addMapping(HashMap> mappings, String sample, VariantContext vc) { + if ( !mappings.containsKey(sample) ) + mappings.put(sample, new HashSet()); + mappings.get(sample).add(vc); } /** @@ -436,12 +368,12 @@ public class VariantEvalUtils { * @param stateKeys all the state keys * @return a list of state keys */ - public ArrayList initializeStateKeys(HashMap> stateMap, Stack>> stateStack, StateKey stateKey, ArrayList stateKeys) { + public ArrayList initializeStateKeys(HashMap> stateMap, Stack>> stateStack, StateKey stateKey, ArrayList stateKeys) { if (stateStack == null) { - stateStack = new Stack>>(); + stateStack = new Stack>>(); for (VariantStratifier vs : stateMap.keySet()) { - HashMap> oneSetOfStates = new HashMap>(); + HashMap> oneSetOfStates = new HashMap>(); 
oneSetOfStates.put(vs, stateMap.get(vs)); stateStack.add(oneSetOfStates); @@ -449,10 +381,10 @@ public class VariantEvalUtils { } if (!stateStack.isEmpty()) { - Stack>> newStateStack = new Stack>>(); + Stack>> newStateStack = new Stack>>(); newStateStack.addAll(stateStack); - HashMap> oneSetOfStates = newStateStack.pop(); + HashMap> oneSetOfStates = newStateStack.pop(); VariantStratifier vs = oneSetOfStates.keySet().iterator().next(); for (String state : oneSetOfStates.get(vs)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 403c67d3e..16f1abf1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -28,9 +28,9 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; @@ -45,12 +45,43 @@ import java.io.FileNotFoundException; import java.util.*; /** - * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration + * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified 
during VariantRecalibration * - * @author rpoplin - * @since Mar 14, 2011 + *

    + * Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value + * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level + * have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered + * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a + * slightly lower quality level. + * + *

    + * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *

    Input

    + *

    + * The input raw variants to be recalibrated. + *

    + * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker. + *

    + * The tranches file that was generated by the VariantRecalibrator walker. + * + *

    Output

    + *

    + * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. + * + *

    Examples

    + *
    + * java -Xmx3g -jar GenomeAnalysisTK.jar \
    + *   -T ApplyRecalibration \
    + *   -R reference/human_g1k_v37.fasta \
    + *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
    + *   --ts_filter_level 99.0 \
    + *   -tranchesFile path/to/output.tranches \
    + *   -recalFile path/to/output.recal \
    + *   -o path/to/output.recalibrated.filtered.vcf
    + * 
    * - * @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration */ public class ApplyRecalibration extends RodWalker { @@ -58,7 +89,12 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// // Inputs ///////////////////////////// - @Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true) + /** + * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling. + */ + @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) + public List> input; + @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) private File RECAL_FILE; @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) private File TRANCHES_FILE; @@ -66,7 +102,7 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// // Outputs ///////////////////////////// - @Output( doc="The output filtered, recalibrated VCF file", required=true) + @Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true) private VCFWriter vcfWriter = null; ///////////////////////////// @@ -74,7 +110,7 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) private double TS_FILTER_LEVEL = 99.0; - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) + 
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; @@ -103,17 +139,8 @@ public class ApplyRecalibration extends RodWalker { } Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity) - for( final ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { - if( d.getName().startsWith("input") ) { - inputNames.add(d.getName()); - logger.info("Found input variant track with name " + d.getName()); - } else { - logger.info("Not evaluating ROD binding " + d.getName()); - } - } - - if( inputNames.size() == 0 ) { - throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." 
); + for( final RodBinding rod : input ) { + inputNames.add( rod.getName() ); } if( IGNORE_INPUT_FILTERS != null ) { @@ -170,7 +197,7 @@ public class ApplyRecalibration extends RodWalker { return 1; } - for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) { + for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) { if( vc != null ) { if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { String filterString = null; @@ -206,9 +233,9 @@ public class ApplyRecalibration extends RodWalker { filters.add(filterString); vc = VariantContext.modifyFilters(vc, filters); } - vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs), ref.getBase() ); + vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs) ); } else { // valid VC but not compatible with this mode, so just emit the variant untouched - vcfWriter.add( vc, ref.getBase() ); + vcfWriter.add( vc ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 17461de2f..3fa9c3883 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -207,6 +207,7 @@ public class GaussianMixtureModel { for( final boolean isNull : datum.isNull ) { if( isNull ) { return evaluateDatumMarginalized( datum ); } } + // Fill an array with the log10 probability coming from each Gaussian and then use MathUtils to sum them up correctly final double[] pVarInGaussianLog10 = new double[gaussians.size()]; int gaussianIndex = 0; for( final 
MultivariateGaussian gaussian : gaussians ) { @@ -215,6 +216,7 @@ public class GaussianMixtureModel { return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) } + // Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) { if(datum.isNull[iii]) { return null; } @@ -229,7 +231,7 @@ public class GaussianMixtureModel { } public double evaluateDatumMarginalized( final VariantDatum datum ) { - int numSamples = 0; + int numRandomDraws = 0; double sumPVarInGaussian = 0.0; final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization final double[] pVarInGaussianLog10 = new double[gaussians.size()]; @@ -248,10 +250,10 @@ public class GaussianMixtureModel { // add this sample's probability to the pile in order to take an average in the end sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) - numSamples++; + numRandomDraws++; } } } - return Math.log10( sumPVarInGaussian / ((double) numSamples) ); + return Math.log10( sumPVarInGaussian / ((double) numRandomDraws) ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java index 6c1a7ddbc..5f688d001 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrainingSet.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; +import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; /** * Created by IntelliJ IDEA. @@ -36,7 +38,7 @@ import org.broadinstitute.sting.commandline.Tags; public class TrainingSet { - public String name; + public RodBinding rodBinding; public boolean isKnown = false; public boolean isTraining = false; public boolean isAntiTraining = false; @@ -46,8 +48,11 @@ public class TrainingSet { protected final static Logger logger = Logger.getLogger(TrainingSet.class); - public TrainingSet( final String name, final Tags tags ) { - this.name = name; + public TrainingSet( final RodBinding rodBinding) { + this.rodBinding = rodBinding; + + final Tags tags = rodBinding.getTags(); + final String name = rodBinding.getName(); // Parse the tags to decide which tracks have which properties if( tags != null ) { @@ -61,11 +66,11 @@ public class TrainingSet { // Report back to the user which tracks were found and the properties that were detected if( !isConsensus && !isAntiTraining ) { - logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", this.name, isKnown, isTraining, isTruth, prior) ); + logger.info( String.format( "Found %s track: \tKnown = %s \tTraining = %s \tTruth = %s \tPrior = Q%.1f", name, isKnown, isTraining, isTruth, prior) ); } else if( isConsensus ) { - logger.info( String.format( "Found consensus track: %s", this.name) ); + logger.info( String.format( "Found consensus track: %s", name) ); } else { - logger.info( String.format( "Found bad sites training track: %s", this.name) ); + logger.info( String.format( "Found bad sites training track: %s", name) ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 67d54a408..e04bfab76 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -26,10 +26,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; /** @@ -50,11 +51,10 @@ public class VariantDataManager { private ExpandingArrayList data; private final double[] meanVector; private final double[] varianceVector; // this is really the standard deviation - public final ArrayList annotationKeys; - private final ExpandingArrayList trainingSets; + public final List annotationKeys; private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); - + protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = null; @@ -62,7 +62,7 @@ public class VariantDataManager { this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; varianceVector = new double[this.annotationKeys.size()]; - trainingSets = new ExpandingArrayList(); + trainingSets = new ArrayList(); } public void setData( final ExpandingArrayList data ) { @@ -105,30 +105,30 @@ public class VariantDataManager { } } 
- public void addTrainingSet( final TrainingSet trainingSet ) { - trainingSets.add( trainingSet ); - } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } - public boolean checkHasTrainingSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTraining ) { return true; } - } - return false; - } + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } - public boolean checkHasTruthSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTruth ) { return true; } - } - return false; - } + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } - public boolean checkHasKnownSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isKnown ) { return true; } - } - return false; - } + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); @@ -240,6 +240,15 @@ public class VariantDataManager { if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + + if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) { + // normalize QD by event length for indel case + int eventLength = Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now + if (eventLength > 0) { // sanity check + value /= (double)eventLength; + } + } + if( jitter && 
annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { @@ -249,20 +258,16 @@ public class VariantDataManager { return value; } - public void parseTrainingSets( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { + public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC ) { datum.isKnown = false; datum.atTruthSite = false; datum.atTrainingSite = false; datum.atAntiTrainingSite = false; datum.prior = 2.0; - datum.consensusCount = 0; for( final TrainingSet trainingSet : trainingSets ) { - for( final VariantContext trainVC : tracker.getVariantContexts( ref, trainingSet.name, null, context.getLocation(), false, false ) ) { - if( trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && - ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()) ) { - + for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { + if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { datum.isKnown = datum.isKnown || trainingSet.isKnown; datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining; @@ -272,11 +277,16 @@ public class VariantDataManager { if( trainVC != null ) { datum.atAntiTrainingSite = datum.atAntiTrainingSite || 
trainingSet.isAntiTraining; } - } } } + private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && + ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()); + } + public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 8179463eb..529d17285 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -25,13 +25,9 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.ArgumentCollection; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; @@ -49,21 +45,81 @@ import java.io.PrintStream; import java.util.*; /** - * Takes variant calls as .vcf files, learns a Gaussian mixture model over the 
variant annotations and evaluates the variant -- assigning an informative lod score + * Create a Gaussian mixture model by looking at the annotation values over a high quality subset of the input call set and then evaluate all input variants. * - * User: rpoplin - * Date: 3/12/11 + *

    + * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. + * + *

    + * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. + * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship + * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the probability that a SNP is a true genetic + * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the + * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is + * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. + * + *

    + * See the GATK wiki for a tutorial and example recalibration accuracy plots. + * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration + * + *

    Input

    + *

    + * The input raw variants to be recalibrated. + *

    + * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + * + *

    Output

    + *

    + * A recalibration table file in CSV format that is used by the ApplyRecalibration walker. + *

    + * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + * + *

    Example

    + *
    + * java -Xmx4g -jar GenomeAnalysisTK.jar \
    + *   -T VariantRecalibrator \
    + *   -R reference/human_g1k_v37.fasta \
    + *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
    + *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
    + *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
    + *   -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \
    + *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
    + *   -recalFile path/to/output.recal \
    + *   -tranchesFile path/to/output.tranches \
    + *   -rscriptFile path/to/output.plots.R
    + * 
    * - * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score */ public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { - public static final String VQS_LOD_KEY = "VQSLOD"; - public static final String CULPRIT_KEY = "culprit"; + public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model + public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + ///////////////////////////// + // Inputs + ///////////////////////////// + /** + * These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling. + */ + @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) + public List> input; + + /** + * Any set of VCF files to use as lists of training, truth, or known sites. + * Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model. + * Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used. + * Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes. + * Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants. 
+ */ + @Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false) + public List> resource = Collections.emptyList(); + ///////////////////////////// // Outputs ///////////////////////////// @@ -75,13 +131,29 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet(); - private final Set inputNames = new HashSet(); private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC ); //--------------------------------------------------------------------------------------------------------------- @@ -125,13 +196,14 @@ public class VariantRecalibrator extends RodWalker rod : resource ) { + dataManager.addTrainingSet( new TrainingSet( rod ) ); } if( !dataManager.checkHasTrainingSet() ) { @@ -140,16 +212,6 @@ public class VariantRecalibrator extends RodWalker + * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data + * and any number of sources can be input. This tool currently supports two different combination types for each of + * variants (the first 8 fields of the VCF) and genotypes (the rest). * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. * Union: assumes each rod represents the same set of samples (although this is not enforced); using the - * priority list (if provided), emits a single record instance at every position represented in the rods. + * priority list (if provided), it emits a single record instance at every position represented in the rods. + * + * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD + * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect, + * CombineVariants always produces a union of the input VCFs. 
However, any part of the Venn diagram of the N merged VCFs + * can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just + * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single + * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out + * in the detailed example on the wiki. + * + *

    Input

    + *

    + * One or more variant sets to combine. + *

    + * + *

    Output

    + *

    + * A combined VCF. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T CombineVariants \
    + *   --variant input1.vcf \
    + *   --variant input2.vcf \
    + *   -o output.vcf \
    + *   -genotypeMergeOptions UNIQUIFY
    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T CombineVariants \
    + *   --variant:foo input1.vcf \
    + *   --variant:bar input2.vcf \
    + *   -o output.vcf \
    + *   -genotypeMergeOptions PRIORITIZE
    + *   -priority foo,bar
    + * 
    + * */ @Reference(window=@Window(start=-50,stop=50)) -@Requires(value={}) public class CombineVariants extends RodWalker { + /** + * The VCF files to merge together + * + * variants can take any number of arguments on the command line. Each -V argument + * will be included in the final merged output VCF. If no explicit name is provided, + * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc. + * The user can override this by providing an explicit name -V:name,vcf for each -V argument, + * and each named argument will be labeled as such in the output (i.e., set=name rather than + * set=variants2). The order of arguments does not matter except for the naming, so + * if you provide a rod priority list and no explicit names then variants, variants2, etc. + * are technically order dependent. It is strongly recommended to provide explicit names when + * a rod priority list is provided. + */ + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public List> variants; @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - // the types of combinations we currently allow - @Argument(shortName="genotypeMergeOptions", doc="How should we merge genotype records for samples shared across the ROD files?", required=false) + @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE; - @Argument(shortName="filteredRecordsMergeType", doc="How should we deal with records seen at the same site in the VCF, but with different FILTER fields? 
KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered", required=false) + @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; - @Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=false) + /** + * Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided. + */ + @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false) public String PRIORITY_STRING = null; @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false) public boolean printComplexMerges = false; - @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF", required=false) + @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false) public boolean filteredAreUncalled = false; - @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype INFO field", required=false) + /** + * Used to 
generate a sites-only file. + */ + @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) public boolean minimalVCF = false; - @Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.", required=false) + /** + * Set to 'null' if you don't want the set field emitted. + */ + @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false) public String SET_KEY = "set"; - @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime.", required=false) + /** + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.. + */ + @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) public boolean ASSUME_IDENTICAL_SAMPLES = false; - @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if variant is present in at least N input files.", required=false) + @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false) public int minimumN = 1; - @Hidden - @Argument(fullName="masterMerge", shortName="master", doc="Master merge mode -- experts only. 
You need to look at the code to understand it", required=false) - public boolean master = false; - @Hidden @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; @@ -150,7 +211,7 @@ public class CombineVariants extends RodWalker { // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus - Collection vcs = tracker.getAllVariantContexts(ref, null, context.getLocation(), true, false); + Collection vcs = tracker.getValues(variants, context.getLocation()); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); @@ -158,7 +219,7 @@ public class CombineVariants extends RodWalker { if ( ASSUME_IDENTICAL_SAMPLES ) { for ( final VariantContext vc : vcs ) { - vcfWriter.add( vc, ref.getBase() ); + vcfWriter.add(vc); } return vcs.isEmpty() ? 
0 : 1; @@ -174,17 +235,13 @@ public class CombineVariants extends RodWalker { return 0; List mergedVCs = new ArrayList(); - if ( master ) { - mergedVCs.add(VariantContextUtils.masterMerge(vcs, "master")); - } else { - Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); - // iterate over the types so that it's deterministic - for ( VariantContext.Type type : VariantContext.Type.values() ) { - if ( VCsByType.containsKey(type) ) - mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - ref.getBase(), SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); - } + Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); + // iterate over the types so that it's deterministic + for ( VariantContext.Type type : VariantContext.Type.values() ) { + if ( VCsByType.containsKey(type) ) + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } for ( VariantContext mergedVC : mergedVCs ) { @@ -198,7 +255,7 @@ public class CombineVariants extends RodWalker { VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); if ( minimalVCF ) annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY)); - vcfWriter.add(annotatedMergedVC, ref.getBase()); + vcfWriter.add(annotatedMergedVC); } return vcs.isEmpty() ? 
0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index b45ee1b67..4c2222f3a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -24,7 +24,9 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -44,9 +46,11 @@ import java.util.Set; * Filters a lifted-over VCF file for ref bases that have been changed. 
*/ @Reference(window=@Window(start=0,stop=100)) -@Requires(value={},referenceMetaData=@RMD(name="variant",type= VariantContext.class)) public class FilterLiftedVariants extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + private static final int MAX_VARIANT_SIZE = 100; @Output(doc="File to which variants should be written",required=true) @@ -55,10 +59,11 @@ public class FilterLiftedVariants extends RodWalker { private long failedLocs = 0, totalLocs = 0; public void initialize() { - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + String trackName = variantCollection.variants.getName(); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey("variant") ? vcfHeaders.get("variant").getMetaData() : null, samples); + final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey(trackName) ? 
vcfHeaders.get(trackName).getMetaData() : null, samples); writer.writeHeader(vcfHeader); } @@ -78,14 +83,14 @@ public class FilterLiftedVariants extends RodWalker { if ( failed ) failedLocs++; else - writer.add(vc, ref[0]); + writer.add(vc); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return 0; - Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); for ( VariantContext vc : VCs ) filterAndWrite(ref.getBases(), vc); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index 2ebd183f4..c9f330db5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -28,7 +28,9 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -44,21 +46,49 @@ import java.util.*; /** * Left-aligns indels from a variants file. + * + *

    + * LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be + * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to + * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * + *

    Input

    + *

    + * A variant set to left-align. + *

    + * + *

    Output

    + *

    + * A left-aligned VCF. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T LeftAlignVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf
    + * 
    + * */ @Reference(window=@Window(start=-200,stop=200)) -@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) public class LeftAlignVariants extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + @Output(doc="File to which variants should be written",required=true) protected VCFWriter baseWriter = null; private SortingVCFWriter writer; public void initialize() { - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + String trackName = variantCollection.variants.getName(); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set headerLines = vcfHeaders.get("variant").getMetaData(); + Set headerLines = vcfHeaders.get(trackName).getMetaData(); baseWriter.writeHeader(new VCFHeader(headerLines, samples)); writer = new SortingVCFWriter(baseWriter, 200); @@ -68,7 +98,7 @@ public class LeftAlignVariants extends RodWalker { if ( tracker == null ) return 0; - Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); int changedSites = 0; for ( VariantContext vc : VCs ) @@ -90,10 +120,10 @@ public class LeftAlignVariants extends RodWalker { private int alignAndWrite(VariantContext vc, final ReferenceContext ref) { - if ( vc.isBiallelic() && vc.isIndel() ) + if ( vc.isBiallelic() && vc.isIndel() && !vc.isComplexIndel() ) return writeLeftAlignedIndel(vc, ref); else { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } } @@ -103,13 +133,13 @@ public class LeftAlignVariants extends RodWalker { // get the indel length 
int indelLength; - if ( vc.isDeletion() ) + if ( vc.isSimpleDeletion() ) indelLength = vc.getReference().length(); else indelLength = vc.getAlternateAllele(0).length(); if ( indelLength > 200 ) { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } @@ -120,7 +150,7 @@ public class LeftAlignVariants extends RodWalker { // create a CIGAR string to represent the event ArrayList elements = new ArrayList(); elements.add(new CigarElement(originalIndex, CigarOperator.M)); - elements.add(new CigarElement(indelLength, vc.isDeletion() ? CigarOperator.D : CigarOperator.I)); + elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? CigarOperator.D : CigarOperator.I)); elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M)); Cigar originalCigar = new Cigar(elements); @@ -135,32 +165,27 @@ public class LeftAlignVariants extends RodWalker { int indelIndex = originalIndex-difference; byte[] newBases = new byte[indelLength]; - System.arraycopy((vc.isDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength); - Allele newAllele = Allele.create(newBases, vc.isDeletion()); - newVC = updateAllele(newVC, newAllele); + System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength); + Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion()); + newVC = updateAllele(newVC, newAllele, refSeq[indelIndex-1]); - // we need to update the reference base just in case it changed - Map attrs = new HashMap(newVC.getAttributes()); - attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refSeq[indelIndex-1]); - newVC = VariantContext.modifyAttributes(newVC, attrs); - - writer.add(newVC, refSeq[indelIndex-1]); + writer.add(newVC); return 1; } else { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } } private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) { - byte[] hap = new byte[ref.length + (indelLength * (vc.isDeletion() ? 
-1 : 1))]; + byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? -1 : 1))]; // add the bases before the indel System.arraycopy(ref, 0, hap, 0, indexOfRef); int currentPos = indexOfRef; // take care of the indel - if ( vc.isDeletion() ) { + if ( vc.isSimpleDeletion() ) { indexOfRef += indelLength; } else { System.arraycopy(vc.getAlternateAllele(0).getBases(), 0, hap, currentPos, indelLength); @@ -173,7 +198,7 @@ public class LeftAlignVariants extends RodWalker { return hap; } - public static VariantContext updateAllele(VariantContext vc, Allele newAllele) { + public static VariantContext updateAllele(VariantContext vc, Allele newAllele, Byte refBaseForIndel) { // create a mapping from original allele to new allele HashMap alleleMap = new HashMap(vc.getAlleles().size()); if ( newAllele.isReference() ) { @@ -197,6 +222,6 @@ public class LeftAlignVariants extends RodWalker { newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, vc.getAttributes(), refBaseForIndel); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 4f05c8aac..1c76a21ea 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -29,13 +29,11 @@ import net.sf.picard.liftover.LiftOver; import net.sf.picard.util.Interval; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -49,9 +47,11 @@ import java.util.*; /** * Lifts a VCF file over from one build to another. Note that the resulting VCF could be mis-sorted. 
*/ -@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) public class LiftoverVariants extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + @Output(doc="File to which variants should be written",required=true) protected File file = null; protected StandardVCFWriter writer = null; @@ -85,12 +85,13 @@ public class LiftoverVariants extends RodWalker { throw new UserException.BadInput("the chain file you are using is not compatible with the reference you are trying to lift over to; please use the appropriate chain file for the given reference"); } - Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + String trackName = variantCollection.variants.getName(); + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(trackName)); + Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); Set metaData = new HashSet(); - if ( vcfHeaders.containsKey("variant") ) - metaData.addAll(vcfHeaders.get("variant").getMetaData()); + if ( vcfHeaders.containsKey(trackName) ) + metaData.addAll(vcfHeaders.get(trackName).getMetaData()); if ( RECORD_ORIGINAL_LOCATION ) { metaData.add(new VCFInfoHeaderLine("OriginalChr", 1, VCFHeaderLineType.String, "Original contig name for the record")); metaData.add(new VCFInfoHeaderLine("OriginalStart", 1, VCFHeaderLineType.Integer, "Original start position for the record")); @@ -125,14 +126,14 @@ public class LiftoverVariants extends RodWalker { vc = VariantContext.modifyAttributes(vc, attrs); } - VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false); + VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, false); if ( originalVC.isSNP() && 
originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0))); } - writer.add(vc, ref.getBase()); + writer.add(vc); successfulIntervals++; } else { failedIntervals++; @@ -143,7 +144,7 @@ public class LiftoverVariants extends RodWalker { if ( tracker == null ) return 0; - Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); for ( VariantContext vc : VCs ) convertAndWrite(vc, ref); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java index f0756d884..1fefd20fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java @@ -24,14 +24,12 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import 
org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -39,17 +37,16 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** * Takes a VCF file, randomly splits variants into two different sets, and outputs 2 new VCFs with the results. */ -@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) public class RandomlySplitVariants extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + @Output(fullName="out1", shortName="o1", doc="File #1 to which variants should be written", required=true) protected VCFWriter vcfWriter1 = null; @@ -61,8 +58,6 @@ public class RandomlySplitVariants extends RodWalker { @Argument(fullName="fractionToOut1", shortName="fraction", doc="Fraction of records to be placed in out1 (must be 0 >= fraction <= 1); all other records are placed in out2", required=false) protected double fraction = 0.5; - protected static final String INPUT_VARIANT_ROD_BINDING_NAME = "variant"; - protected int iFraction; /** @@ -74,8 +69,7 @@ public class RandomlySplitVariants extends RodWalker { iFraction = (int)(fraction * 1000.0); // setup the header info - final ArrayList inputNames = new ArrayList(); - inputNames.add( INPUT_VARIANT_ROD_BINDING_NAME ); + final List inputNames = Arrays.asList(variantCollection.variants.getName()); Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames); Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); @@ -97,13 +91,13 @@ public class 
RandomlySplitVariants extends RodWalker { if ( tracker == null ) return 0; - Collection vcs = tracker.getVariantContexts(ref, INPUT_VARIANT_ROD_BINDING_NAME, null, context.getLocation(), true, false); + Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); for ( VariantContext vc : vcs ) { int random = GenomeAnalysisEngine.getRandomGenerator().nextInt(1000); if ( random < iFraction ) - vcfWriter1.add(vc, ref.getBase()); + vcfWriter1.add(vc); else - vcfWriter2.add(vc, ref.getBase()); + vcfWriter2.add(vc); } return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index ac6797609..018c4dcc2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,84 +24,239 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; +import org.apache.poi.hpsf.Variant; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; -import 
org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintStream; -import java.lang.annotation.AnnotationFormatError; import java.util.*; /** - * Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria, - * recompute the value of certain annotations based on the new sample set, and output a new VCF with the results. + * Selects variants from a VCF source. + * + *

    + * Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses + * (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain + * requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose. + * Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a + * pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of + * coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are + * documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions). + * One can optionally include concordance or discordance tracks for use in selecting overlapping variants. + * + *

    Input

    + *

    + * A variant set to select from. + *

    + * + *

    Output

    + *

    + * A selected VCF. + *

    + * + *

    Examples

    + *
    + * Select two samples out of a VCF with many samples:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn SAMPLE_A_PARC \
    + *   -sn SAMPLE_B_ACTG
    + *
    + * Select two samples and any sample that matches a regular expression:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn SAMPLE_1_PARC \
    + *   -sn SAMPLE_1_ACTG \
    + *   -sn 'SAMPLE.+PARC'
    + *
    + * Select any sample that matches a regular expression and sites where the QD annotation is more than 10:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn 'SAMPLE.+PARC' \
    + *   -select "QD > 10.0"
    + *
    + * Select a sample and exclude non-variant loci and filtered loci:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -sn SAMPLE_1_ACTG \
    + *   -env \
    + *   -ef
    + *
    + * Select a sample and restrict the output vcf to a set of intervals:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -L /path/to/my.interval_list \
    + *   -sn SAMPLE_1_ACTG
    + *
    + * Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset):
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant hapmap.vcf \
    + *   --discordance myCalls.vcf \
    + *   -o output.vcf \
    + *   -sn mySample
    + *
    + * Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers):
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant myCalls.vcf \
    + *   --concordance hisCalls.vcf \
    + *   -o output.vcf \
    + *   -sn mySample
    + *
    + * Generating a VCF of all the variants that are mendelian violations:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -family NA12891+NA12892=NA12878 \
    + *   -mvq 50 \
    + *   -o violations.vcf
    + *
    + * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -number 1000
    + *
    + * Creating a set with 50% of the total number of variants in the variant VCF:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -fraction 0.5
    + *
    + * Select only indels from a VCF:
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -selectType INDEL
    + *
    + * Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T SelectVariants \
    + *   --variant input.vcf \
    + *   -o output.vcf \
    + *   -selectType SNP -selectType MNP \
    + *   -restrictAllelesTo MULTIALLELIC
    + *
    + * 
    + * */ -@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class)) public class SelectVariants extends RodWalker { + @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype + * and either the site isn't present in this track, the sample isn't present in this track, + * or the sample is called reference in this track. + */ + @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false) + private RodBinding discordanceTrack; + + /** + * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called + * in both the variant and concordance tracks or (2) every sample present in the variant track is present in the + * concordance track and they have the sample genotype call. + */ + @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false) + private RodBinding concordanceTrack; @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) - public Set sampleNames; + @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) + public Set sampleNames = new HashSet(0); - @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. 
Can be specified multiple times.", required=false) - public Set sampleExpressions; + @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) + public Set sampleExpressions ; - @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) + @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) public Set sampleFiles; - @Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false) + /** + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + */ + @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false) + public Set XLsampleNames = new HashSet(0); + + /** + * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. + */ + @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false) + public Set XLsampleFiles = new HashSet(0); + + /** + * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated. 
+ */ + @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) public ArrayList SELECT_EXPRESSIONS = new ArrayList(); - @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) private boolean EXCLUDE_NON_VARIANTS = false; - @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false) + @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) private boolean EXCLUDE_FILTERED = false; - @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't include filtered loci.", required=false) + + /** + * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. + * For example, a multiallelic record such as: + * 1 100 . A AAA,AAAAA + * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as: + * 1 100 . A T + * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC + */ + @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. 
Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) + private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; + + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; - @Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false) - private String discordanceRodName = ""; - - @Argument(fullName="concordance", shortName = "conc", doc="Output variants that were also called on a ROD comparison track. Use -conc ROD_NAME", required=false) - private String concordanceRodName = ""; - @Hidden - @Argument(fullName="inputAF", shortName = "inputAF", doc="", required=false) - private String inputAFRodName = ""; - - @Hidden - @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) + @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) private boolean KEEP_AF_SPECTRUM = false; @Hidden @@ -109,51 +264,71 @@ public class SelectVariants extends RodWalker { private File AF_FILE = new File(""); @Hidden - @Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false) private File FAMILY_STRUCTURE_FILE = null; - @Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! 
string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + /** + * String formatted as dad+mom=child where these parameters determine which sample names are examined. + */ + @Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) private String FAMILY_STRUCTURE = ""; - @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false) + /** + * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure. + */ + @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false) private Boolean MENDELIAN_VIOLATIONS = false; @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; - @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track. Variants are kept in memory to guarantee that n variants will be output, so use it only for a reasonable number of variants. Use select_random_fraction for larger numbers of variants", required=false) + /** + * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable + * number of variants. Use --select_random_fraction for larger numbers of variants. 
+ */ + @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) private int numRandom = 0; - @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track. Routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions", required=false) + /** + * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. + */ + @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; - @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs.", required=false) - private boolean SELECT_SNPS = false; + /** + * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. + * When specified one or more times, a particular type of variant is selected. + * + */ + @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) + private List TYPES_TO_INCLUDE = new ArrayList(); - @Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false) - private boolean SELECT_INDELS = false; @Hidden - @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! 
string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) - private String outMVFile = null; + @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false) + private String outMVFile = null; /* Private class used to store the intermediate variants in the integer random selection process */ private class RandomVariantStructure { private VariantContext vc; - private byte refBase; - RandomVariantStructure(VariantContext vcP, byte refBaseP) { + RandomVariantStructure(VariantContext vcP) { vc = vcP; - refBase = refBaseP; } - public void set (VariantContext vcP, byte refBaseP) { + public void set (VariantContext vcP) { vc = vcP; - refBase = refBaseP; } } + public enum NumberAlleleRestriction { + ALL, + BIALLELIC, + MULTIALLELIC + } + + private ArrayList selectedTypes = new ArrayList(); private ArrayList selectNames = new ArrayList(); private List jexls = null; @@ -165,9 +340,6 @@ public class SelectVariants extends RodWalker { private Set mvSet = new HashSet(); - /* default name for the variant dataset (VCF) */ - private final String variantRodName = "variant"; - /* variables used by the SELECT RANDOM modules */ private boolean SELECT_RANDOM_NUMBER = false; @@ -183,8 +355,7 @@ public class SelectVariants extends RodWalker { private ArrayList afBoosts = null; double bkDelta = 0.0; - - private PrintStream outMVFileStream = null; + private PrintStream outMVFileStream = null; /** @@ -192,8 +363,7 @@ public class SelectVariants extends RodWalker { */ public void initialize() { // Get list of samples to include in the output - ArrayList rodNames = new ArrayList(); - rodNames.add(variantRodName); + List rodNames = Arrays.asList(variantCollection.variants.getName()); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); @@ -201,20 +371,42 @@ public class 
SelectVariants extends RodWalker { Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); + // first, add any requested samples samples.addAll(samplesFromFile); samples.addAll(samplesFromExpressions); - if (sampleNames != null) - samples.addAll(sampleNames); + samples.addAll(sampleNames); - if(samples.isEmpty()) { + // if none were requested, we want all of them + if ( samples.isEmpty() ) { samples.addAll(vcfSamples); NO_SAMPLES_SPECIFIED = true; } - for (String sample : samples) { - logger.info("Including sample '" + sample + "'"); - } + // now, exclude any requested samples + Collection XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); + samples.removeAll(XLsamplesFromFile); + samples.removeAll(XLsampleNames); + if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED ) + throw new UserException("All samples requested to be included were also requested to be excluded."); + + for ( String sample : samples ) + logger.info("Including sample '" + sample + "'"); + + + + // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include + if (TYPES_TO_INCLUDE.isEmpty()) { + + for (VariantContext.Type t : VariantContext.Type.values()) + selectedTypes.add(t); + + } + else { + for (VariantContext.Type t : TYPES_TO_INCLUDE) + selectedTypes.add(t); + + } // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); @@ -235,11 +427,11 @@ public class SelectVariants extends RodWalker { jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); // Look at the parameters to decide which analysis to perform - DISCORDANCE_ONLY = discordanceRodName.length() > 0; - if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + 
discordanceRodName); + DISCORDANCE_ONLY = discordanceTrack.isBound(); + if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName()); - CONCORDANCE_ONLY = concordanceRodName.length() > 0; - if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName); + CONCORDANCE_ONLY = concordanceTrack.isBound(); + if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { if ( FAMILY_STRUCTURE_FILE != null) { @@ -274,7 +466,7 @@ public class SelectVariants extends RodWalker { } SELECT_RANDOM_FRACTION = fractionRandom > 0; - if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track"); + if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); if (KEEP_AF_SPECTRUM) { @@ -317,7 +509,7 @@ public class SelectVariants extends RodWalker { if ( tracker == null ) return 0; - Collection vcs = tracker.getVariantContexts(ref, variantRodName, null, context.getLocation(), true, false); + Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); if ( vcs == null || vcs.size() == 0) { return 0; @@ -345,22 +537,23 @@ public class SelectVariants extends RodWalker { break; } if (DISCORDANCE_ONLY) { - Collection compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false); + Collection compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) return 0; } if (CONCORDANCE_ONLY) { - Collection compVCs = tracker.getVariantContexts(ref, concordanceRodName, null, context.getLocation(), true, false); + Collection compVCs = tracker.getValues(concordanceTrack, context.getLocation()); if (!isConcordant(vc, compVCs)) return 0; } - // 
TODO - add ability to also select MNPs - // TODO - move variant selection arguments to the engine so other walkers can also do this - if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed())) + if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) continue; - if (SELECT_SNPS && !vc.isSNP()) + if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) + continue; + + if (!selectedTypes.contains(vc.getType())) continue; VariantContext sub = subsetRecord(vc, samples); @@ -374,7 +567,7 @@ public class SelectVariants extends RodWalker { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { - vcfWriter.add(sub, ref.getBase()); + vcfWriter.add(sub); } else { if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { @@ -422,7 +615,7 @@ public class SelectVariants extends RodWalker { //System.out.format("%s .. 
%4.4f\n",afo.toString(), af); if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost) - vcfWriter.add(sub, ref.getBase()); + vcfWriter.add(sub); } @@ -529,7 +722,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_NUMBER) { int positionToPrint = positionToAdd; for (int i=0; i { VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles()); + // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) + if ( vc.getAlleles().size() != sub.getAlleles().size() ) + sub = VariantContext.modifyGenotypes(sub, VariantContextUtils.stripPLs(vc.getGenotypes())); + HashMap attributes = new HashMap(sub.getAttributes()); int depth = 0; @@ -592,13 +789,13 @@ public class SelectVariants extends RodWalker { private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { if (nVariantsAdded < numRandom) - variantArray[nVariantsAdded++] = new RandomVariantStructure(vc, refBase); + variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); else { double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); double t = (1.0/(rank-numRandom+1)); if ( v < t) { - variantArray[positionToAdd].set(vc, refBase); + variantArray[positionToAdd].set(vc); nVariantsAdded++; positionToAdd = nextCircularPosition(positionToAdd); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index 0644c669b..fdfca982c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broad.tribble.TribbleException; -import org.broad.tribble.dbsnp.DbSNPFeature; -import 
org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -42,18 +40,40 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; import java.util.Collection; import java.util.HashSet; -import java.util.List; import java.util.Set; /** - * Validates a variants file. + * Strictly validates a variants file. + * + *

    + * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. + * Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs + * when a dbSNP file is provided, and that all alternate alleles are present in at least one sample. + * + *

    Input

    + *

    + * A variant set to filter. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T ValidateVariants \
    + *   --variant input.vcf \
    + *   --dbsnp dbsnp.vcf
    + * 
    + * */ @Reference(window=@Window(start=0,stop=100)) -@Requires(value={},referenceMetaData=@RMD(name=ValidateVariants.TARGET_ROD_NAME, type=VariantContext.class)) public class ValidateVariants extends RodWalker { - protected static final String TARGET_ROD_NAME = "variant"; + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public enum ValidationType { ALL, REF, IDS, ALLELES, CHR_COUNTS @@ -63,10 +83,13 @@ public class ValidateVariants extends RodWalker { @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) protected ValidationType type = ValidationType.ALL; - @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false) + /** + * By default, even filtered records are validated. 
+ */ + @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false) protected Boolean DO_NOT_VALIDATE_FILTERED = false; - @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false) + @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false) protected Boolean WARN_ON_ERROR = false; private long numErrors = 0; @@ -74,19 +97,14 @@ public class ValidateVariants extends RodWalker { private File file = null; public void initialize() { - for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) { - if ( source.getName().equals(TARGET_ROD_NAME) ) { - file = source.getFile(); - break; - } - } + file = new File(variantCollection.variants.getSource()); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return 0; - Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); for ( VariantContext vc : VCs ) validate(vc, tracker, ref); @@ -112,11 +130,11 @@ public class ValidateVariants extends RodWalker { Allele reportedRefAllele = vc.getReference(); Allele observedRefAllele; // insertions - if ( vc.isInsertion() ) { + if ( vc.isSimpleInsertion() ) { observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING); } // deletions - else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) { + else if ( vc.isSimpleDeletion() || vc.isMixed() || vc.isMNP() ) { // we can't validate arbitrarily long deletions if ( reportedRefAllele.length() > 100 ) { logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", 
reportedRefAllele.length(), vc.getChr(), vc.getStart())); @@ -142,22 +160,19 @@ public class ValidateVariants extends RodWalker { // get the RS IDs Set rsIDs = null; - if ( tracker.hasROD(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - List dbsnpList = tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME); + if ( tracker.hasValues(dbsnp.dbsnp) ) { rsIDs = new HashSet(); - for ( Object d : dbsnpList ) { - if (d instanceof DbSNPFeature ) - rsIDs.add(((DbSNPFeature)d).getRsID()); - } + for ( VariantContext rsID : tracker.getValues(dbsnp.dbsnp, ref.getLocus()) ) + rsIDs.add(rsID.getID()); } try { switch( type ) { case ALL: - vc.extraStrictValidation(observedRefAllele, rsIDs); + vc.extraStrictValidation(observedRefAllele, ref.getBase(), rsIDs); break; case REF: - vc.validateReferenceBases(observedRefAllele); + vc.validateReferenceBases(observedRefAllele, ref.getBase()); break; case IDS: vc.validateRSIDs(rsIDs); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 86bb3b0e8..8eaf976d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -25,8 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.walkers.*; import 
org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -42,22 +41,66 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; /** - * Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes) + * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes) + * + *

    + * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). + * The tool produces a VCF that is annotated with information pertaining to plate quality control and by + * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability. + * If you have .ped files, please first convert them to VCF format + * (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf). + * + *

    Input

    + *

    + * A validation VCF to annotate. + *

    + * + *

    Output

    + *

    + * An annotated VCF. Additionally, a table like the following will be output: + *

    + *     Total number of samples assayed:                  185
    + *     Total number of records processed:                152
    + *     Number of Hardy-Weinberg violations:              34 (22%)
    + *     Number of no-call violations:                     12 (7%)
    + *     Number of homozygous variant violations:          0 (0%)
    + *     Number of records passing all filters:            106 (69%)
    + *     Number of passing records that are polymorphic:   98 (92%)
    + * 
    + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantValidationAssessor \
    + *   --variant input.vcf \
    + *   -o output.vcf
    + * 
    + * */ @Reference(window=@Window(start=0,stop=40)) -@Requires(value={},referenceMetaData=@RMD(name=VariantValidationAssessor.INPUT_VARIANT_ROD_BINDING_NAME, type=VariantContext.class)) -public class VariantValidationAssessor extends RodWalker,Integer> { +public class VariantValidationAssessor extends RodWalker { - public static final String INPUT_VARIANT_ROD_BINDING_NAME = "variant"; + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfwriter = null; - @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false) + @Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false) protected double maxHardy = 20.0; - @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false) + + /** + * To disable, set to a value greater than 1. + */ + @Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false) protected double maxNoCall = 0.05; - @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false) + + /** + * To disable, set to a value greater than 1. 
+ */ + @Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false) protected double maxHomNonref = 1.1; //@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+ @@ -68,7 +111,7 @@ public class VariantValidationAssessor extends RodWalker sampleNames = null; // variant context records - private ArrayList> records = new ArrayList>(); + private ArrayList records = new ArrayList(); // statistics private int numRecords = 0; @@ -89,11 +132,11 @@ public class VariantValidationAssessor extends RodWalker map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return null; - VariantContext vc = tracker.getVariantContext(ref, INPUT_VARIANT_ROD_BINDING_NAME, ref.getLocus()); + VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus()); // ignore places where we don't have a variant if ( vc == null ) return null; @@ -101,10 +144,10 @@ public class VariantValidationAssessor extends RodWalker(vc.getSampleNames()); - return addVariantInformationToCall(ref, vc); + return addVariantInformationToCall(vc); } - public Integer reduce(Pair call, Integer numVariants) { + public Integer reduce(VariantContext call, Integer numVariants) { if ( call != null ) { numVariants++; records.add(call); @@ -113,8 +156,7 @@ public class VariantValidationAssessor extends RodWalker inputNames = new ArrayList(); - inputNames.add( INPUT_VARIANT_ROD_BINDING_NAME ); + final List inputNames = Arrays.asList(variantCollection.variants.getName()); // setup the header fields Set hInfo = new HashSet(); @@ -155,12 +197,12 @@ public class VariantValidationAssessor extends RodWalker record : records ) - vcfwriter.add(record.first, record.second); + for ( VariantContext record : records ) + 
vcfwriter.add(record); } - private Pair addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { + private VariantContext addVariantInformationToCall(VariantContext vContext) { // check possible filters double hwPvalue = hardyWeinbergCalculation(vContext); @@ -202,9 +244,7 @@ public class VariantValidationAssessor extends RodWalker(vContext, ref.getBase()); + return VariantContext.modifyAttributes(vContext, infoMap); } private double hardyWeinbergCalculation(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 39358dad5..2a877fb09 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -24,14 +24,13 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -41,120 +40,150 @@ import java.io.PrintStream; import java.util.*; /** - * Emits specific fields as dictated by the user from one or more VCF files. 
+ * Emits specific fields from a VCF file to a tab-delimited table + * + *

    + * This walker accepts a single VCF file and writes out user-selected fields from the + * VCF as a header-containing, tab-delimited file. The user specifies one or more + * fields to print with the -F NAME, each of which appears as a single column in + * the output file, with a header named NAME, and the value of this field in the VCF + * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding + * in the INFO field (AC=10). Note that this tool does not support capturing any + * GENOTYPE field values. If a VCF record is missing a value, then the tool by + * default throws an error, but the special value NA can be emitted instead with + * appropriate tool arguments. + * + *

    + * + *

    Input

    + *

    + *

      + *
    • A VCF file
    • + *
    • A list of -F fields to write
    • + *
    + *

    + * + *

    Output

    + *

    + * A table deliminated file containing the values of the requested fields in the VCF file + *

    + * + *

    Examples

    + *
    + *     -T $WalkerName \
    + *     -V file.vcf \
    + *     -F CHROM -F POS -F ID -F QUAL -F AC \
    + *     -o results.table
    + *
    + *     would produce a file that looks like:
    + *
    + *     CHROM    POS ID      QUAL    AC
    + *     1        10  .       50      1
    + *     1        20  rs10    99      10
    + *     et cetera...
    + * 
    + * + * @author Mark DePristo + * @since 2010 */ -@Requires(value={}) public class VariantsToTable extends RodWalker { + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true) - public ArrayList fieldsToTake = new ArrayList(); + /** + * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10). + * Note that this tool does not support capturing any GENOTYPE field values. Note this argument + * accepts any number of inputs. So -F CHROM -F POS is allowed. + */ + @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true) + public List fieldsToTake = new ArrayList(); - @Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records") + /** + * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered). + * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value. + */ + @Advanced + @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false) public boolean showFiltered = false; - @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false) + /** + * If provided, then this tool will exit with success after this number of records have been emitted to the file. 
+ */ + @Advanced + @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) public int MAX_RECORDS = -1; int nRecords = 0; - @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) - public boolean keepMultiAllelic = false; + /** + * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then + * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this + * can make your resulting file unreadable and malformated according to tools like R, as the representation of + * multi-allelic INFO field values can be lists of values. + */ + @Advanced + @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) + public boolean keepMultiAllelic = false; + @Hidden + @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false) + public boolean logACSum = false; + + /** + * By default, this tool throws a UserException when it encounters a field without a value in some record. This + * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being + * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such + * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument + * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. 
+ */ + @Advanced @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; public void initialize() { + // print out the header out.println(Utils.join("\t", fieldsToTake)); } - public static abstract class Getter { public abstract String get(VariantContext vc); } - public static Map getters = new HashMap(); - - static { - // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT - getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); - getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); - getters.put("REF", new Getter() { - public String get(VariantContext vc) { - String x = ""; - if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) { - Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)); - x=x+new String(new byte[]{refByte}); - } - return x+vc.getReference().getDisplayString(); - } - }); - getters.put("ALT", new Getter() { - public String get(VariantContext vc) { - StringBuilder x = new StringBuilder(); - int n = vc.getAlternateAlleles().size(); - if ( n == 0 ) return "."; - if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) { - Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)); - x.append(new String(new byte[]{refByte})); - } - - for ( int i = 0; i < n; i++ ) { - if ( i != 0 ) x.append(","); - x.append(vc.getAlternateAllele(i).getDisplayString()); - } - return x.toString(); - } - }); - getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); - getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { - if ( vc.isSNP() && vc.isBiallelic() ) - return VariantContextUtils.isTransition(vc) ? 
"1" : "0"; - else - return "-1"; - }}); - getters.put("FILTER", new Getter() { public String get(VariantContext vc) { - return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } - }); - - getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); - getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); - getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); - getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); - getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); - getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); - getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); - getters.put("GQ", new Getter() { public String get(VariantContext vc) { - if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); - }}); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) // RodWalkers can make funky map calls return 0; - if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) { - Collection vcs = tracker.getAllVariantContexts(ref, context.getLocation()); - for ( VariantContext vc : vcs) { - if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { - List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA); - out.println(Utils.join("\t", vals)); - } + for ( VariantContext vc : 
tracker.getValues(variantCollection.variants, context.getLocation())) { + if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) { + List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum); + out.println(Utils.join("\t", vals)); } - - return 1; - } else { - if ( nRecords >= MAX_RECORDS ) { - logger.warn("Calling sys exit to leave after " + nRecords + " records"); - System.exit(0); // todo -- what's the recommend way to abort like this? - } - return 0; } + + return 1; + } + + @Override + public boolean isDone() { + boolean done = MAX_RECORDS != -1 && nRecords >= MAX_RECORDS; + if ( done) logger.warn("isDone() will return true to leave after " + nRecords + " records"); + return done ; } private static final boolean isWildCard(String s) { return s.endsWith("*"); } - public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + /** + * Utility function that returns the list of values for each field in fields from vc. + * + * @param vc the VariantContext whose field values we can to capture + * @param fields a non-null list of fields to capture from VC + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise + * provides a value of NA + * @param kma if true, multiallelic variants are to be kept + * @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count. 
+ * @return + */ + private static List extractFields(VariantContext vc, List fields, boolean allowMissingData, boolean kma, boolean logsum) { List vals = new ArrayList(); for ( String field : fields ) { @@ -195,6 +224,9 @@ public class VariantsToTable extends RodWalker { for (int k=0; k < afd.length; k++) afd[k] = Double.valueOf(afs[k]); + if (kma && logsum) + af = MathUtils.sum(afd); + else af = MathUtils.arrayMax(afd); //af = Double.valueOf(afs[0]); @@ -212,13 +244,78 @@ public class VariantsToTable extends RodWalker { return vals; } - public Integer reduceInit() { - return 0; + public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { + return extractFields(vc, fields, allowMissingData, false, false); } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - + // + // default reduce -- doesn't do anything at all + // + public Integer reduceInit() { return 0; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } public void onTraversalDone(Integer sum) {} + + // ---------------------------------------------------------------------------------------------------- + // + // static system for getting values from VC by name. 
+ // + // ---------------------------------------------------------------------------------------------------- + + public static abstract class Getter { public abstract String get(VariantContext vc); } + public static Map getters = new HashMap(); + + static { + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); + getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); + getters.put("REF", new Getter() { + public String get(VariantContext vc) { + String x = ""; + if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x=x+new String(new byte[]{refByte}); + } + return x+vc.getReference().getDisplayString(); + } + }); + getters.put("ALT", new Getter() { + public String get(VariantContext vc) { + StringBuilder x = new StringBuilder(); + int n = vc.getAlternateAlleles().size(); + if ( n == 0 ) return "."; + if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x.append(new String(new byte[]{refByte})); + } + + for ( int i = 0; i < n; i++ ) { + if ( i != 0 ) x.append(","); + x.append(vc.getAlternateAllele(i).getDisplayString()); + } + return x.toString(); + } + }); + getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); + getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { + if ( vc.isSNP() && vc.isBiallelic() ) + return VariantContextUtils.isTransition(vc) ? "1" : "0"; + else + return "-1"; + }}); + getters.put("FILTER", new Getter() { public String get(VariantContext vc) { + return vc.isNotFiltered() ? 
"PASS" : Utils.join(",", vc.getFilters()); } + }); + + getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); + getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); + getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); + getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); + getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); + getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("GQ", new Getter() { public String get(VariantContext vc) { + if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); + return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + }}); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index aa0e5987f..9b33f8537 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -26,22 +26,20 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import net.sf.samtools.util.CloseableIterator; -import org.broad.tribble.dbsnp.DbSNPCodec; -import org.broad.tribble.dbsnp.DbSNPFeature; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import 
org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.hapmap.HapMapFeature; +import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -49,31 +47,67 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import java.io.File; import java.util.*; /** * Converts variants from other file formats to VCF format. + * + *

    + * Note that there must be a Tribble feature/codec for the file format as well as an adaptor. + * + *

    Input

    + *

    + * A variant file to convert to VCF. + *

    + * + *

    Output

    + *

    + * A VCF file. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -R ref.fasta \
    + *   -T VariantsToVCF \
    + *   -o output.vcf \
    + *   --variant:RawHapMap input.hapmap \
    + *   --dbsnp dbsnp.vcf
    + * 
    + * */ -@Requires(value={},referenceMetaData=@RMD(name=VariantsToVCF.INPUT_ROD_NAME, type=VariantContext.class)) @Reference(window=@Window(start=-40,stop=40)) public class VariantsToVCF extends RodWalker { @Output(doc="File to which variants should be written",required=true) protected VCFWriter baseWriter = null; - private SortingVCFWriter vcfwriter; // needed because hapmap indel records move + private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move - public static final String INPUT_ROD_NAME = "variant"; + /** + * Variants from this input file are used by this tool as input. + */ + @Input(fullName="variant", shortName = "V", doc="Input variant file", required=true) + public RodBinding variants; - @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false) + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + + /** + * This argument is used for data (like GELI) with genotypes but no sample names encoded within. + */ + @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false) protected String sampleName = null; + /** + * This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF). 
+ */ + @Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false) + protected boolean fixReferenceBase = false; + private Set allowedGenotypeFormatStrings = new HashSet(); private boolean wroteHeader = false; - // Don't allow mixed types for now - private EnumSet ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, - VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL, VariantContext.Type.MNP); - // for dealing with indels in hapmap CloseableIterator dbsnpIterator = null; @@ -85,7 +119,7 @@ public class VariantsToVCF extends RodWalker { if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) ) return 0; - String rsID = DbSNPHelper.rsIDOfFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); + String rsID = dbsnp == null ? null : VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP); Collection contexts = getVariantContexts(tracker, ref); @@ -97,108 +131,96 @@ public class VariantsToVCF extends RodWalker { } // set the appropriate sample name if necessary - if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(INPUT_ROD_NAME) ) { - Genotype g = Genotype.modifyName(vc.getGenotype(INPUT_ROD_NAME), sampleName); + if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) { + Genotype g = Genotype.modifyName(vc.getGenotype(variants.getName()), sampleName); Map genotypes = new HashMap(); genotypes.put(sampleName, g); vc = VariantContext.modifyGenotypes(vc, genotypes); } - writeRecord(vc, tracker, ref.getBase()); + if ( fixReferenceBase ) { + vc = VariantContext.modifyReferencePadding(vc, ref.getBase()); + } + + writeRecord(vc, tracker, ref.getLocus()); } return 1; } private Collection getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref) { - // we need to special case the HapMap format because indels aren't handled correctly - 
List features = tracker.getReferenceMetaData(INPUT_ROD_NAME, true); - if ( features.size() > 0 && features.get(0) instanceof HapMapFeature ) { - ArrayList hapmapVCs = new ArrayList(features.size()); - for ( Object feature : features ) { - HapMapFeature hapmap = (HapMapFeature)feature; - Byte refBase = null; - // if it's an indel, we need to figure out the alleles - if ( hapmap.getAlleles()[0].equals("-") ) { - Map alleleMap = new HashMap(2); + List features = tracker.getValues(variants, ref.getLocus()); + List VCs = new ArrayList(features.size()); - // get the dbsnp object corresponding to this record, so we can learn whether this is an insertion or deletion - DbSNPFeature dbsnp = getDbsnpFeature(hapmap.getName()); - if ( dbsnp == null || dbsnp.getVariantType().equalsIgnoreCase("mixed") ) - continue; + for ( Feature record : features ) { + if ( VariantContextAdaptors.canBeConvertedToVariantContext(record) ) { + // we need to special case the HapMap format because indels aren't handled correctly + if ( record instanceof RawHapMapFeature) { - boolean isInsertion = dbsnp.getVariantType().equalsIgnoreCase("insertion"); + // is it an indel? 
+ RawHapMapFeature hapmap = (RawHapMapFeature)record; + if ( hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING) || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING) ) { + // get the dbsnp object corresponding to this record (needed to help us distinguish between insertions and deletions) + VariantContext dbsnpVC = getDbsnp(hapmap.getName()); + if ( dbsnpVC == null || dbsnpVC.isMixed() ) + continue; - alleleMap.put(HapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, isInsertion)); - alleleMap.put(HapMapFeature.INSERTION, Allele.create(hapmap.getAlleles()[1], !isInsertion)); - hapmap.setActualAlleles(alleleMap); + Map alleleMap = new HashMap(2); + alleleMap.put(RawHapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, dbsnpVC.isSimpleInsertion())); + alleleMap.put(RawHapMapFeature.INSERTION, Allele.create(((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isSimpleInsertion())); + hapmap.setActualAlleles(alleleMap); - // also, use the correct positioning for insertions - if ( isInsertion ) - hapmap.updatePosition(dbsnp.getStart()); - else - hapmap.updatePosition(dbsnp.getStart() - 1); + // also, use the correct positioning for insertions + hapmap.updatePosition(dbsnpVC.getStart()); - if ( hapmap.getStart() < ref.getWindow().getStart() ) { - logger.warn("Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping..."); - continue; + if ( hapmap.getStart() < ref.getWindow().getStart() ) { + logger.warn("Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping..."); + continue; + } } - refBase = ref.getBases()[hapmap.getStart() - ref.getWindow().getStart()]; - } - VariantContext vc = VariantContextAdaptors.toVariantContext(INPUT_ROD_NAME, hapmap, ref); - if ( vc != null ) { - if ( refBase != null ) { - Map attrs = new HashMap(vc.getAttributes()); - attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase); - vc = 
VariantContext.modifyAttributes(vc, attrs); - } - hapmapVCs.add(vc); } + + // ok, we might actually be able to turn this record in a variant context + VariantContext vc = VariantContextAdaptors.toVariantContext(variants.getName(), record, ref); + + if ( vc != null ) // sometimes the track has odd stuff in it that can't be converted + VCs.add(vc); } - return hapmapVCs; } - // for everything else, we can just convert to VariantContext - return tracker.getVariantContexts(ref, INPUT_ROD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, ref.getLocus(), true, false); + return VCs; } - private DbSNPFeature getDbsnpFeature(String rsID) { + private VariantContext getDbsnp(String rsID) { if ( dbsnpIterator == null ) { - ReferenceOrderedDataSource dbsnpDataSource = null; - for ( ReferenceOrderedDataSource ds : getToolkit().getRodDataSources() ) { - if ( ds.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - dbsnpDataSource = ds; - break; - } - } - if ( dbsnpDataSource == null ) + if ( dbsnp == null ) throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records"); RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe); - dbsnpIterator = builder.createInstanceOfTrack(DbSNPCodec.class, dbsnpDataSource.getFile()).getIterator(); + dbsnpIterator = builder.createInstanceOfTrack(VCFCodec.class, new File(dbsnp.dbsnp.getSource())).getIterator(); // Note that we should really use some sort of seekable iterator here so that the search doesn't take forever // (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to) } while ( dbsnpIterator.hasNext() ) { GATKFeature feature = dbsnpIterator.next(); - DbSNPFeature dbsnp = (DbSNPFeature)feature.getUnderlyingObject(); - if ( dbsnp.getRsID().equals(rsID) ) - return dbsnp; + 
VariantContext vc = (VariantContext)feature.getUnderlyingObject(); + if ( vc.hasID() && vc.getID().equals(rsID) ) + return vc; } return null; } - private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, byte ref) { + private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, GenomeLoc loc) { if ( !wroteHeader ) { wroteHeader = true; // setup the header fields Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName()))); //hInfo.add(new VCFHeaderLine("source", "VariantsToVCF")); //hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); @@ -214,16 +236,16 @@ public class VariantsToVCF extends RodWalker { samples.add(sampleName); } else { // try VCF first - samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME)); + samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variants.getName())); if ( samples.isEmpty() ) { - List rods = tracker.getReferenceMetaData(INPUT_ROD_NAME); - if ( rods.size() == 0 ) - throw new IllegalStateException("No rod data is present"); + List features = tracker.getValues(variants, loc); + if ( features.size() == 0 ) + throw new IllegalStateException("No rod data is present, but we just created a VariantContext"); - Object rod = rods.get(0); - if ( rod instanceof HapMapFeature) - samples.addAll(Arrays.asList(((HapMapFeature)rod).getSampleIDs())); + Feature f = features.get(0); + if ( f instanceof RawHapMapFeature ) + samples.addAll(Arrays.asList(((RawHapMapFeature)f).getSampleIDs())); else samples.addAll(vc.getSampleNames()); } @@ -233,7 +255,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vcfwriter.add(vc, ref); + vcfwriter.add(vc); } public Integer reduceInit() { diff --git 
a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java new file mode 100644 index 000000000..3716d3110 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobInfo.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import org.ggf.drmaa.DrmaaException; +import org.ggf.drmaa.JobInfo; + +import java.util.Map; + +/** + * JNA mapping from Java to C DRMAA binding. 
+ */ +public class JnaJobInfo implements JobInfo { + + private final String jobId; + private final Map rusage; + private final boolean hasExited; + private final int exitStatus; + private final boolean hasSignaled; + private final String terminatingSignal; + private final boolean hasCoreDump; + private final boolean wasAborted; + + public JnaJobInfo(String jobId, Map rusage, boolean hasExited, int exitStatus, boolean hasSignaled, String terminatingSignal, boolean hasCoreDump, boolean wasAborted) { + this.jobId = jobId; + this.rusage = rusage; + this.hasExited = hasExited; + this.exitStatus = exitStatus; + this.hasSignaled = hasSignaled; + this.terminatingSignal = terminatingSignal; + this.hasCoreDump = hasCoreDump; + this.wasAborted = wasAborted; + } + + @Override + public String getJobId() throws DrmaaException { + return this.jobId; + } + + @Override + public Map getResourceUsage() throws DrmaaException { + return rusage; + } + + @Override + public boolean hasExited() throws DrmaaException { + return hasExited; + } + + @Override + public int getExitStatus() throws DrmaaException { + if (!hasExited) + throw new IllegalStateException("job has not exited"); + return exitStatus; + } + + @Override + public boolean hasSignaled() throws DrmaaException { + return hasSignaled; + } + + @Override + public String getTerminatingSignal() throws DrmaaException { + if (!hasSignaled) + throw new IllegalStateException("job has not signaled"); + return terminatingSignal; + } + + @Override + public boolean hasCoreDump() throws DrmaaException { + return hasCoreDump; + } + + @Override + public boolean wasAborted() throws DrmaaException { + return wasAborted; + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java new file mode 100644 index 000000000..58cd19926 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaJobTemplate.java @@ -0,0 +1,315 
@@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.Pointer; +import org.ggf.drmaa.*; + +import java.util.*; + +/** + * JNA mapping from Java to C DRMAA binding. 
+ */ +public class JnaJobTemplate implements JobTemplate { + private final JnaSession session; + private final Pointer jt; + + public JnaJobTemplate(JnaSession session, Pointer jt) { + this.session = session; + this.jt = jt; + } + + public Pointer getPointer() { + return jt; + } + + @Override + public void setRemoteCommand(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND, s); + } + + @Override + public String getRemoteCommand() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND); + } + + @SuppressWarnings("unchecked") + @Override + public void setArgs(List list) throws DrmaaException { + JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_ARGV, list); + } + + @Override + public List getArgs() throws DrmaaException { + return JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_ARGV); + } + + @Override + public void setJobSubmissionState(int state) throws DrmaaException { + String stateString; + if (state == JobTemplate.HOLD_STATE) + stateString = LibDrmaa.DRMAA_SUBMISSION_STATE_HOLD; + else if (state == JobTemplate.ACTIVE_STATE) + stateString = LibDrmaa.DRMAA_SUBMISSION_STATE_ACTIVE; + else + throw new InvalidAttributeValueException("jobSubmissionState attribute is invalid"); + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JS_STATE, stateString); + } + + @Override + public int getJobSubmissionState() throws DrmaaException { + int state; + String stateString = JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JS_STATE); + if (LibDrmaa.DRMAA_SUBMISSION_STATE_HOLD.equals(stateString)) + state = JobTemplate.HOLD_STATE; + else if (LibDrmaa.DRMAA_SUBMISSION_STATE_ACTIVE.equals(stateString)) + state = JobTemplate.ACTIVE_STATE; + else + throw new InvalidAttributeValueException("jobSubmissionState attribute is invalid"); + return state; + } + + @SuppressWarnings("unchecked") + @Override + public void setJobEnvironment(Map env) throws DrmaaException { + JnaSession.setVectorAttribute(jt, 
LibDrmaa.DRMAA_V_ENV, JnaSession.mapToCollection(env)); + } + + @SuppressWarnings("unchecked") + @Override + public Map getJobEnvironment() throws DrmaaException { + return JnaSession.collectionToMap(JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_ENV)); + } + + @Override + public void setWorkingDirectory(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WD, s); + } + + @Override + public String getWorkingDirectory() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WD); + } + + @Override + public void setJobCategory(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOB_CATEGORY, s); + } + + @Override + public String getJobCategory() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOB_CATEGORY); + } + + @Override + public void setNativeSpecification(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_NATIVE_SPECIFICATION, s); + } + + @Override + public String getNativeSpecification() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_NATIVE_SPECIFICATION); + } + + @SuppressWarnings("unchecked") + @Override + public void setEmail(Set set) throws DrmaaException { + JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_EMAIL, set); + } + + @SuppressWarnings("unchecked") + @Override + public Set getEmail() throws DrmaaException { + return new LinkedHashSet(JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_EMAIL)); + } + + @Override + public void setBlockEmail(boolean b) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_BLOCK_EMAIL, b ? 
"1" : "0"); + } + + @Override + public boolean getBlockEmail() throws DrmaaException { + return "1".equals(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_BLOCK_EMAIL)); + } + + @Override + public void setStartTime(PartialTimestamp partialTimestamp) throws DrmaaException { + JnaSession.setPartialTime(jt, LibDrmaa.DRMAA_START_TIME, partialTimestamp); + } + + @Override + public PartialTimestamp getStartTime() throws DrmaaException { + return JnaSession.getPartialTime(jt, LibDrmaa.DRMAA_START_TIME); + } + + @Override + public void setJobName(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOB_NAME, s); + } + + @Override + public String getJobName() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOB_NAME); + } + + @Override + public void setInputPath(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_INPUT_PATH, s); + } + + @Override + public String getInputPath() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_INPUT_PATH); + } + + @Override + public void setOutputPath(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH, s); + } + + @Override + public String getOutputPath() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH); + } + + @Override + public void setErrorPath(String s) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_ERROR_PATH, s); + } + + @Override + public String getErrorPath() throws DrmaaException { + return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_ERROR_PATH); + } + + @Override + public void setJoinFiles(boolean b) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOIN_FILES, b ? 
"y" : "n"); + } + + @Override + public boolean getJoinFiles() throws DrmaaException { + return "y".equals(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOIN_FILES)); + } + + @Override + public void setTransferFiles(FileTransferMode fileTransferMode) throws DrmaaException { + StringBuilder buf = new StringBuilder(); + + if (fileTransferMode.getInputStream()) + buf.append('i'); + + if (fileTransferMode.getOutputStream()) + buf.append('o'); + + if (fileTransferMode.getErrorStream()) + buf.append('e'); + + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_TRANSFER_FILES, buf.toString()); + } + + @Override + public FileTransferMode getTransferFiles() throws DrmaaException { + String mode = JnaSession.getAttribute(jt, LibDrmaa.DRMAA_TRANSFER_FILES); + + if (mode == null) + return null; + + FileTransferMode fileTransferMode = new FileTransferMode(); + fileTransferMode.setInputStream(mode.indexOf('i') >= 0); + fileTransferMode.setOutputStream(mode.indexOf('o') >= 0); + fileTransferMode.setErrorStream(mode.indexOf('e') >= 0); + return fileTransferMode; + } + + @Override + public void setDeadlineTime(PartialTimestamp partialTimestamp) throws DrmaaException { + JnaSession.setPartialTime(jt, LibDrmaa.DRMAA_DEADLINE_TIME, partialTimestamp); + } + + @Override + public PartialTimestamp getDeadlineTime() throws DrmaaException { + return JnaSession.getPartialTime(jt, LibDrmaa.DRMAA_DEADLINE_TIME); + } + + @Override + public void setHardWallclockTimeLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WCT_HLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getHardWallclockTimeLimit() throws DrmaaException { + return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WCT_HLIMIT)); + } + + @Override + public void setSoftWallclockTimeLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WCT_SLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getSoftWallclockTimeLimit() throws DrmaaException { 
+ return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WCT_SLIMIT)); + } + + @Override + public void setHardRunDurationLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_DURATION_HLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getHardRunDurationLimit() throws DrmaaException { + return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_DURATION_HLIMIT)); + } + + @Override + public void setSoftRunDurationLimit(long l) throws DrmaaException { + JnaSession.setAttribute(jt, LibDrmaa.DRMAA_DURATION_SLIMIT, JnaSession.formatLimit(l)); + } + + @Override + public long getSoftRunDurationLimit() throws DrmaaException { + return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_DURATION_SLIMIT)); + } + + @Override + public Set getAttributeNames() throws DrmaaException { + return JnaSession.getAttrNames(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof JnaJobTemplate)) + return false; + JnaJobTemplate other = (JnaJobTemplate) obj; + return this.jt.equals(other.jt) && this.session.equals(other.session); + } + + @Override + public int hashCode() { + return jt.hashCode(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java new file mode 100644 index 000000000..480113e1e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to 
do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.Memory; +import com.sun.jna.NativeLong; +import com.sun.jna.Pointer; +import com.sun.jna.StringArray; +import com.sun.jna.ptr.IntByReference; +import com.sun.jna.ptr.PointerByReference; +import org.ggf.drmaa.*; + +import java.text.ParseException; +import java.util.*; + +/** + * JNA mapping from Java to C DRMAA binding. 
+ * See: Java and C Binding Documents on http://drmaa.org + */ +public class JnaSession implements Session { + private static final PartialTimestampFormat PARTIAL_TIMESTAMP_FORMAT = new PartialTimestampFormat(); + private static final ThreadLocal threadError = new ThreadLocal() { + @Override + protected Memory initialValue() { + return new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); + } + }; + + @Override + public void init(String contact) throws DrmaaException { + checkError(LibDrmaa.drmaa_init(contact, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public void exit() throws DrmaaException { + checkError(LibDrmaa.drmaa_exit(getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public JobTemplate createJobTemplate() throws DrmaaException { + PointerByReference jtRef = new PointerByReference(); + checkError(LibDrmaa.drmaa_allocate_job_template(jtRef, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return new JnaJobTemplate(this, jtRef.getValue()); + } + + @Override + public void deleteJobTemplate(JobTemplate jobTemplate) throws DrmaaException { + JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate; + checkError(LibDrmaa.drmaa_delete_job_template(jnaJobTemplate.getPointer(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public String runJob(JobTemplate jobTemplate) throws DrmaaException { + Memory jobId = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate; + checkError(LibDrmaa.drmaa_run_job(jobId, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jnaJobTemplate.getPointer(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return jobId.getString(0); + } + + @Override + public List runBulkJobs(JobTemplate jobTemplate, int start, int end, int incr) throws DrmaaException { + PointerByReference jobIds = new PointerByReference(); + JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate; + 
checkError(LibDrmaa.drmaa_run_bulk_jobs(jobIds, jnaJobTemplate.getPointer(), start, end, incr, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + try { + return getJobIds(jobIds); + } finally { + releaseJobIds(jobIds); + } + } + + @Override + public void control(String jobId, int action) throws DrmaaException { + checkError(LibDrmaa.drmaa_control(jobId, action, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @SuppressWarnings("unchecked") + @Override + public void synchronize(List list, long timeout, boolean dispose) throws DrmaaException { + StringArray jobIds = new StringArray((String[]) list.toArray(new String[list.size()])); + checkError(LibDrmaa.drmaa_synchronize(jobIds, new NativeLong(timeout), dispose ? 1 : 0, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + @Override + public JobInfo wait(String jobId, long timeout) throws DrmaaException { + Memory jobIdOut = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + IntByReference stat = new IntByReference(); + PointerByReference rusage = new PointerByReference(); + IntByReference exited = new IntByReference(); + IntByReference exitStatus = new IntByReference(); + IntByReference signaled = new IntByReference(); + Memory signal = new Memory(LibDrmaa.DRMAA_SIGNAL_BUFFER); + IntByReference coreDumped = new IntByReference(); + IntByReference aborted = new IntByReference(); + + int errnum; + + errnum = LibDrmaa.drmaa_wait(jobId, jobIdOut, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, stat, new NativeLong(timeout), rusage, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + Map rusageMap; + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_RUSAGE) { + rusageMap = null; + } else { + try { + rusageMap = collectionToMap(getAttrValues(rusage)); + } finally { + releaseAttrValues(rusage); + } + } + + checkError(LibDrmaa.drmaa_wifexited(exited, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + + if (exited.getValue() != 0) { + checkError(LibDrmaa.drmaa_wexitstatus(exitStatus, 
stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + checkError(LibDrmaa.drmaa_wifsignaled(signaled, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + + if (signaled.getValue() != 0) { + checkError(LibDrmaa.drmaa_wtermsig(signal, LibDrmaa.DRMAA_SIGNAL_BUFFER_LEN, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + checkError(LibDrmaa.drmaa_wcoredump(coreDumped, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + checkError(LibDrmaa.drmaa_wifaborted(aborted, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + + return new JnaJobInfo(jobIdOut.getString(0), rusageMap, exited.getValue() != 0, exitStatus.getValue(), + signaled.getValue() != 0, signal.getString(0), coreDumped.getValue() != 0, aborted.getValue() != 0); + } + + @Override + public int getJobProgramStatus(String jobId) throws DrmaaException { + IntByReference remotePs = new IntByReference(); + checkError(LibDrmaa.drmaa_job_ps(jobId, remotePs, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return remotePs.getValue(); + } + + @Override + public String getContact() { + Memory contact = new Memory(LibDrmaa.DRMAA_CONTACT_BUFFER); + try { + checkError(LibDrmaa.drmaa_get_contact(contact, LibDrmaa.DRMAA_CONTACT_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? + throw new RuntimeException(e); + } + return contact.getString(0); + } + + @Override + public Version getVersion() { + IntByReference major = new IntByReference(); + IntByReference minor = new IntByReference(); + try { + checkError(LibDrmaa.drmaa_version(major, minor, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? 
+ throw new RuntimeException(e); + } + return new Version(major.getValue(), minor.getValue()); + } + + @Override + public String getDrmSystem() { + Memory drmSystem = new Memory(LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER); + try { + checkError(LibDrmaa.drmaa_get_DRM_system(drmSystem, LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? + throw new RuntimeException(e); + } + return drmSystem.getString(0); + } + + @Override + public String getDrmaaImplementation() { + Memory drmaaImplementation = new Memory(LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER); + try { + checkError(LibDrmaa.drmaa_get_DRMAA_implementation(drmaaImplementation, LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } catch (DrmaaException e) { + // DRMAA spec says this method should throw DrmaaException. + // Why doesn't interface implement this? 
+ throw new RuntimeException(e); + } + return drmaaImplementation.getString(0); + } + + public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException { + checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + public static String getAttribute(Pointer jt, String name) throws DrmaaException { + Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return attrBuffer.getString(0); + } + + public static void setVectorAttribute(Pointer jt, String name, Collection values) throws DrmaaException { + StringArray valuesArray = new StringArray(values.toArray(new String[values.size()])); + checkError(LibDrmaa.drmaa_set_vector_attribute(jt, name, valuesArray, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + + public static List getVectorAttribute(Pointer jt, String name) throws DrmaaException { + PointerByReference values = new PointerByReference(); + checkError(LibDrmaa.drmaa_get_vector_attribute(jt, name, values, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + try { + return getAttrValues(values); + } finally { + releaseAttrValues(values); + } + } + + public static void setPartialTime(Pointer jt, String name, PartialTimestamp partialTimestamp) throws DrmaaException { + setAttribute(jt, name, PARTIAL_TIMESTAMP_FORMAT.format(partialTimestamp)); + } + + public static PartialTimestamp getPartialTime(Pointer jt, String name) throws DrmaaException { + String time = getAttribute(jt, name); + if (time == null) + return null; + try { + return PARTIAL_TIMESTAMP_FORMAT.parse(time); + } catch (ParseException e) { + throw new InternalException(name + " property is unparsable"); + } + } + + public static Set getAttrNames() throws DrmaaException { + PointerByReference values = new PointerByReference(); + 
checkError(LibDrmaa.drmaa_get_attribute_names(values, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + try { + return new LinkedHashSet(getAttrNames(values)); + } finally { + releaseAttrNames(values); + } + } + + public static Collection mapToCollection(Map map) { + Collection collection = new LinkedHashSet(); + for (Map.Entry entry: map.entrySet()) + collection.add(entry.getKey() + "=" + entry.getValue()); + return collection; + } + + public static Map collectionToMap(Collection list) { + Map map = new LinkedHashMap(); + for (String entry: list) { + if (entry == null) + continue; + int equals = entry.indexOf('='); + if (equals < 0) + continue; + map.put(entry.substring(0, equals), entry.substring(equals + 1)); + } + return map; + } + + public static String formatLimit(long secs) { + long seconds = (secs % 60); + long minutes = (secs / 60) % 60; + long hours = (secs / 3600); + return String.format("%d:%02d:%02d", hours, minutes, seconds); + } + + public static long parseLimit(String limit) { + long seconds = 0; + if (limit != null) { + for (String token: limit.split(":")) { + seconds *= 60; + seconds += Long.parseLong(token); + } + } + return seconds; + } + + private static List getAttrNames(PointerByReference names) throws DrmaaException { + List namesList = new ArrayList(); + IntByReference size = new IntByReference(); + int errnum; + + errnum = LibDrmaa.drmaa_get_num_attr_names(names.getValue(), size); + checkError(errnum, "unable to get attribute names"); + int num = size.getValue(); + + Memory value = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + for (int i = 1; i <= num; i++) { + errnum = LibDrmaa.drmaa_get_next_attr_name(names.getValue(), value, LibDrmaa.DRMAA_ATTR_BUFFER_LEN); + checkError(errnum, "unable to get attribute name " + i); + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS) + break; + namesList.add(value.getString(0)); + } + + return namesList; + } + + private static List getAttrValues(PointerByReference values) throws 
DrmaaException { + List valuesList = new ArrayList(); + IntByReference size = new IntByReference(); + int errnum; + + errnum = LibDrmaa.drmaa_get_num_attr_values(values.getValue(), size); + checkError(errnum, "unable to get attribute values"); + int num = size.getValue(); + + Memory value = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + for (int i = 1; i <= num; i++) { + errnum = LibDrmaa.drmaa_get_next_attr_value(values.getValue(), value, LibDrmaa.DRMAA_ATTR_BUFFER_LEN); + checkError(errnum, "unable to get attribute value " + i); + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS) + break; + valuesList.add(value.getString(0)); + } + + return valuesList; + } + + private static List getJobIds(PointerByReference jobIds) throws DrmaaException { + List jobIdsList = new ArrayList(); + IntByReference size = new IntByReference(); + int errnum; + + errnum = LibDrmaa.drmaa_get_num_job_ids(jobIds.getValue(), size); + checkError(errnum, "unable to get jobIds"); + int num = size.getValue(); + + Memory value = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + for (int i = 1; i <= num; i++) { + errnum = LibDrmaa.drmaa_get_next_job_id(jobIds.getValue(), value, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN); + checkError(errnum, "unable to get jobId " + i); + if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS) + break; + jobIdsList.add(value.getString(0)); + } + + return jobIdsList; + } + + private static void releaseAttrNames(PointerByReference names) throws DrmaaException { + LibDrmaa.drmaa_release_attr_names(names.getValue()); + } + + private static void releaseAttrValues(PointerByReference values) throws DrmaaException { + LibDrmaa.drmaa_release_attr_values(values.getValue()); + } + + private static void releaseJobIds(PointerByReference jobIds) throws DrmaaException { + LibDrmaa.drmaa_release_job_ids(jobIds.getValue()); + } + + private static Memory getError() { + return threadError.get(); + } + + private static void checkError(int errnum) throws DrmaaException { + if 
(errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + checkError(errnum, getError().getString(0)); + } + + private static void checkError(int errnum, String error) throws DrmaaException { + switch (errnum) { + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS: + break; + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INTERNAL_ERROR: + throw new InternalException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE: + throw new DrmCommunicationException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_AUTH_FAILURE: + throw new AuthorizationException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ARGUMENT: + throw new IllegalArgumentException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_ACTIVE_SESSION: + throw new NoActiveSessionException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MEMORY: + throw new OutOfMemoryError(error); + + /* -------------- init and exit specific --------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_CONTACT_STRING: + throw new InvalidContactStringException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR: + throw new DefaultContactStringException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED: + throw new NoDefaultContactStringException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRMS_INIT_FAILED: + throw new DrmsInitException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_ALREADY_ACTIVE_SESSION: + throw new AlreadyActiveSessionException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRMS_EXIT_ERROR: + throw new DrmsExitException(error); + + /* ---------------- job attributes specific -------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT: + throw new InvalidAttributeFormatException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE: + throw new InvalidAttributeValueException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES: + throw new 
ConflictingAttributeValuesException(error); + + /* --------------------- job submission specific -------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_TRY_LATER: + throw new TryLaterException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DENIED_BY_DRM: + throw new DeniedByDrmException(error); + + /* ------------------------------- job control specific ---------------- */ + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_JOB: + throw new InvalidJobException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_RESUME_INCONSISTENT_STATE: + throw new ResumeInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE: + throw new SuspendInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_HOLD_INCONSISTENT_STATE: + throw new HoldInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE: + throw new ReleaseInconsistentStateException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_EXIT_TIMEOUT: + throw new ExitTimeoutException(error); + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_RUSAGE: + break; + case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS: + break; + default: + throw new IllegalArgumentException(String.format("Unknown error code %d: %s", errnum, error)); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java new file mode 100644 index 000000000..a1460b7f4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionFactory.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import org.ggf.drmaa.Session; +import org.ggf.drmaa.SessionFactory; + +/** + * JNA mapping from Java to C DRMAA binding. + */ +@SuppressWarnings("unused") +public class JnaSessionFactory extends SessionFactory { + @Override + public Session getSession() { + return new JnaSession(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java new file mode 100644 index 000000000..1244d3023 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaa.java @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and 
this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/*___INFO__MARK_BEGIN__*/ +/************************************************************************* + * + * The Contents of this file are made available subject to the terms of + * the Sun Industry Standards Source License Version 1.2 + * + * Sun Microsystems Inc., March, 2001 + * + * + * Sun Industry Standards Source License Version 1.2 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.2 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2001 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + ************************************************************************/ +/*___INFO__MARK_END__*/ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.*; +import com.sun.jna.ptr.IntByReference; +import com.sun.jna.ptr.PointerByReference; + +@SuppressWarnings("unused") +public class LibDrmaa { + static { + Native.register("drmaa"); + } + +/* see www.drmaa.org for more details on the DRMAA specification */ +/****** DRMAA/-DRMAA_Interface ************************************************* +* NAME +* DRMAA_Interface -- DRMAA interface +* +* FUNCTION +* The enlisted functions specify the C/C++ binding of the DRMAA interface +* specification. +* +* SEE ALSO +* DRMAA/drmaa_get_next_attr_name() +* DRMAA/drmaa_get_next_attr_value() +* DRMAA/drmaa_get_next_job_id() +* DRMAA/drmaa_release_attr_names() +* DRMAA/drmaa_release_attr_values() +* DRMAA/drmaa_release_job_ids() +* DRMAA/drmaa_init() +* DRMAA/drmaa_exit() +* DRMAA/drmaa_allocate_job_template() +* DRMAA/drmaa_delete_job_template() +* DRMAA/drmaa_set_attribute() +* DRMAA/drmaa_get_attribute() +* DRMAA/drmaa_set_vector_attribute() +* DRMAA/drmaa_get_vector_attribute() +* DRMAA/drmaa_get_attribute_names() +* DRMAA/drmaa_get_vector_attribute_names() +* DRMAA/drmaa_run_job() +* DRMAA/drmaa_run_bulk_jobs() +* DRMAA/drmaa_control() +* DRMAA/drmaa_synchronize() +* DRMAA/drmaa_wait() +* DRMAA/drmaa_wifexited() +* DRMAA/drmaa_wexitstatus() +* DRMAA/drmaa_wifsignaled() +* DRMAA/drmaa_wtermsig() +* DRMAA/drmaa_wcoredump() +* DRMAA/drmaa_wifaborted() +* DRMAA/drmaa_job_ps() +* DRMAA/drmaa_strerror() +* DRMAA/drmaa_get_contact() +* DRMAA/drmaa_version() +* DRMAA/drmaa_get_DRM_system() +*******************************************************************************/ + +/* ------------------- Constants ------------------- */ +/* + * some not yet agreed buffer length constants + * these are recommended minimum values + */ + +/* drmaa_get_attribute() */ +public static final long DRMAA_ATTR_BUFFER = 1024; 
+public static final NativeLong DRMAA_ATTR_BUFFER_LEN = new NativeLong(DRMAA_ATTR_BUFFER - 1); + +/* drmaa_get_contact() */ +public static final long DRMAA_CONTACT_BUFFER = 1024; +public static final NativeLong DRMAA_CONTACT_BUFFER_LEN = new NativeLong(DRMAA_CONTACT_BUFFER - 1); + +/* drmaa_get_DRM_system() */ +public static final long DRMAA_DRM_SYSTEM_BUFFER = 1024; +public static final NativeLong DRMAA_DRM_SYSTEM_BUFFER_LEN = new NativeLong(DRMAA_DRM_SYSTEM_BUFFER - 1); + +/* drmaa_get_DRM_system() */ +public static final long DRMAA_DRMAA_IMPLEMENTATION_BUFFER = 1024; +public static final NativeLong DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN = new NativeLong(DRMAA_DRMAA_IMPLEMENTATION_BUFFER - 1); + +/* + * Agreed buffer length constants + * these are recommended minimum values + */ +public static final long DRMAA_ERROR_STRING_BUFFER = 1024; +public static final long DRMAA_JOBNAME_BUFFER = 1024; +public static final long DRMAA_SIGNAL_BUFFER = 32; + +public static final NativeLong DRMAA_ERROR_STRING_BUFFER_LEN = new NativeLong(DRMAA_ERROR_STRING_BUFFER - 1); +public static final NativeLong DRMAA_JOBNAME_BUFFER_LEN = new NativeLong(DRMAA_JOBNAME_BUFFER - 1); +public static final NativeLong DRMAA_SIGNAL_BUFFER_LEN = new NativeLong(DRMAA_SIGNAL_BUFFER - 1); + +/* + * Agreed constants + */ +public static final NativeLong DRMAA_TIMEOUT_WAIT_FOREVER = new NativeLong(-1); +public static final NativeLong DRMAA_TIMEOUT_NO_WAIT = new NativeLong(0); + +public static final String DRMAA_JOB_IDS_SESSION_ANY = "DRMAA_JOB_IDS_SESSION_ANY"; +public static final String DRMAA_JOB_IDS_SESSION_ALL = "DRMAA_JOB_IDS_SESSION_ALL"; + +public static final String DRMAA_SUBMISSION_STATE_ACTIVE = "drmaa_active"; +public static final String DRMAA_SUBMISSION_STATE_HOLD = "drmaa_hold"; + +/* + * Agreed placeholder names + */ +public static final String DRMAA_PLACEHOLDER_INCR = "$drmaa_incr_ph$"; +public static final String DRMAA_PLACEHOLDER_HD = "$drmaa_hd_ph$"; +public static final String 
DRMAA_PLACEHOLDER_WD = "$drmaa_wd_ph$"; + +/* + * Agreed names of job template attributes + */ +public static final String DRMAA_REMOTE_COMMAND = "drmaa_remote_command"; +public static final String DRMAA_JS_STATE = "drmaa_js_state"; +public static final String DRMAA_WD = "drmaa_wd"; +public static final String DRMAA_JOB_CATEGORY = "drmaa_job_category"; +public static final String DRMAA_NATIVE_SPECIFICATION = "drmaa_native_specification"; +public static final String DRMAA_BLOCK_EMAIL = "drmaa_block_email"; +public static final String DRMAA_START_TIME = "drmaa_start_time"; +public static final String DRMAA_JOB_NAME = "drmaa_job_name"; +public static final String DRMAA_INPUT_PATH = "drmaa_input_path"; +public static final String DRMAA_OUTPUT_PATH = "drmaa_output_path"; +public static final String DRMAA_ERROR_PATH = "drmaa_error_path"; +public static final String DRMAA_JOIN_FILES = "drmaa_join_files"; +public static final String DRMAA_TRANSFER_FILES = "drmaa_transfer_files"; +public static final String DRMAA_DEADLINE_TIME = "drmaa_deadline_time"; +public static final String DRMAA_WCT_HLIMIT = "drmaa_wct_hlimit"; +public static final String DRMAA_WCT_SLIMIT = "drmaa_wct_slimit"; +public static final String DRMAA_DURATION_HLIMIT = "drmaa_duration_hlimit"; +public static final String DRMAA_DURATION_SLIMIT = "drmaa_duration_slimit"; + +/* names of job template vector attributes */ +public static final String DRMAA_V_ARGV = "drmaa_v_argv"; +public static final String DRMAA_V_ENV = "drmaa_v_env"; +public static final String DRMAA_V_EMAIL = "drmaa_v_email"; + +/* + * DRMAA errno values + * + * do not touch these values are agreed !!! + */ +public static interface DRMAA_ERRNO { + /* -------------- these are relevant to all sections ---------------- */ + public static final int DRMAA_ERRNO_SUCCESS = 0; /* Routine returned normally with success. 
*/ + public static final int DRMAA_ERRNO_INTERNAL_ERROR = 1; /* Unexpected or internal DRMAA error like memory allocation, system call failure, etc. */ + public static final int DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE = 2; /* Could not contact DRM system for this request. */ + public static final int DRMAA_ERRNO_AUTH_FAILURE = 3; /* The specified request is not processed successfully due to authorization failure. */ + public static final int DRMAA_ERRNO_INVALID_ARGUMENT = 4; /* The input value for an argument is invalid. */ + public static final int DRMAA_ERRNO_NO_ACTIVE_SESSION = 5; /* Exit routine failed because there is no active session */ + public static final int DRMAA_ERRNO_NO_MEMORY = 6; /* failed allocating memory */ + + /* -------------- init and exit specific --------------- */ + public static final int DRMAA_ERRNO_INVALID_CONTACT_STRING = 7; /* Initialization failed due to invalid contact string. */ + public static final int DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR = 8; /* DRMAA could not use the default contact string to connect to DRM system. */ + public static final int DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED = 9; /* No default contact string was provided or selected. DRMAA requires that the default contact string is selected when there is more than one default contact string due to multiple DRMAA implementation contained in the binary module. */ + public static final int DRMAA_ERRNO_DRMS_INIT_FAILED = 10; /* Initialization failed due to failure to init DRM system. */ + public static final int DRMAA_ERRNO_ALREADY_ACTIVE_SESSION = 11; /* Initialization failed due to existing DRMAA session. */ + public static final int DRMAA_ERRNO_DRMS_EXIT_ERROR = 12; /* DRM system disengagement failed. */ + + /* ---------------- job attributes specific -------------- */ + public static final int DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT = 13; /* The format for the job attribute value is invalid. 
*/ + public static final int DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE = 14; /* The value for the job attribute is invalid. */ + public static final int DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES = 15; /* The value of this attribute is conflicting with a previously set attributes. */ + + /* --------------------- job submission specific -------------- */ + public static final int DRMAA_ERRNO_TRY_LATER = 16; /* Could not pass job now to DRM system. A retry may succeed however (saturation). */ + public static final int DRMAA_ERRNO_DENIED_BY_DRM = 17; /* The DRM system rejected the job. The job will never be accepted due to DRM configuration or job template settings. */ + + /* ------------------------------- job control specific ---------------- */ + public static final int DRMAA_ERRNO_INVALID_JOB = 18; /* The job specified by the 'jobid' does not exist. */ + public static final int DRMAA_ERRNO_RESUME_INCONSISTENT_STATE = 19; /* The job has not been suspended. The RESUME request will not be processed. */ + public static final int DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE = 20; /* The job has not been running, and it cannot be suspended. */ + public static final int DRMAA_ERRNO_HOLD_INCONSISTENT_STATE = 21; /* The job cannot be moved to a HOLD state. */ + public static final int DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE = 22; /* The job is not in a HOLD state. */ + public static final int DRMAA_ERRNO_EXIT_TIMEOUT = 23; /* We have encountered a time-out condition for drmaa_synchronize or drmaa_wait. */ + public static final int DRMAA_ERRNO_NO_RUSAGE = 24; /* This error code is returned by drmaa_wait() when a job has finished but no rusage and stat data could be provided. */ + public static final int DRMAA_ERRNO_NO_MORE_ELEMENTS = 25; /* There are no more elements in the opaque string vector. 
*/ + + public static final int DRMAA_NO_ERRNO = 26; +} + +/* + * Agreed DRMAA job states as returned by drmaa_job_ps() + */ +public static interface DRMAA_PS { + public static final int DRMAA_PS_UNDETERMINED = 0x00; /* process status cannot be determined */ + public static final int DRMAA_PS_QUEUED_ACTIVE = 0x10; /* job is queued and active */ + public static final int DRMAA_PS_SYSTEM_ON_HOLD = 0x11; /* job is queued and in system hold */ + public static final int DRMAA_PS_USER_ON_HOLD = 0x12; /* job is queued and in user hold */ + public static final int DRMAA_PS_USER_SYSTEM_ON_HOLD = 0x13; /* job is queued and in user and system hold */ + public static final int DRMAA_PS_RUNNING = 0x20; /* job is running */ + public static final int DRMAA_PS_SYSTEM_SUSPENDED = 0x21; /* job is system suspended */ + public static final int DRMAA_PS_USER_SUSPENDED = 0x22; /* job is user suspended */ + public static final int DRMAA_PS_USER_SYSTEM_SUSPENDED = 0x23; /* job is user and system suspended */ + public static final int DRMAA_PS_DONE = 0x30; /* job finished normally */ + public static final int DRMAA_PS_FAILED = 0x40; /* job finished, but failed */ +} + +/* + * Agreed DRMAA actions for drmaa_control() + */ +public static interface DRMAA_CONTROL { + public static final int DRMAA_CONTROL_SUSPEND = 0; + public static final int DRMAA_CONTROL_RESUME = 1; + public static final int DRMAA_CONTROL_HOLD = 2; + public static final int DRMAA_CONTROL_RELEASE = 3; + public static final int DRMAA_CONTROL_TERMINATE = 4; +} + +/* ------------------- Data types ------------------- */ +/* + * Agreed opaque DRMAA job template + * struct drmaa_job_template_s is in japiP.h + */ +//typedef struct drmaa_job_template_s drmaa_job_template_t; + +/* ---------- C/C++ language binding specific interfaces -------- */ + +//typedef struct drmaa_attr_names_s drmaa_attr_names_t; +//typedef struct drmaa_attr_values_s drmaa_attr_values_t; +//typedef struct drmaa_job_ids_s drmaa_job_ids_t; + +/* + * get next 
string attribute from iterator + * + * returns DRMAA_ERRNO_SUCCESS or DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE + * if no such exists + */ + +public static native int drmaa_get_next_attr_name(/* drmaa_attr_names_t* */ Pointer values, Pointer value, + NativeLong value_len); +public static native int drmaa_get_next_attr_value(/* drmaa_attr_names_t* */ Pointer values, Pointer value, + NativeLong value_len); +public static native int drmaa_get_next_job_id(/* drmaa_job_ids_t* */ Pointer values, Pointer value, + NativeLong value_len); + +/* + * get element count of opaque string vector + * + * Gives the number of elements in the opaque string vector. Useful for + * copying the contents into an array. + */ +public static native int drmaa_get_num_attr_names(/* drmaa_attr_names_t* */ Pointer values, IntByReference size); +public static native int drmaa_get_num_attr_values(/* drmaa_attr_values_t* */ Pointer values, IntByReference size); +public static native int drmaa_get_num_job_ids(/* drmaa_job_ids_t* */ Pointer values, IntByReference size); + +/* + * release opaque string vector + * + * Opaque string vectors can be used without any constraint + * until the release function has been called. + */ +public static native void drmaa_release_attr_names(/* drmaa_attr_names_t* */ Pointer values); +public static native void drmaa_release_attr_values(/* drmaa_attr_values_t* */ Pointer values); +public static native void drmaa_release_job_ids(/* drmaa_job_ids_t* */ Pointer values); + +/* ------------------- init/exit routines ------------------- */ +/* + * Initialize DRMAA API library and create a new DRMAA Session. 'Contact' + * is an implementation dependent string which MAY be used to specify + * which DRM system to use. This routine MUST be called before any + * other DRMAA calls, except for drmaa_version(). + * If 'contact' is NULL, the default DRM system SHALL be used provided there is + * only one DRMAA implementation in the provided binary module. 
When there is + * more than one DRMAA implementation in the binary module, drmaa_init() SHALL + * return the DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED error. drmaa_init() + * SHOULD be called by only one of the threads. The main thread is RECOMMENDED. + * A call by another thread SHALL return DRMAA_ERRNO_ALREADY_ACTIVE_SESSION. + * When 'contact' is a semi-colon separated list of name=value strings, the + * strings will be parsed and interpreted. The current list of accepted names + * is: + * session -- the id of the session to which to reconnect +#if 0 + * sge_root -- the SGE_ROOT to use + * sge_cell -- the SGE_CELL to use +#endif + * + * drmaa_init() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INVALID_CONTACT_STRING, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_ALREADY_ACTIVE_SESSION, + * DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED, or + * DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR. + */ +public static native int drmaa_init(String contact, Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * Disengage from DRMAA library and allow the DRMAA library to perform + * any necessary internal clean up. + * This routine SHALL end the current DRMAA Session, but SHALL NOT affect any + * jobs (e.g., queued and running jobs SHALL remain queued and running). + * drmaa_exit() SHOULD be called by only one of the threads. Other thread calls + * to drmaa_exit() MAY fail since there is no active session. + * + * drmaa_exit() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRMS_EXIT_ERROR or + * DRMAA_ERRNO_NO_ACTIVE_SESSION. + */ +public static native int drmaa_exit(Pointer error_diagnosis, NativeLong error_diag_len); + +/* ------------------- job template routines ------------------- */ + +/* + * Allocate a new job template.
+ * + * drmaa_allocate_job_template() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_INTERNAL_ERROR or + * DRMAA_ERRNO_NO_MEMORY. + */ +public static native int drmaa_allocate_job_template(/* drmaa_job_template_t** */ PointerByReference jt, Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * Deallocate a job template. This routine has no effect on jobs. + * + * drmaa_delete_job_template() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE or + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_delete_job_template(/* drmaa_job_template_t* */ Pointer jt, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * Adds ('name', 'value') pair to list of attributes in job template 'jt'. + * Only non-vector attributes SHALL be passed. + * + * drmaa_set_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT, + * DRMAA_ERRNO_INVALID_ARGUMENT, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE or + * DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES. + */ +public static native int drmaa_set_attribute(/* drmaa_job_template_t* */ Pointer jt, String name, + String value, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * If 'name' is an existing non-vector attribute name in the job + * template 'jt', then the value of 'name' SHALL be returned; otherwise, + * NULL is returned. + * + * drmaa_get_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE. + */ +public static native int drmaa_get_attribute(/* drmaa_job_template_t* */ Pointer jt, String name, Pointer value, + NativeLong value_len, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* Adds ('name', 'values') pair to list of vector attributes in job template + * 'jt'. Only vector attributes SHALL be passed. 
+ * A 'value' string vector containing n elements must be n+1 elements long, with + * the nth value, i.e. value[n], being set to NULL as a delimiter. + * + * drmaa_set_vector_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT, + * DRMAA_ERRNO_INVALID_ARGUMENT, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES. + */ +public static native int drmaa_set_vector_attribute(/* drmaa_job_template_t* */ Pointer jt, String name, + Pointer value, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * If 'name' is an existing vector attribute name in the job template 'jt', + * then the values of 'name' are returned; otherwise, NULL is returned. + * + * drmaa_get_vector_attribute() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE. + */ +public static native int drmaa_get_vector_attribute(/* drmaa_job_template_t* */ Pointer jt, String name, + /* drmaa_attr_values_t ** */ PointerByReference values, + Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * SHALL return the set of supported attribute names whose associated + * value type is String. This set SHALL include supported DRMAA reserved + * attribute names and native attribute names. + * + * drmaa_get_attribute_names() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_NO_MEMORY. + */ +public static native int drmaa_get_attribute_names(/* drmaa_attr_names_t ** */ PointerByReference values, + Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * SHALL return the set of supported attribute names whose associated + * value type is String Vector. This set SHALL include supported DRMAA reserved + * attribute names and native attribute names. + * + * drmaa_get_vector_attribute_names() SHALL return DRMAA_ERRNO_SUCCESS on + * success, otherwise: + * DRMAA_ERRNO_NO_MEMORY.
+ */ +public static native int drmaa_get_vector_attribute_names(/* drmaa_attr_names_t ** */ PointerByReference values, + Pointer error_diagnosis, + NativeLong error_diag_len); + +/* ------------------- job submission routines ------------------- */ + +/* + * Submit a job with attributes defined in the job template 'jt'. + * The job identifier 'job_id' is a printable, NULL terminated string, + * identical to that returned by the underlying DRM system. + * + * drmaa_run_job() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_TRY_LATER, + * DRMAA_ERRNO_DENIED_BY_DRM, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE or + * DRMAA_ERRNO_AUTH_FAILURE. + */ +public static native int drmaa_run_job(Pointer job_id, NativeLong job_id_len, + /* drmaa_job_template_t * */ Pointer jt, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * Submit a set of parametric jobs, dependent on the implied loop index, each + * with attributes defined in the job template 'jt'. + * The job identifiers 'job_ids' SHALL all be printable, + * NULL terminated strings, identical to those returned by the underlying + * DRM system. Nonnegative loop bounds SHALL NOT use file names + * that start with minus sign like command line options. + * DRMAA defines a special index placeholder, drmaa_incr_ph, (which has the + * value "$incr_pl$") that is used to construct parametric job templates. + * For example: + * //C++ string syntax used + * drmaa_set_attribute(pjt, "stderr", drmaa_incr_ph + ".err" ); + * + * drmaa_run_bulk_jobs() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_TRY_LATER, + * DRMAA_ERRNO_DENIED_BY_DRM, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE or + * DRMAA_ERRNO_AUTH_FAILURE. 
+ */ +public static native int drmaa_run_bulk_jobs(/* drmaa_job_ids_t ** */ PointerByReference jobids, + /* drmaa_job_template_t * */ Pointer jt, int start, int end, + int incr, Pointer error_diagnosis, NativeLong error_diag_len); + +/* ------------------- job control routines ------------------- */ + +/* + * Start, stop, restart, or kill the job identified by 'job_id'. + * If 'job_id' is DRMAA_JOB_IDS_SESSION_ALL then this routine + * acts on all jobs *submitted* during this DRMAA session. + * The legal values for 'action' and their meanings SHALL be: + * DRMAA_CONTROL_SUSPEND: stop the job, + * DRMAA_CONTROL_RESUME: (re)start the job, + * DRMAA_CONTROL_HOLD: put the job on-hold, + * DRMAA_CONTROL_RELEASE: release the hold on the job, and + * DRMAA_CONTROL_TERMINATE: kill the job. + * + * This routine SHALL return once the action has been acknowledged by + * the DRM system, but does not necessarily wait until the action + * has been completed. + * + * drmaa_control() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_AUTH_FAILURE, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_RESUME_INCONSISTENT_STATE, + * DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE, + * DRMAA_ERRNO_HOLD_INCONSISTENT_STATE, + * DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE or + * DRMAA_ERRNO_INVALID_JOB. + */ +public static native int drmaa_control(String jobid, int action, Pointer error_diagnosis, + NativeLong error_diag_len); + + +/* + * Wait until all jobs specified by 'job_ids' have finished + * execution. If 'job_ids' is DRMAA_JOB_IDS_SESSION_ALL then this routine + * waits for all jobs *submitted* during this DRMAA session. The timeout value + * is used to specify the number of seconds to wait for the job to fail or finish + * before returning if a result is not immediately available. The value + * DRMAA_TIMEOUT_WAIT_FOREVER can be used to specify that the routine should wait + * indefinitely for a result.
The value DRMAA_TIMEOUT_NO_WAIT can be used to + * specify that the routine should return immediately if no result is available. + * If the call exits before timeout, all the jobs have + * been waited on or there was an interrupt. + * If the invocation exits on timeout, the return code is + * DRMAA_ERRNO_EXIT_TIMEOUT. The caller SHOULD check system time before and + * after this call in order to check how much time has passed. + * + * The dispose parameter specifies how to treat reaping information: + * True=1 "fake reap", i.e. dispose of the rusage data + * False=0 do not reap + * + * A 'job_ids' string vector containing n elements must be n+1 elements long, + * with the nth value, i.e. job_ids[n], being set to NULL as a delimiter. + * + * drmaa_synchronize() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_AUTH_FAILURE, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_EXIT_TIMEOUT or + * DRMAA_ERRNO_INVALID_JOB. + */ +public static native int drmaa_synchronize(Pointer job_ids, NativeLong timeout, int dispose, + Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * This routine SHALL wait for a job with job_id to fail or finish execution. If + * the special string, DRMAA_JOB_IDS_SESSION_ANY is provided as the job_id, + * this routine SHALL wait for any job from the session. This routine is modeled + * on the wait3 POSIX routine. The timeout value is used to specify the number + * of seconds to wait for the job to fail or finish before returning if a result is + * not immediately available. The value DRMAA_TIMEOUT_WAIT_FOREVER can be + * used to specify that the routine should wait indefinitely for a result. The value + * DRMAA_TIMEOUT_NO_WAIT may be specified so that the routine should return + * immediately if no result is available. + * If the call exits before timeout, the job has been waited on + * successfully or there was an interrupt.
+ * If the invocation exits on timeout, the return code is + * DRMAA_ERRNO_EXIT_TIMEOUT. The caller SHOULD check system time before and + * after this call in order to check how much time has passed. + * The routine reaps jobs on a successful call, so any subsequent calls + * to drmaa_wait SHOULD fail returning an error DRMAA_ERRNO_INVALID_JOB meaning + * that the job has been already reaped. This error is the same as if the job + * was unknown. Failing due to an elapsed timeout has an effect that it is + * possible to issue drmaa_wait multiple times for the same job_id. When + * successful, the rusage information SHALL be provided as an array of strings, + * where each string complies with the format name=value. The value portion + * contains the amount of resources consumed by the job and is opaque. + * The 'stat' drmaa_wait parameter is used in the drmaa_w* functions for + * providing more detailed information about job termination if available. An + * analogous set of macros is defined in POSIX for analyzing the wait3(2) OUT + * parameter 'stat'. + * + * drmaa_wait() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_AUTH_FAILURE, + * DRMAA_ERRNO_NO_RUSAGE, + * DRMAA_ERRNO_NO_MEMORY, + * DRMAA_ERRNO_EXIT_TIMEOUT or + * DRMAA_ERRNO_INVALID_JOB. + */ +public static native int drmaa_wait(String job_id, Pointer job_id_out, NativeLong job_id_out_len, + IntByReference stat, NativeLong timeout, /* drmaa_attr_values_t ** */ PointerByReference rusage, + Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * Evaluates into 'exited' a non-zero value if stat was returned for a + * job that terminated normally. A zero value can also indicate that + * although the job has terminated normally an exit status is not available + * or that it is not known whether the job terminated normally. In both + * cases drmaa_wexitstatus() SHALL NOT provide exit status information.
+ * A non-zero 'exited' value indicates more detailed diagnosis can be provided + * by means of drmaa_wifsignaled(), drmaa_wtermsig() and drmaa_wcoredump(). + */ +public static native int drmaa_wifexited(IntByReference exited, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * If the OUT parameter 'exited' of drmaa_wifexited() is non-zero, + * this function evaluates into 'exit_status' the exit code that the + * job passed to _exit() (see exit(2)) or exit(3C), or the value that + * the child process returned from main. + */ +public static native int drmaa_wexitstatus(IntByReference exit_status, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * Evaluates into 'signaled' a non-zero value if status was returned + * for a job that terminated due to the receipt of a signal. A zero value + * can also indicate that although the job has terminated due to the receipt + * of a signal the signal is not available or that it is not known whether + * the job terminated due to the receipt of a signal. In both cases + * drmaa_wtermsig() SHALL NOT provide signal information. + */ +public static native int drmaa_wifsignaled(IntByReference signaled, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * If the OUT parameter 'signaled' of drmaa_wifsignaled(stat) is + * non-zero, this function evaluates into signal a string representation of the + * signal that caused the termination of the job. For signals declared by POSIX, + * the symbolic names SHALL be returned (e.g., SIGABRT, SIGALRM). + * For signals not declared by POSIX, any other string MAY be returned. + */ +public static native int drmaa_wtermsig(Pointer signal, NativeLong signal_len, int stat, + Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * If the OUT parameter 'signaled' of drmaa_wifsignaled(stat) is + * non-zero, this function evaluates into 'core_dumped' a non-zero value + * if a core image of the terminated job was created.
+ */ +public static native int drmaa_wcoredump(IntByReference core_dumped, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* + * Evaluates into 'aborted' a non-zero value if 'stat' + * was returned for a job that ended before entering the running state. + */ +public static native int drmaa_wifaborted(IntByReference aborted, int stat, Pointer error_diagnosis, + NativeLong error_diag_len); + + + +/* + * Get the program status of the job identified by 'job_id'. + * The possible values returned in 'remote_ps' and their meanings SHALL be: + * + * DRMAA_PS_UNDETERMINED = 0x00: process status cannot be determined + * DRMAA_PS_QUEUED_ACTIVE = 0x10: job is queued and active + * DRMAA_PS_SYSTEM_ON_HOLD = 0x11: job is queued and in system hold + * DRMAA_PS_USER_ON_HOLD = 0x12: job is queued and in user hold + * DRMAA_PS_USER_SYSTEM_ON_HOLD = 0x13: job is queued and in user and system + * hold + * DRMAA_PS_RUNNING = 0x20: job is running + * DRMAA_PS_SYSTEM_SUSPENDED = 0x21: job is system suspended + * DRMAA_PS_USER_SUSPENDED = 0x22: job is user suspended + * DRMAA_PS_USER_SYSTEM_SUSPENDED = 0x23: job is user and system suspended + * DRMAA_PS_DONE = 0x30: job finished normally + * DRMAA_PS_FAILED = 0x40: job finished, but failed + * + * DRMAA SHOULD always get the status of job_id from DRM system, unless the + * previous status has been DRMAA_PS_FAILED or DRMAA_PS_DONE and the status has + * been successfully cached. Terminated jobs get DRMAA_PS_FAILED status. + * + * drmaa_job_ps() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE, + * DRMAA_ERRNO_AUTH_FAILURE, + * DRMAA_ERRNO_NO_MEMORY or + * DRMAA_ERRNO_INVALID_JOB. + */ +public static native int drmaa_job_ps(String job_id, IntByReference remote_ps, Pointer error_diagnosis, + NativeLong error_diag_len); + +/* ------------------- auxiliary routines ------------------- */ + +/* + * SHALL return the error message text associated with the errno number.
The + * routine SHALL return null string if called with invalid ERRNO number. + */ +public static native String drmaa_strerror(int drmaa_errno); + +/* + * If called before drmaa_init(), it SHALL return a comma delimited default + * DRMAA implementation contacts string, one per each DRM system provided + * implementation. If called after drmaa_init(), it SHALL return the selected + * contact string. The output string is Implementation dependent. + * drmaa_get_contact() SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_get_contact(Pointer contact, NativeLong contact_len, + Pointer error_diagnosis, NativeLong error_diag_len); + +/* + * OUT major - major version number (non-negative integer) + * OUT minor - minor version number (non-negative integer) + * SHALL return the major and minor version numbers of the DRMAA library; + * for DRMAA 1.0, 'major' is 1 and 'minor' is 0. + */ +public static native int drmaa_version(IntByReference major, IntByReference minor, + Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * If called before drmaa_init(), it SHALL return a comma delimited DRM systems + * string, one per each DRM system provided implementation. If called after + * drmaa_init(), it SHALL return the selected DRM system. The output string is + * implementation dependent. + * + * drmaa_get_DRM_system() SHALL return DRMAA_ERRNO_SUCCESS on success, + * otherwise: + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_get_DRM_system(Pointer drm_system, NativeLong drm_system_len, + Pointer error_diagnosis, NativeLong error_diag_len); + + +/* + * If called before drmaa_init(), it SHALL return a comma delimited DRMAA + * implementations string, one per each DRM system provided implementation. If + * called after drmaa_init(), it SHALL return the selected DRMAA implementation. + * The output (string) is implementation dependent. 
drmaa_get_DRMAA_implementation + * routine SHALL return DRMAA_ERRNO_SUCCESS on success, otherwise: + * DRMAA_ERRNO_INTERNAL_ERROR. + */ +public static native int drmaa_get_DRMAA_implementation(Pointer drmaa_impl, NativeLong drmaa_impl_len, + Pointer error_diagnosis, NativeLong error_diag_len); +} diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java index 2446383ff..d7b34a253 100644 --- a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java +++ b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBat.java @@ -91,6 +91,54 @@ public class LibBat { Native.register("bat"); } + // Via support@platform.com: + // For equivalent api of bsub -a "xxx aaa qqq", option -a is not in struct submit, we + // have to use setOption_ to set it. setOption_ can be used in user program by including + // cmd.h or opensource.h of LSF opensource. You can refer to cmd.sub.c in opensource. + // + // Here is a demonstration on the api for bsub -a + // ========================================================================= + // /*define external setOption_ function*/ + // extern int setOption_(int argc, char **argv, char *template, + // struct submit *req, int mask, int mask2, char **errMsg); + // + // int setEsub(char *esub, struct submit *req) { + // int x; + // char *template, *arg[3]; + // /*set esub with the following strings and set array length*/ + // arg[0] = "blah"; + // arg[1] = "-a"; + // arg[2] = test; + // /* -a "test", You can add additional esubs in here. Just make sure they're space delimited. ie.
"test mpich lammpi" */ + // x=3; + // /*set template*/ + // template = "a:" + // /*run setOption_()*/ + // if (setOption_(x, arg, template, req, ~0, ~0, ~0, NULL) == -1) { + // return(-1); + // } + // else { + // return(0); + // } + // } + // ========================================================================= + + /** + * Used for setting esub and other options not in struct submit. + * Via support@platform.com + * + * @param argc number of args + * @param argv arguments including a first argument that will not be used + * @param template a colon delimited list of arguments in getopt format + * @param jobSubReq the lsf submit + * @param mask unknown + * @param mask2 unknown + * @param mask3 unknown + * @param errMsg unknown + * @return -1 if the option setting failed + */ + public static native int setOption_(int argc, Pointer argv, String template, submit jobSubReq, int mask, int mask2, int mask3, Pointer errMsg); + /** Max job name length as defined by 'man bsub'. */ public static final int MAX_JOB_NAME_LEN = 4094; @@ -9690,8 +9738,10 @@ public class LibBat { * for a service class. 
*/ - public enum objectives { - GOAL_DEADLINE, GOAL_VELOCITY, GOAL_THROUGHPUT + public static interface objectives { + public static int GOAL_DEADLINE = 0; + public static int GOAL_VELOCITY = 1; + public static int GOAL_THROUGHPUT = 2; } @@ -15109,52 +15159,46 @@ public static class ByValue extends jobArrayElementLog implements Structure.ByVa * \addtogroup _consumertype _consumertype * consumer types */ - public static enum consumerType { + public static interface consumerType { /** * < Queues */ - LIMIT_QUEUES(1), + public static final int LIMIT_QUEUES = 1; /** * < Per-queue */ - LIMIT_PER_QUEUE(2), + public static final int LIMIT_PER_QUEUE = 2; /** * < Users */ - LIMIT_USERS(3), + public static final int LIMIT_USERS = 3; /** * < Per-users */ - LIMIT_PER_USER(4), + public static final int LIMIT_PER_USER = 4; /** * < Hosts */ - LIMIT_HOSTS(5), + public static final int LIMIT_HOSTS = 5; /** * < Per-host */ - LIMIT_PER_HOST(6), + public static final int LIMIT_PER_HOST = 6; /** * < Projects */ - LIMIT_PROJECTS(7), + public static final int LIMIT_PROJECTS = 7; /** * < Per-project */ - LIMIT_PER_PROJECT(8); - - private int value; - - private consumerType(int value) { - this.value = value; - } + public static final int LIMIT_PER_PROJECT = 8; } @@ -19011,20 +19055,27 @@ public static class ByValue extends jobArrayElementLog implements Structure.ByVa /* [] mis-matched in RMS[] */ public static final int RMS_BRACKETS_MISMATCH_ERR = (-22); - public static enum rmsAllocType_t { - RMS_ALLOC_TYPE_UNKNOWN, RMS_ALLOC_TYPE_SLOAD, RMS_ALLOC_TYPE_SNODE, RMS_ALLOC_TYPE_MCONT + public static interface rmsAllocType_t { + public static final int RMS_ALLOC_TYPE_UNKNOWN = 0; + public static final int RMS_ALLOC_TYPE_SLOAD = 1; + public static final int RMS_ALLOC_TYPE_SNODE = 2; + public static final int RMS_ALLOC_TYPE_MCONT = 3; } - public static enum rmsTopology_t { - RMS_TOPOLOGY_UNKNOWN, RMS_TOPOLOGY_PTILE, RMS_TOPOLOGY_NODES + public static interface rmsTopology_t { + public static 
final int RMS_TOPOLOGY_UNKNOWN = 0; + public static final int RMS_TOPOLOGY_PTILE = 1; + public static final int RMS_TOPOLOGY_NODES = 2; } - public static enum rmsFlags_t { - RMS_FLAGS_UNKNOWN, RMS_FLAGS_RAILS, RMS_FLAGS_RAILMASK + public static interface rmsFlags_t { + public static final int RMS_FLAGS_UNKNOWN = 0; + public static final int RMS_FLAGS_RAILS = 1; + public static final int RMS_FLAGS_RAILMASK = 2; } diff --git a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java index c7b3de6cf..30b83abc2 100644 --- a/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java +++ b/public/java/src/org/broadinstitute/sting/jna/lsf/v7_0_6/LibLsf.java @@ -495,14 +495,19 @@ public class LibLsf { - public enum valueType { - LS_BOOLEAN, LS_NUMERIC, LS_STRING, LS_EXTERNAL + public static interface valueType { + public static final int LS_BOOLEAN = 0; + public static final int LS_NUMERIC = 1; + public static final int LS_STRING = 2; + public static final int LS_EXTERNAL = 3; } - public enum orderType { - INCR, DECR, NA + public static interface orderType { + public static final int INCR = 0; + public static final int DECR = 1; + public static final int NA = 2; } @@ -1567,8 +1572,13 @@ public class LibLsf { public static final int NIO_TASK_ALL = 0x03; public static final int NIO_TASK_CONNECTED = 0x04; - public static enum nioType { - NIO_STATUS, NIO_STDOUT, NIO_EOF, NIO_IOERR, NIO_REQUEUE, NIO_STDERR + public static interface nioType { + public static final int NIO_STATUS = 0; + public static final int NIO_STDOUT = 1; + public static final int NIO_EOF = 2; + public static final int NIO_IOERR = 3; + public static final int NIO_REQUEUE = 4; + public static final int NIO_STDERR = 5; } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java index 
c09c4037e..cb5bad4ae 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java @@ -144,6 +144,9 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } else if ("input_file".equals(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) { return Arrays.asList(new InputTaggedFileDefinitionField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, BAMIndex.BAMIndexSuffix, ".bam")); + } else if ((RodBinding.class.equals(argumentDefinition.argumentType) || RodBinding.class.equals(argumentDefinition.componentType)) && argumentDefinition.ioType == ArgumentIOType.INPUT) { + return Arrays.asList(new InputTaggedFileDefinitionField(argumentDefinition), new InputIndexesArgumentField(argumentDefinition, Tribble.STANDARD_INDEX_EXTENSION)); + } else if (argumentDefinition.ioType == ArgumentIOType.INPUT) { return Collections.singletonList(new InputArgumentField(argumentDefinition)); @@ -196,7 +199,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } // if (intervalFields.contains(argumentDefinition.fullName) && argumentDefinition.ioType == ArgumentIOType.INPUT) - // Change intervals exclusize of intervalsString. + // Change intervals exclusive of intervalsString. private static class IntervalFileArgumentField extends InputArgumentField { public IntervalFileArgumentField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); @@ -332,9 +335,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } } - /** - * The other extreme of a NamedRodBindingField, allows the user to specify the track name, track type, and the file. - */ + // Allows the user to specify the track name, track type, and the file. 
public static class RodBindArgumentField extends ArgumentDefinitionField { public RodBindArgumentField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); @@ -347,25 +348,28 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } } - /** - * Named input_files. - */ + // Tagged input_files or other rods. public static class InputTaggedFileDefinitionField extends ArgumentDefinitionField { public InputTaggedFileDefinitionField(ArgumentDefinition argumentDefinition) { super(argumentDefinition); } @Override protected Class getInnerType() { return null; } // TaggedFile does not need to be imported. - @Override protected String getFieldType() { return "List[File]"; } - @Override protected String getDefaultValue() { return "Nil"; } + @Override protected String getFieldType() { return argumentDefinition.isMultiValued ? "List[File]" : "File"; } + @Override protected String getDefaultValue() { return argumentDefinition.isMultiValued ? "Nil" : "_"; } @Override protected String getCommandLineTemplate() { - return " + repeat(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + if (argumentDefinition.isMultiValued) { + return " + repeat(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + } else if (!argumentDefinition.required) { + return " + optional(\"\", %3$s, format=TaggedFile.formatCommandLine(\"%1$s\"))"; + } else { + return " + TaggedFile.formatCommandLine(\"%1$s\")(\"\", %3$s, \"\")"; + } } } - /** - * Adds optional inputs for the indexes of any bams or sams added to this function. - */ + // Adds optional inputs for the indexes of any rods added to this function. 
private static class InputIndexesArgumentField extends ArgumentField { + private final boolean originalIsMultiValued; private final String indexFieldName; private final String originalFieldName; private final String indexSuffix; @@ -374,14 +378,19 @@ public abstract class ArgumentDefinitionField extends ArgumentField { this(originalArgumentDefinition, indexSuffix, null); } public InputIndexesArgumentField(ArgumentDefinition originalArgumentDefinition, String indexSuffix, String originalSuffix) { - this.indexFieldName = originalArgumentDefinition.fullName + "Indexes"; + this.originalIsMultiValued = originalArgumentDefinition.isMultiValued; + this.indexFieldName = originalArgumentDefinition.fullName + "Index" + (originalIsMultiValued ? "es" : ""); this.originalFieldName = originalArgumentDefinition.fullName; this.indexSuffix = indexSuffix; this.originalSuffix = originalSuffix; } @Override protected Class getAnnotationIOClass() { return Input.class; } @Override public String getCommandLineAddition() { return ""; } - @Override protected String getDoc() { return "Dependencies on any indexes of " + this.originalFieldName; } + @Override protected String getDoc() { + return originalIsMultiValued + ? 
"Dependencies on any indexes of " + this.originalFieldName + : "Dependencies on the index of " + this.originalFieldName; + } @Override protected String getFullName() { return this.indexFieldName; } @Override protected boolean isRequired() { return false; } @Override protected String getFieldType() { return "List[File]"; } @@ -389,24 +398,41 @@ public abstract class ArgumentDefinitionField extends ArgumentField { @Override protected Class getInnerType() { return File.class; } @Override protected String getRawFieldName() { return this.indexFieldName; } @Override protected String getFreezeFields() { - if (originalSuffix == null) { - return String.format( - ("%1$s ++= %2$s" + - ".filter(orig => orig != null)" + - ".map(orig => new File(orig.getPath + \"%3$s\"))%n"), - indexFieldName, originalFieldName, indexSuffix); + if (originalIsMultiValued) { + if (originalSuffix == null) { + return String.format( + ("%1$s ++= %2$s" + + ".filter(orig => orig != null)" + + ".map(orig => new File(orig.getPath + \"%3$s\"))%n"), + indexFieldName, originalFieldName, indexSuffix); + } else { + return String.format( + ("%1$s ++= %2$s" + + ".filter(orig => orig != null && orig.getName.endsWith(\"%4$s\"))" + + ".flatMap(orig => Array(" + + " new File(orig.getPath + \"%3$s\")," + + " new File(orig.getPath.stripSuffix(\"%4$s\") + \"%3$s\") ))%n"), + indexFieldName, originalFieldName, indexSuffix, originalSuffix); + } } else { - return String.format( - ("%1$s ++= %2$s" + - ".filter(orig => orig != null && orig.getName.endsWith(\"%4$s\"))" + - ".flatMap(orig => Array(" + - " new File(orig.getPath + \"%3$s\")," + - " new File(orig.getPath.stripSuffix(\"%4$s\") + \"%3$s\") ))%n"), - indexFieldName, originalFieldName, indexSuffix, originalSuffix); + if (originalSuffix == null) { + return String.format( + ("if (%2$s != null)%n " + + "%1$s :+= new File(%2$s.getPath + \"%3$s\")%n"), + indexFieldName, originalFieldName, indexSuffix); + } else { + return String.format( + ("if (%2$s != null && 
%2$s.getName.endsWith(\"%4$s\"))%n " + + "%1$s ++= Array(" + + " new File(%2$s.getPath + \"%3$s\")," + + " new File(%2$s.getPath.stripSuffix(\"%4$s\") + \"%3$s\") )%n"), + indexFieldName, originalFieldName, indexSuffix, originalSuffix); + } } } } + // Tracks an automatically generated index private static abstract class OutputIndexArgumentField extends ArgumentField { protected final String indexFieldName; protected final String originalFieldName; @@ -456,6 +482,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { } } + // Allows setting the format for floats and doubles private static class FormatterArgumentField extends ArgumentField { private final ArgumentField argumentField; public FormatterArgumentField(ArgumentField argumentField) { diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index 5095bd6e5..9578eda84 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -38,10 +38,9 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.io.stubs.SAMFileReaderArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; 
import org.broadinstitute.sting.gatk.walkers.Walker; @@ -118,7 +117,6 @@ public class GATKExtensionsGenerator extends CommandLineProgram { protected Collection getArgumentTypeDescriptors() { List typeDescriptors = new ArrayList(); typeDescriptors.add(new VCFWriterArgumentTypeDescriptor(GATKEngine,System.out,Collections.emptyList())); - typeDescriptors.add(new SAMFileReaderArgumentTypeDescriptor(GATKEngine)); typeDescriptors.add(new SAMFileWriterArgumentTypeDescriptor(GATKEngine,System.out)); typeDescriptors.add(new OutputStreamArgumentTypeDescriptor(GATKEngine,System.out)); return typeDescriptors; @@ -159,7 +157,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { List argumentFields = new ArrayList(); argumentFields.addAll(ArgumentDefinitionField.getArgumentFields(parser,walkerType)); - argumentFields.addAll(RodBindField.getRodArguments(walkerType, trackBuilder)); + //argumentFields.addAll(RodBindField.getRodArguments(walkerType, trackBuilder)); argumentFields.addAll(ReadFilterField.getFilterArguments(parser,walkerType)); String constructor = String.format("analysisName = \"%1$s\"%nanalysis_type = \"%1$s\"%n", walkerName); diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java index ea180d33c..baf083575 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -91,39 +91,39 @@ 
public class RodBindField extends ArgumentField { } return exclusiveOf.toString(); } - - public static List getRodArguments(Class walkerClass, RMDTrackBuilder trackBuilder) { - List argumentFields = new ArrayList(); - - List requires = WalkerManager.getRequiredMetaData(walkerClass); - List allows = WalkerManager.getAllowsMetaData(walkerClass); - - for (RMD required: requires) { - List fields = new ArrayList(); - String trackName = required.name(); - if ("*".equals(trackName)) { - // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers - //fields.add(new RodBindArgumentField(argumentDefinition, true)); - } else { - for (String typeName: trackBuilder.getTrackRecordTypeNames(required.type())) - fields.add(new RodBindField(trackName, typeName, fields, true)); - } - argumentFields.addAll(fields); - } - - for (RMD allowed: allows) { - List fields = new ArrayList(); - String trackName = allowed.name(); - if ("*".equals(trackName)) { - // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers - //fields.add(new RodBindArgumentField(argumentDefinition, false)); - } else { - for (String typeName: trackBuilder.getTrackRecordTypeNames(allowed.type())) - fields.add(new RodBindField(trackName, typeName, fields, true)); - } - argumentFields.addAll(fields); - } - - return argumentFields; - } +// +// public static List getRodArguments(Class walkerClass, RMDTrackBuilder trackBuilder) { +// List argumentFields = new ArrayList(); +// +// List requires = WalkerManager.getRequiredMetaData(walkerClass); +// List allows = WalkerManager.getAllowsMetaData(walkerClass); +// +// for (RMD required: requires) { +// List fields = new ArrayList(); +// String trackName = required.name(); +// if ("*".equals(trackName)) { +// // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers +// //fields.add(new RodBindArgumentField(argumentDefinition, true)); +// } else { +// for (String typeName: 
trackBuilder.getFeatureManager().getTrackRecordTypeNames(required.type())) +// fields.add(new RodBindField(trackName, typeName, fields, true)); +// } +// argumentFields.addAll(fields); +// } +// +// for (RMD allowed: allows) { +// List fields = new ArrayList(); +// String trackName = allowed.name(); +// if ("*".equals(trackName)) { +// // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers +// //fields.add(new RodBindArgumentField(argumentDefinition, false)); +// } else { +// for (String typeName: trackBuilder.getFeatureManager().getTrackRecordTypeNames(allowed.type())) +// fields.add(new RodBindField(trackName, typeName, fields, true)); +// } +// argumentFields.addAll(fields); +// } +// +// return argumentFields; +// } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java rename to public/java/src/org/broadinstitute/sting/utils/AminoAcid.java index 0d0b906e0..0b47093fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java +++ b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; +package org.broadinstitute.sting.utils; /** * Represents a single amino acid. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java rename to public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java index c10eb5dd7..1ae28ffb3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java +++ b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; +package org.broadinstitute.sting.utils; import java.util.HashMap; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 491e4e25e..673b1524d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -419,6 +419,44 @@ public class BaseUtils { return new String(simpleComplement(bases.getBytes())); } + /** + * Returns the index of the most common base in the basecounts array. To be used with + * pileup.getBaseCounts. + * + * @param baseCounts counts of a,c,g,t in order. 
+ * @return the index of the most common base + */ + static public int mostFrequentBaseIndex(int[] baseCounts) { + int mostFrequentBaseIndex = 0; + for (int baseIndex = 1; baseIndex < 4; baseIndex++) { + if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) { + mostFrequentBaseIndex = baseIndex; + } + } + return mostFrequentBaseIndex; + } + + static public int mostFrequentBaseIndexNotRef(int[] baseCounts, int refBaseIndex) { + int tmp = baseCounts[refBaseIndex]; + baseCounts[refBaseIndex] = -1; + int result = mostFrequentBaseIndex(baseCounts); + baseCounts[refBaseIndex] = tmp; + return result; + } + + static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBase) { + return mostFrequentBaseIndexNotRef(baseCounts, simpleBaseToBaseIndex(refSimpleBase)); + } + + /** + * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. + * + * @param baseCounts counts of a,c,g,t in order. + * @return the most common base + */ + static public byte mostFrequentSimpleBase(int[] baseCounts) { + return baseIndexToSimpleBase(mostFrequentBaseIndex(baseCounts)); + } /** * For the most frequent base in the sequence, return the percentage of the read it constitutes. 
/**
 * Contig comparator -- sorting contigs like Picard
 *
 * This is very useful if you want to output your text files or manipulate data in the usual chromosome ordering :
 * 1
 * 2
 * 3
 * ...
 * 21
 * 22
 * X
 * Y
 * GL***
 * ...
 * Just use this comparator in any SortedSet class constructor and your data will be sorted like in the BAM file.
 *
 * Ordering rules, in priority order:
 *   1. numbered contigs come first, in numeric order;
 *   2. then the special contigs (X, Y), in text order;
 *   3. then every other named contig, in text order.
 */
public class ContigComparator implements Comparator<String> {
    /** Named contigs that sort ahead of every other named contig. */
    private final Set<String> specialChrs;

    public ContigComparator() {
        specialChrs = new TreeSet<String>();
        specialChrs.add("X");
        specialChrs.add("Y");
    }

    /**
     * Compares two contig names.
     *
     * @param chr1 first contig name (must not be null)
     * @param chr2 second contig name (must not be null)
     * @return a negative number, zero or a positive number when chr1 sorts before, equal to or after chr2
     */
    public int compare(String chr1, String chr2) {
        if (chr1.equals(chr2))
            return 0;

        Integer x = convertStringWithoutException(chr1);
        Integer y = convertStringWithoutException(chr2);

        // both contigs are numbered
        if (x != null && y != null) {
            if (!x.equals(y))
                return (x < y) ? -1 : 1;
            // Same number written differently (e.g. "1" and "01"). Returning a fixed 1 here
            // made compare(a,b) and compare(b,a) both positive, which breaks the Comparator
            // contract and corrupts TreeSet/TreeMap; break the tie on the text instead.
            return chr1.compareTo(chr2);
        }

        // both contigs are named
        if (x == null && y == null) {
            final boolean special1 = specialChrs.contains(chr1);
            final boolean special2 = specialChrs.contains(chr2);
            // both contigs are special contigs or neither contig is a special contig
            if (special1 == special2)
                return chr1.compareTo(chr2);
            // one contig is special and the other is not: the special one goes first
            return special1 ? -1 : 1;
        }

        // one contig is named the other is numbered: the numbered one goes first
        return (x != null) ? -1 : 1;
    }

    /**
     * Parses a contig name as an integer.
     *
     * NOTE: Integer.decode also accepts hexadecimal ("0x..", "#..") and octal (leading "0")
     * notation, so "010" is read as 8.
     *
     * @param contig the contig name
     * @return the numeric value, or null if the name is not a number
     */
    private Integer convertStringWithoutException(String contig) {
        try {
            return Integer.decode(contig);
        } catch (NumberFormatException n) {
            return null; // not a numbered contig
        }
    }
}
-// * FOR UNIT TESTING PURPOSES ONLY! -// * -// * @param contig the contig name -// * @param start start position of the interval -// * @param stop stop position of the interval -// * @return a new GenomeLoc representing the specified location -// */ -// public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { -// return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop); -// } - /** * create a new genome loc from an existing loc, with a new start position * Note that this function will NOT explicitly check the ending offset, in case someone wants to diff --git a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java index af69ebca6..74f147127 100755 --- a/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/IndelUtils.java @@ -121,9 +121,9 @@ public class IndelUtils { boolean done = false; ArrayList inds = new ArrayList(); - if ( vc.isInsertion() ) { + if ( vc.isSimpleInsertion() ) { indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); - } else if ( vc.isDeletion() ) { + } else if ( vc.isSimpleDeletion() ) { indelAlleleString = vc.getReference().getDisplayString(); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java old mode 100755 new mode 100644 index 36ed506aa..0d85f9606 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -124,6 +124,16 @@ public class MathUtils { } + /** + * Calculates the log10 cumulative sum of an array with log10 probabilities + * @param log10p the array with log10 probabilites + * @param upTo index in the array to calculate the cumsum up to + * @return the log10 of the cumulative sum + */ + public static double log10CumulativeSumLog10(double [] log10p, int 
upTo) { + return log10sumLog10(log10p, 0, upTo); + } + /** * Converts a real space array of probabilities into a log10 array * @param prRealSpace @@ -137,23 +147,27 @@ public class MathUtils { } public static double log10sumLog10(double[] log10p, int start) { + return log10sumLog10(log10p, start, log10p.length); + } + + public static double log10sumLog10(double[] log10p, int start, int finish) { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); - for ( int i = start; i < log10p.length; i++ ) { + for ( int i = start; i < finish; i++ ) { sum += Math.pow(10.0, log10p[i] - maxValue); } return Math.log10(sum) + maxValue; } - public static double sum(List values) { + public static double sumDoubles(List values) { double s = 0.0; for ( double v : values) s += v; return s; } - public static int sum(List values) { + public static int sumIntegers(List values) { int s = 0; for ( int v : values) s += v; return s; @@ -345,6 +359,23 @@ public class MathUtils { return Math.pow(10,log10MultinomialProbability(n, k, log10P)); } + /** + * calculate the Root Mean Square of an array of integers + * @param x an byte[] of numbers + * @return the RMS of the specified numbers. 
+ */ + public static double rms(byte[] x) { + if ( x.length == 0 ) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** * calculate the Root Mean Square of an array of integers * @param x an int[] of numbers @@ -377,6 +408,17 @@ public class MathUtils { return Math.sqrt(rms); } + public static double rms(Collection l) { + if (l.size() == 0) + return 0.0; + + double rms = 0.0; + for (Double i : l) + rms += i*i; + rms /= l.size(); + return Math.sqrt(rms); + } + public static double distanceSquared( final double[] x, final double[] y ) { double dist = 0.0; for(int iii = 0; iii < x.length; iii++) { @@ -428,7 +470,7 @@ public class MathUtils { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = MathUtils.arrayMax( array ); + double maxValue = MathUtils.arrayMaxDouble( array ); for (int i = 0; i < array.size(); i++) normalized[i] = Math.pow(10, array.get(i) - maxValue); @@ -471,6 +513,18 @@ public class MathUtils { return maxI; } + public static int maxElementIndex(int[] array) { + if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = -1; + for ( int i = 0; i < array.length; i++ ) { + if ( maxI == -1 || array[i] > array[maxI] ) + maxI = i; + } + + return maxI; + } + public static double arrayMax(double[] array) { return array[maxElementIndex(array)]; } @@ -507,7 +561,7 @@ public class MathUtils { return minI; } - public static int arrayMax(List array) { + public static int arrayMaxInt(List array) { if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); @@ -516,7 +570,7 @@ public class MathUtils { return m; } - public static double arrayMax(List array) { + public static double arrayMaxDouble(List array) { if ( array == null 
) throw new IllegalArgumentException("Array cannot be null!"); if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); @@ -727,6 +781,38 @@ public class MathUtils { return count; } + public static int countOccurrences(byte element, byte [] array) { + int count = 0; + for (byte y : array) { + if (element == y) + count++; + } + + return count; + } + + /** + * Returns the top (larger) N elements of the array. Naive n^2 implementation (Selection Sort). + * Better than sorting if N (number of elements to return) is small + * + * @param array the array + * @param n number of top elements to return + * @return the n larger elements of the array + */ + public static Collection getNMaxElements(double [] array, int n) { + ArrayList maxN = new ArrayList(n); + double lastMax = Double.MAX_VALUE; + for (int i=0; i= x + MAX_JACOBIAN_TOLERANCE) - return y; - if (x >= y + MAX_JACOBIAN_TOLERANCE) - return x; + double diff = x-y; - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - - //double diff = Math.abs(x-y); - double diff = x-y; - double t1 =x; - if (diff<0) { // - t1 = y; - diff= -diff; - } - // t has max(x,y), diff has abs(x-y) - // we have pre-stored correction for 0,0.1,0.2,... 
10.0 - //int ind = (int)Math.round(diff*INV_JACOBIAN_LOG_TABLE_STEP); - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); - // gdebug+ - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - //gdebug- - return t1+jacobianLogTable[ind]; - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - } + if (diff > MAX_JACOBIAN_TOLERANCE) + return x; + else if (diff < -MAX_JACOBIAN_TOLERANCE) + return y; + else if (diff >= 0) { + int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + return x + jacobianLogTable[ind]; + } + else { + int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + return y + jacobianLogTable[ind]; + } + } public static double phredScaleToProbability (byte q) { return Math.pow(10,(-q)/10.0); @@ -1015,6 +1089,11 @@ public class MathUtils { return ((-q)/10.0); } + /** + * Returns the phred scaled value of probability p + * @param p probability (between 0 and 1). + * @return phred scaled probability of p + */ public static byte probabilityToPhredScale (double p) { return (byte) ((-10) * Math.log10(p)); } @@ -1274,5 +1353,4 @@ public class MathUtils { public static double log10Factorial (int x) { return log10Gamma(x+1); } - -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java new file mode 100644 index 000000000..c0493fe22 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to 
permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; +import org.broadinstitute.sting.utils.PathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +/** + * Generic service for executing RScripts in the GATK directory + * + * @author Your Name + * @since Date created + */ +public class RScriptExecutor { + /** + * our log + */ + protected static Logger logger = Logger.getLogger(RScriptExecutor.class); + + public static class RScriptArgumentCollection { + @Advanced + @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. 
For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) + public String PATH_TO_RSCRIPT = "Rscript"; + + @Advanced + @Argument(fullName = "path_to_Rresources", shortName = "Rresources", doc = "Path to resources folder holding the Sting R scripts.", required = false) + public List PATH_TO_RESOURCES = Arrays.asList("public/R/", "private/R/"); + + public RScriptArgumentCollection() {} + + /** For testing and convenience */ + public RScriptArgumentCollection(final String PATH_TO_RSCRIPT, final List PATH_TO_RESOURCES) { + this.PATH_TO_RSCRIPT = PATH_TO_RSCRIPT; + this.PATH_TO_RESOURCES = PATH_TO_RESOURCES; + } + } + + final RScriptArgumentCollection myArgs; + final boolean exceptOnError; + + public RScriptExecutor(final RScriptArgumentCollection myArgs, final boolean exceptOnError) { + this.myArgs = myArgs; + this.exceptOnError = exceptOnError; + } + + public void callRScripts(String scriptName, Object... scriptArgs) { + callRScripts(scriptName, Arrays.asList(scriptArgs)); + } + + public void callRScripts(String scriptName, List scriptArgs) { + try { + final File pathToScript = findScript(scriptName); + if ( pathToScript == null ) return; // we failed but shouldn't exception out + final String argString = Utils.join(" ", scriptArgs); + final String cmdLine = Utils.join(" ", Arrays.asList(myArgs.PATH_TO_RSCRIPT, pathToScript, argString)); + logger.info("Executing RScript: " + cmdLine); + Runtime.getRuntime().exec(cmdLine).waitFor(); + } catch (InterruptedException e) { + generateException(e); + } catch (IOException e) { + generateException("Fatal Exception: Perhaps RScript jobs are being spawned too quickly?", e); + } + } + + public File findScript(final String scriptName) { + for ( String pathToResource : myArgs.PATH_TO_RESOURCES ) { + final File f = new File(pathToResource + "/" + scriptName); + if ( f.exists() ) { + if ( f.canRead() ) + return f; + else + generateException("Script exists but couldn't be read: " + scriptName); + } + } + 
+ generateException("Couldn't find script: " + scriptName + " in " + myArgs.PATH_TO_RESOURCES); + return null; + } + + private void generateException(String msg) { + generateException(msg, null); + } + + private void generateException(Throwable e) { + generateException("", e); + } + + private void generateException(String msg, Throwable e) { + if ( exceptOnError ) + throw new UserException(msg, e); + else + logger.warn(msg + (e == null ? "" : ":" + e.getMessage())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 6a50badce..f6edb319f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -29,6 +29,7 @@ import net.sf.samtools.util.StringUtil; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.collections.Pair; +import java.net.InetAddress; import java.util.*; /** @@ -42,6 +43,21 @@ public class Utils { /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(Utils.class); + public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + + /** + * Calculates the optimum initial size for a hash table given the maximum number + * of elements it will need to hold. The optimum size is the smallest size that + * is guaranteed not to result in any rehash/table-resize operations. 
+ * + * @param maxElements The maximum number of elements you expect the hash table + * will need to hold + * @return The optimum initial size for the table, given maxElements + */ + public static int optimumHashSize ( int maxElements ) { + return (int)(maxElements / JAVA_DEFAULT_HASH_LOAD_FACTOR) + 2; + } + public static String getClassName(Class c) { String FQClassName = c.getName(); int firstChar; @@ -618,4 +634,20 @@ public class Utils { public static boolean isFlagSet(int value, int flag) { return ((value & flag) == flag); } + + /** + * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, + * unresolvable gets returned instead. + * + * @return + */ + public static final String resolveHostname() { + try { + return InetAddress.getLocalHost().getCanonicalHostName(); + } + catch (java.net.UnknownHostException uhe) { // [beware typo in code sample -dmw] + return "unresolvable"; + // handle exception + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java index e65b8f921..fa154fca3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java @@ -224,4 +224,14 @@ public class JVMUtils { throw new StingException("Unknown type: " + type + " (" + type.getClass().getName() + ")"); } } + + public static Class getParameterizedTypeClass(Type t) { + if ( t instanceof ParameterizedType ) { + ParameterizedType parameterizedType = (ParameterizedType)t; + if ( parameterizedType.getActualTypeArguments().length != 1 ) + throw new ReviewedStingException("BUG: more than 1 generic type found on class" + t); + return (Class)parameterizedType.getActualTypeArguments()[0]; + } else + throw new ReviewedStingException("BUG: could not find generic type on class " + t); + } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 8d37ff573..04cbef0c3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -172,7 +172,7 @@ public class PluginManager { } } - protected Map> getPluginsByName() { + public Map> getPluginsByName() { return Collections.unmodifiableMap(pluginsByName); } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 5449906b2..bc200372f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -1,11 +1,15 @@ package org.broadinstitute.sting.utils.clipreads; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import java.util.Iterator; +import java.util.Stack; import java.util.Vector; /** @@ -16,37 +20,28 @@ import java.util.Vector; * according to the wishes of the supplid ClippingAlgorithm enum. 
*/ public class ClippingOp { - public final ClippingType type; public final int start, stop; // inclusive - public final Object extraInfo; public ClippingOp(int start, int stop) { - this(null, start, stop, null); - } - - public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) { - // todo -- remove type and extra info - this.type = type; this.start = start; this.stop = stop; - this.extraInfo = extraInfo; } + public int getLength() { return stop - start + 1; } /** - * Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping + * Clips the bases in read according to this operation's start and stop. Uses the clipping * representation used is the one provided by algorithm argument. * * @param algorithm - * @param clippedRead + * @param read */ - public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord clippedRead) { - //clippedRead.setReferenceIndex(1); - byte[] quals = clippedRead.getBaseQualities(); - byte[] bases = clippedRead.getReadBases(); + public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord read) { + byte[] quals = read.getBaseQualities(); + byte[] bases = read.getReadBases(); switch (algorithm) { // important note: @@ -55,79 +50,65 @@ public class ClippingOp { case WRITE_NS: for (int i = start; i <= stop; i++) bases[i] = 'N'; - clippedRead.setReadBases(bases); + read.setReadBases(bases); break; case WRITE_Q0S: for (int i = start; i <= stop; i++) quals[i] = 0; - clippedRead.setBaseQualities(quals); + read.setBaseQualities(quals); break; case WRITE_NS_Q0S: for (int i = start; i <= stop; i++) { bases[i] = 'N'; quals[i] = 0; } - clippedRead.setReadBases(bases); - clippedRead.setBaseQualities(quals); + read.setReadBases(bases); + read.setBaseQualities(quals); break; case HARDCLIP_BASES: - case SOFTCLIP_BASES: - if ( ! 
clippedRead.getReadUnmappedFlag() ) { - // we can't process unmapped reads - - //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); - int myStop = stop; - if ( (stop + 1 - start) == clippedRead.getReadLength() ) { - // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone - //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); - //break; - myStop--; // just decrement stop - } - - if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) - throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", - clippedRead.getReadName(), start, myStop)); - - Cigar oldCigar = clippedRead.getCigar(); - - int scLeft = 0, scRight = clippedRead.getReadLength(); - if ( start == 0 ) - scLeft = myStop + 1; - else - scRight = start; - - Cigar newCigar = softClip(oldCigar, scLeft, scRight); - clippedRead.setCigar(newCigar); - - int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); - int newStart = clippedRead.getAlignmentStart() + newClippedStart; - clippedRead.setAlignmentStart(newStart); - - if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) - clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead); - //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); - } else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) { - // we can hard clip unmapped reads - if ( clippedRead.getReadNegativeStrandFlag() ) - clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null); - else - clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null); - } + read = hardClip(read, start, stop); break; + + case SOFTCLIP_BASES: + if ( read.getReadUnmappedFlag() ) { + // we can't process unmapped 
reads + throw new UserException("Read Clipper cannot soft clip unmapped reads"); + } + + //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); + int myStop = stop; + if ( (stop + 1 - start) == read.getReadLength() ) { + // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone + //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); + //break; + myStop--; // just decrement stop + } + + if ( start > 0 && myStop != read.getReadLength() - 1 ) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); + + Cigar oldCigar = read.getCigar(); + + int scLeft = 0, scRight = read.getReadLength(); + if ( start == 0 ) + scLeft = myStop + 1; + else + scRight = start; + + Cigar newCigar = softClip(oldCigar, scLeft, scRight); + read.setCigar(newCigar); + + int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); + int newStart = read.getAlignmentStart() + newClippedStart; + read.setAlignmentStart(newStart); + + break; + default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } - return clippedRead; - } - - /** - * What is the type of a ClippingOp? 
- */ - public enum ClippingType { - LOW_Q_SCORES, - WITHIN_CLIP_RANGE, - MATCHES_CLIP_SEQ + return read; } /** @@ -198,7 +179,7 @@ public class ClippingOp { Vector newElements = new Vector(); for (CigarElement curElem : __cigar.getCigarElements()) { if (!curElem.getOperator().consumesReadBases()) { - if (curLength > __startClipEnd && curLength < __endClipBegin) { + if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) { newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); } continue; @@ -265,4 +246,233 @@ public class ClippingOp { assert newCigar.isValid(null, -1) == null; return newCigar; } -} + + @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"}) + private SAMRecord hardClip (SAMRecord read, int start, int stop) { + if (start == 0 && stop == read.getReadLength() -1) + return new SAMRecord(read.getHeader()); + + // If the read is unmapped there is no Cigar string and neither should we create a new cigar string + CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); + + // the cigar may force a shift left or right (or both) in case we are left with insertions + // starting or ending the read after applying the hard clip on start/stop. + int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; + byte [] newBases = new byte[newLength]; + byte [] newQuals = new byte[newLength]; + int copyStart = (start == 0) ? 
stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; + + System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); + System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); + + SAMRecord hardClippedRead; + try { + hardClippedRead = (SAMRecord) read.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } + + hardClippedRead.setBaseQualities(newQuals); + hardClippedRead.setReadBases(newBases); + hardClippedRead.setCigar(cigarShift.cigar); + if (start == 0) + hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); + + return hardClippedRead; + + } + + @Requires({"!cigar.isEmpty()"}) + private CigarShift hardClipCigar (Cigar cigar, int start, int stop) { + Cigar newCigar = new Cigar(); + int index = 0; + int totalHardClipCount = stop - start + 1; + int alignmentShift = 0; // caused by hard clipping insertions or deletions + + // hard clip the beginning of the cigar string + if (start == 0) { + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + CigarElement cigarElement = cigarElementIterator.next(); + // Skip all leading hard clips + while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + totalHardClipCount += cigarElement.getLength(); + if (cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + else + throw new ReviewedStingException("Read is entirely hardclipped, shouldn't be trying to clip it's cigar string"); + } + // keep clipping until we hit stop + while (index <= stop) { + int shift = 0; + if (cigarElement.getOperator().consumesReadBases()) + shift = cigarElement.getLength(); + + // we're still clipping or just finished perfectly + if (index + shift == stop + 1) { + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + newCigar.add(new CigarElement(totalHardClipCount + 
alignmentShift, CigarOperator.HARD_CLIP)); + } + // element goes beyond what we need to clip + else if (index + shift > stop + 1) { + int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop-index+1); + newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); + newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); + } + index += shift; + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, shift); + + if (index <= stop && cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + } + + // add the remaining cigar elements + while (cigarElementIterator.hasNext()) { + cigarElement = cigarElementIterator.next(); + newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); + } + } + + // hard clip the end of the cigar string + else { + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + CigarElement cigarElement = cigarElementIterator.next(); + + // Keep marching on until we find the start + while (index < start) { + int shift = 0; + if (cigarElement.getOperator().consumesReadBases()) + shift = cigarElement.getLength(); + + // we haven't gotten to the start yet, keep everything as is. 
+ if (index + shift < start) + newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); + + // element goes beyond our clip starting position + else { + int elementLengthAfterChopping = start - index; + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index)); + + // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += elementLengthAfterChopping; + // otherwise, maintain what's left of this last operator + else + newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); + } + index += shift; + if (index < start && cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + } + + // check if we are hard clipping indels + while(cigarElementIterator.hasNext()) { + cigarElement = cigarElementIterator.next(); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + } + newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); + } + return cleanHardClippedCigar(newCigar); + } + + /** + * Checks if a hard clipped cigar left a read starting or ending with insertions/deletions + * and cleans it up accordingly. 
+ * + * @param cigar + * @return + */ + private CigarShift cleanHardClippedCigar(Cigar cigar) { + Cigar cleanCigar = new Cigar(); + int shiftFromStart = 0; + int shiftFromEnd = 0; + Stack cigarStack = new Stack(); + Stack inverseCigarStack = new Stack(); + + for (CigarElement cigarElement : cigar.getCigarElements()) + cigarStack.push(cigarElement); + + for (int i = 1; i <= 2; i++) { + int shift = 0; + boolean readHasStarted = false; + + while(!cigarStack.empty()) { + CigarElement cigarElement = cigarStack.pop(); + + if ( !readHasStarted && + cigarElement.getOperator() != CigarOperator.INSERTION && + cigarElement.getOperator() != CigarOperator.DELETION && + cigarElement.getOperator() != CigarOperator.HARD_CLIP) + readHasStarted = true; + else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION) + shift += cigarElement.getLength(); + + if (readHasStarted || cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + if (i==1) + inverseCigarStack.push(cigarElement); + else + cleanCigar.add(cigarElement); + } + } + // first pass (i=1) is from end to start of the cigar elements + if (i == 1) { + shiftFromEnd = shift; + cigarStack = inverseCigarStack; + } + // second pass (i=2) is from start to end with the end already cleaned + else { + shiftFromStart = shift; + } + } + return new CigarShift(cleanCigar, shiftFromStart, shiftFromEnd); + } + + private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { + int shift = 0; + + // Rewind to previous start (by counting everything that was already clipped in this read) + for (CigarElement cigarElement : oldCigar.getCigarElements()) { + if (!cigarElement.getOperator().consumesReferenceBases()) + shift -= cigarElement.getLength(); + else + break; + } + + // Advance to new start (by counting everything new that has been clipped ) + for (CigarElement cigarElement : newCigar.getCigarElements()) { + if (!cigarElement.getOperator().consumesReferenceBases()) + shift += cigarElement.getLength(); + 
else + break; + } + + return shift; + } + + private int calculateHardClippingAlignmentShift(CigarElement cigarElement, int clippedLength) { + if (cigarElement.getOperator() == CigarOperator.INSERTION) { + int cigarElementLength = cigarElement.getLength(); + if (clippedLength >= cigarElementLength) + return -cigarElement.getLength(); + else + return -clippedLength; + } + + if (cigarElement.getOperator() == CigarOperator.DELETION) + return cigarElement.getLength(); + + return 0; + } + + private class CigarShift { + private Cigar cigar; + private int shiftFromStart; + private int shiftFromEnd; + + private CigarShift(Cigar cigar, int shiftFromStart, int shiftFromEnd) { + this.cigar = cigar; + this.shiftFromStart = shiftFromStart; + this.shiftFromEnd = shiftFromEnd; + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java index 14c04b5c4..0dbe55726 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java @@ -4,9 +4,28 @@ package org.broadinstitute.sting.utils.clipreads; * How should we represent a clipped bases in a read? */ public enum ClippingRepresentation { - WRITE_NS, // change the bases to Ns - WRITE_Q0S, // change the quality scores to Q0 - WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns - SOFTCLIP_BASES, // change cigar string to S, but keep bases - HARDCLIP_BASES // remove the bases from the read + /** Clipped bases are changed to Ns */ + WRITE_NS, + + /** Clipped bases are changed to have Q0 quality score */ + WRITE_Q0S, + + /** Clipped bases are changed to have both an N base and a Q0 quality score */ + WRITE_NS_Q0S, + + /** + * Change the read's cigar string to soft clip (S, see sam-spec) away the bases.
+ * Note that this can only be applied to cases where the clipped bases occur + * at the start or end of a read. + */ + SOFTCLIP_BASES, + + /** + * Change the read's cigar string to hard clip (H, see sam-spec) away the bases. + * Hard clipping, unlike soft clipping, actually removes bases from the read, + * reducing the resulting file's size but introducing an irreversible (i.e., + * lossy) operation. Note that this can only be applied to cases where the clipped + * bases occur at the start or end of a read. + */ + HARDCLIP_BASES } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 988d297f6..26c25850a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -1,6 +1,9 @@ package org.broadinstitute.sting.utils.clipreads; +import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.ArrayList; import java.util.List; @@ -10,6 +13,7 @@ import java.util.List; */ public class ReadClipper { SAMRecord read; + boolean wasClipped; List ops = null; /** @@ -19,6 +23,7 @@ public class ReadClipper { */ public ReadClipper(final SAMRecord read) { this.read = read; + this.wasClipped = false; } /** @@ -36,13 +41,73 @@ public class ReadClipper { } public boolean wasClipped() { - return ops != null; + return wasClipped; } public SAMRecord getRead() { return read; } + public SAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { + return hardClipByReferenceCoordinates(-1, refStop); + } + + public SAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) { + return hardClipByReferenceCoordinates(refStart, -1); + } + + private SAMRecord hardClipByReferenceCoordinates(int refStart, int
refStop) { + int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); + int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); + + if (start < 0 || stop > read.getReadLength() - 1) + throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); + + // TODO add requires statement/check in the Hardclip function + if ( start > stop ) + stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read)); + + //System.out.println("Clipping start/stop: " + start + "/" + stop); + this.addOp(new ClippingOp(start, stop)); + SAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); + this.ops = null; + return clippedRead; + } + + public SAMRecord hardClipByReadCoordinates(int start, int stop) { + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + + @Requires("left <= right") + public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { + if (left == right) + return new SAMRecord(read.getHeader()); + this.read = hardClipByReferenceCoordinates(right, -1); + return hardClipByReferenceCoordinates(-1, left); + } + + public SAMRecord hardClipLowQualEnds(byte lowQual) { + byte [] quals = read.getBaseQualities(); + int leftClipIndex = 0; + int rightClipIndex = read.getReadLength() - 1; + + // check how far we can clip both sides + while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; + while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++; + + // if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? 
testing this for now) + if (leftClipIndex > rightClipIndex) + return (new SAMRecord(read.getHeader())); + + if (rightClipIndex < read.getReadLength() - 1) { + this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1)); + } + if (leftClipIndex > 0 ) { + this.addOp(new ClippingOp(0, leftClipIndex - 1)); + } + return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); + } /** * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. @@ -59,6 +124,7 @@ public class ReadClipper { for (ClippingOp op : getOps()) { clippedRead = op.apply(algorithm, clippedRead); } + wasClipped = true; return clippedRead; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); // this should never happen diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java index 5e536d4c1..413848543 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.beagle; +package org.broadinstitute.sting.utils.codecs.beagle; /* * Copyright (c) 2010 The Broad Institute * @@ -40,6 +40,29 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.regex.Pattern; +/** + * TODO GUILLERMO DEL ANGEL + * + *

    + * Codec Description + *

    + * + *

    + * See also: @see VCF specification
    + *

    + + *

    + * + *

    File format example

    + *
    + *     line 1
    + *     line 2
    + *     line 3
    + * 
    + * + * @author Mark DePristo + * @since 2010 + */ public class BeagleCodec implements ReferenceDependentFeatureCodec { private String[] header; public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2}; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java index e6832754d..0aa9ecba2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleFeature.java @@ -22,7 +22,7 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.beagle; +package org.broadinstitute.sting.utils.codecs.beagle; import org.broad.tribble.Feature; import org.broadinstitute.sting.utils.variantcontext.Allele; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java deleted file mode 100755 index fef6c4ea0..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/completegenomics/CGVarCodec.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, 
subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.completegenomics; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * a codec for the VAR file types produced by the Complete Genomics Institute - */ -public class CGVarCodec implements FeatureCodec { - - private static final String REF_TYPE = "ref"; - private static final String SNP_TYPE = "snp"; - private static final String DELETION_TYPE = "del"; - private static final String INSERTION_TYPE = "ins"; - private static final String SUBSTITUTION_TYPE = "sub"; - - // the minimum number of features in the CG file line - private static final int minimumFeatureCount = 8; - - /** - * decode the location only - * @param line the input line to decode - * @return a HapMapFeature - */ - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * decode the CG record - * @param line the input line to decode - * @return a VariantContext - */ - public Feature decode(String line) { - String[] array = 
line.split("\\s+"); - - // make sure the split was successful - that we got an appropriate number of fields - if ( array.length < minimumFeatureCount ) - return null; - - String type = array[6]; - - long start = Long.valueOf(array[4]); - long end; - Allele ref, alt = null; - - //System.out.println(line); - - if ( type.equals(SNP_TYPE) ) { - ref = Allele.create(array[7], true); - alt = Allele.create(array[8], false); - end = start; - } else if ( type.equals(INSERTION_TYPE) ) { - ref = Allele.create(Allele.NULL_ALLELE_STRING, true); - alt = Allele.create(array[7], false); - end = start; - } else if ( type.equals(DELETION_TYPE) ) { - ref = Allele.create(array[7], true); - alt = Allele.create(Allele.NULL_ALLELE_STRING, false); - end = start + ref.length(); - //} else if ( type.equals(REF_TYPE) ) { - // ref = Allele.create("N", true); // ref bases aren't accurate - // start++; - // end = start; - //} else if ( type.equals(SUBSTITUTION_TYPE) ) { - // ref = Allele.create(array[7], true); - // alt = Allele.create(array[8], false); - // end = start + Math.max(ref.length(), alt.length()); - } else { - return null; // we don't handle other types - } - - HashSet alleles = new HashSet(); - alleles.add(ref); - if ( alt != null ) - alleles.add(alt); - - HashMap attrs = new HashMap(); - String id = array[array.length - 1]; - if ( id.indexOf("dbsnp") != -1 ) { - attrs.put(VariantContext.ID_KEY, parseID(id)); - } - - // create a new feature given the array - return new VariantContext("CGI", array[3], start, end, alleles, VariantContext.NO_NEG_LOG_10PERROR, null, attrs); - } - - public Class getFeatureType() { - return VariantContext.class; - } - - // There's no spec and no character to distinguish header lines... 
- private final static int NUM_HEADER_LINES = 12; - public Object readHeader(LineReader reader) { - return null; - - //String headerLine = null; - //try { - // for (int i = 0; i < NUM_HEADER_LINES; i++) - // headerLine = reader.readLine(); - //} catch (IOException e) { - // throw new IllegalArgumentException("Unable to read a line from the line reader"); - //} - //return headerLine; - } - - private static final Pattern DBSNP_PATTERN = Pattern.compile("^dbsnp\\.\\d+:(.*)"); - private String parseID(String raw) { - StringBuilder sb = null; - - String[] ids = raw.split(";"); - for ( String id : ids ) { - Matcher matcher = DBSNP_PATTERN.matcher(id); - if ( matcher.matches() ) { - String rsID = matcher.group(1); - if ( sb == null ) { - sb = new StringBuilder(rsID); - } else { - sb.append(";"); - sb.append(rsID); - } - } - } - - return sb == null ? null : sb.toString(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java deleted file mode 100644 index bd95e0f3d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapCodec.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.codecs.hapmap; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.annotation.Strand; -import org.broad.tribble.readers.LineReader; - -import java.io.IOException; -import java.util.Arrays; - -/** - * a codec for the file types produced by the HapMap consortium, available on their website: - * http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/ - * - * The format includes eleven standard fields, plus genotypes for each of the samples included - * in the file - * - */ -public class HapMapCodec implements FeatureCodec { - // the minimum number of features in the HapMap file line - private static final int minimumFeatureCount = 11; - - private String headerLine; - /** - * decode the location only - * @param line the input line to decode - * @return a HapMapFeature - */ - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * decode the hapmap record - * @param line the input line to decode - * @return a HapMapFeature, with the given fields - */ - public Feature decode(String line) { - String[] array = line.split("\\s+"); - - // make sure the split was successful - that we got an appropriate number of fields - if (array.length < minimumFeatureCount) - throw new IllegalArgumentException("Unable to parse line " + line + ", the length of split features is less than the minimum of " + minimumFeatureCount); - - // create a new feature given the array - return new 
HapMapFeature(array[0], - array[1].split("/"), - array[2], - Long.valueOf(array[3]), - Strand.toStrand(array[4]), - array[5], - array[6], - array[7], - array[8], - array[9], - array[10], - Arrays.copyOfRange(array,11,array.length), - headerLine); - } - - public Class getFeatureType() { - return HapMapFeature.class; - } - - public Object readHeader(LineReader reader) { - try { - headerLine = reader.readLine(); - } catch (IOException e) { - throw new IllegalArgumentException("Unable to read a line from the line reader"); - } - return headerLine; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java new file mode 100644 index 000000000..a80e05d59 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.hapmap; + +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.LineReader; + +import java.io.IOException; +import java.util.Arrays; + +/** + * A codec for the file types produced by the HapMap consortium + * + *

    + * The format includes eleven standard fields, plus genotypes for each of the samples included + * in the file: + * + *

    + *     Col1: refSNP rs# identifier at the time of release (NB might merge with another rs# in the future)
    + *     Col2: SNP alleles according to dbSNP
    + *     Col3: chromosome that SNP maps to
    + *     Col4: chromosome position of SNP, in basepairs on reference sequence
    + *     Col5: strand of reference sequence that SNP maps to
    + *     Col6: version of reference sequence assembly
    + *     Col7: HapMap genotype center that produced the genotypes
    + *     Col8: LSID for HapMap protocol used for genotyping
    + *     Col9: LSID for HapMap assay used for genotyping
    + *     Col10: LSID for panel of individuals genotyped
    + *     Col11: QC-code, currently 'QC+' for all entries (for future use)
    + *     Col12 and on: observed genotypes of samples, one per column, sample identifiers in column headers (Coriell catalog numbers, example: NA10847). Duplicate samples have .dup suffix.
    + * 
    + *

    + * + *

    + * See also: @see HapMap genotypes download + *

    + * + *

    File format example

    + * From genotypes_chr1_ASW_r27_nr.b36_fwd.txt.gz: + *
    + *     rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode NA19625 NA19700 NA19701 NA19702 NA19703 NA19704 NA19705 NA19708 NA19712 NA19711 NA19818 NA19819 NA19828 NA19835 NA19834 NA19836 NA19902 NA19901 NA19900 NA19904 NA19919 NA19908 NA19909 NA19914 NA19915 NA19916 NA19917 NA19918 NA19921 NA20129 NA19713 NA19982 NA19983 NA19714 NA19985 NA20128 NA20126 NA20127 NA20277 NA20276 NA20279 NA20282 NA20281 NA20284 NA20287 NA20288 NA20290 NA20289 NA20291 NA20292 NA20295 NA20294 NA20297 NA20300 NA20301 NA20302 NA20317 NA20319 NA20322 NA20333 NA20332 NA20335 NA20334 NA20337 NA20336 NA20340 NA20341 NA20343 NA20342 NA20344 NA20345 NA20346 NA20347 NA20348 NA20349 NA20350 NA20357 NA20356 NA20358 NA20359 NA20360 NA20363 NA20364
    + *     rs9629043 C/T chr1 554636 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8575115:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ CC CC CC CC CC CC CC CC CC CC CC CC NN CC CC CC CT CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CT CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC CC
    + *     rs28446478 G/T chr1 576058 + ncbi_b36 sanger urn:LSID:illumina.hapmap.org:Protocol:Human_1M_BeadChip:3 urn:LSID:sanger.hapmap.org:Assay:H1Mrs28446478:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GT TT GT TT TT TT TT GT GT TT TT TT TT GT GT GT GT TT GT TT GT GT TT GT GT TT TT TT GT GT TT TT TT GT TT GT TT GT GT GT GT GT TT GT TT TT GT GT TT TT TT TT TT TT GT GT GT GT TT TT TT TT GT TT GT TT TT GT TT TT TT GT TT TT TT GT GT TT GT TT GT TT TT
    + *     rs12565286 C/G chr1 711153 + ncbi_b36 broad urn:LSID:affymetrix.hapmap.org:Protocol:GenomeWideSNP_6.0:3 urn:LSID:broad.hapmap.org:Assay:SNP_A-8709646:3 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:3 QC+ GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG GG CG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG CG CG GG GG GG GG GG GG GG GG GG GG CG NN GG GG GG GG GG GG NN GG NN NN
    + * 
    + * + * @author Mark DePristo + * @since 2010 + */ +public class RawHapMapCodec implements FeatureCodec { + // the minimum number of features in the HapMap file line + private static final int minimumFeatureCount = 11; + + private String headerLine; + /** + * decode the location only + * @param line the input line to decode + * @return a RawHapMapFeature + */ + public Feature decodeLoc(String line) { + return decode(line); + } + + /** + * decode the hapmap record + * @param line the input line to decode + * @return a RawHapMapFeature, with the given fields + */ + public Feature decode(String line) { + String[] array = line.split("\\s+"); + + // make sure the split was successful - that we got an appropriate number of fields + if (array.length < minimumFeatureCount) + throw new IllegalArgumentException("Unable to parse line " + line + ", the length of split features is less than the minimum of " + minimumFeatureCount); + + // create a new feature given the array + return new RawHapMapFeature(array[0], + array[1].split("/"), + array[2], + Long.valueOf(array[3]), + Strand.toStrand(array[4]), + array[5], + array[6], + array[7], + array[8], + array[9], + array[10], + Arrays.copyOfRange(array,11,array.length), + headerLine); + } + + public Class getFeatureType() { + return RawHapMapFeature.class; + } + + public Object readHeader(LineReader reader) { + try { + headerLine = reader.readLine(); + } catch (IOException e) { + throw new IllegalArgumentException("Unable to read a line from the line reader"); + } + return headerLine; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java similarity index 88% rename from public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java index 7a47a4b8d..d0480a90b 100755 ---
a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/HapMapFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapFeature.java @@ -35,8 +35,9 @@ import java.util.Map; * a feature returned by the HapMap Codec - it represents contig, position, name, * alleles, other hapmap information, and genotypes for specified samples */ -public class HapMapFeature implements Feature { +public class RawHapMapFeature implements Feature { + public static final String NULL_ALLELE_STRING = "-"; public static final String INSERTION = "I"; public static final String DELETION = "D"; @@ -71,19 +72,19 @@ public class HapMapFeature implements Feature { * @param qccode ?? * @param genotypes a list of strings, representing the genotypes for the list of samples */ - public HapMapFeature(String name, - String[] alleles, - String contig, - Long position, - Strand strand, - String assembly, - String center, - String protLSID, - String assayLSID, - String panelLSID, - String qccode, - String[] genotypes, - String headerLine) { + public RawHapMapFeature(String name, + String[] alleles, + String contig, + Long position, + Strand strand, + String assembly, + String center, + String protLSID, + String assayLSID, + String panelLSID, + String qccode, + String[] genotypes, + String headerLine) { this.name = name; this.alleles = alleles; this.contig = contig; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java similarity index 80% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index 461aab9a5..d94d9ff84 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -1,98 +1,125 
@@ -package org.broadinstitute.sting.gatk.refdata.features.refseq; - -import org.broad.tribble.Feature; -import org.broad.tribble.TribbleException; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.ArrayList; - -/** - * the ref seq codec - */ -public class RefSeqCodec implements ReferenceDependentFeatureCodec { - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - @Override - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - @Override - public Feature decodeLoc(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); - String contig_name = fields[2]; - try { - return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - } catch ( UserException.MalformedGenomeLoc e ) { - Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); - return null; - } - } - - /** Fills this object from a text line in RefSeq (UCSC) text dump file */ - @Override - public RefSeqFeature decode(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - - // we reference postion 15 in the split array below, make sure we have at least that many columns - if 
(fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); - String contig_name = fields[2]; - RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - - feature.setTranscript_id(fields[1]); - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); - else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); - - - feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]))); - feature.setGene_name(fields[12]); - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - if ( exon_starts.length != exon_stops.length ) - throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); - if ( exon_starts.length != eframes.length ) - throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); - - ArrayList exons = new ArrayList(exon_starts.length); - ArrayList exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - exon_frames.add(Integer.decode(eframes[i])); - } - - feature.setExons(exons); - feature.setExon_frames(exon_frames); - return feature; - } - - @Override - public Object readHeader(LineReader reader) { - 
return null; - } - - @Override - public Class getFeatureType() { - return RefSeqCodec.class; - } -} +package org.broadinstitute.sting.utils.codecs.refseq; + +import org.broad.tribble.Feature; +import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.ArrayList; + +/** + * TODO FOR CHRIS HARTL + * + *

    + * Decodes lines of a RefSeq (UCSC) text dump file into RefSeqFeature records + *

    + * + *

    + * See also: link to file specification + *

    + * + *

    File format example

    + *

    + * A tab-separated RefSeq (UCSC) text dump line with at least 16 columns. + *

    + * + * @author Mark DePristo + * @since 2010 + */ +public class RefSeqCodec implements ReferenceDependentFeatureCodec { + + /** + * The parser to use when resolving genome-wide locations. + */ + private GenomeLocParser genomeLocParser; + private boolean zero_coding_length_user_warned = false; + /** + * Set the parser to use when resolving genetic data. + * @param genomeLocParser The supplied parser. + */ + @Override + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + @Override + public Feature decodeLoc(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); + String contig_name = fields[2]; + try { + return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + } catch ( UserException.MalformedGenomeLoc e ) { + Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); + return null; + } + } + + /** Fills this object from a text line in RefSeq (UCSC) text dump file */ + @Override + public RefSeqFeature decode(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + + // we reference postion 15 in the split array below, make sure we have at least that many columns + if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); + String contig_name = fields[2]; + RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + + feature.setTranscript_id(fields[1]); + if ( fields[3].length()==1 && fields[3].charAt(0)=='+') 
feature.setStrand(1); + else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); + else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); + + int coding_start = Integer.parseInt(fields[6])+1; + int coding_stop = Integer.parseInt(fields[7]); + + if ( coding_start > coding_stop ) { + if ( ! zero_coding_length_user_warned ) { + Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+ + "Such transcripts will be ignored (this warning is printed only once)"); + zero_coding_length_user_warned = true; + } + return null; + } + + feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop)); + feature.setGene_name(fields[12]); + String[] exon_starts = fields[9].split(","); + String[] exon_stops = fields[10].split(","); + String[] eframes = fields[15].split(","); + + if ( exon_starts.length != exon_stops.length ) + throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); + if ( exon_starts.length != eframes.length ) + throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); + + ArrayList exons = new ArrayList(exon_starts.length); + ArrayList exon_frames = new ArrayList(eframes.length); + + for ( int i = 0 ; i < exon_starts.length ; i++ ) { + exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + exon_frames.add(Integer.decode(eframes[i])); + } + + feature.setExons(exons); + feature.setExon_frames(exon_frames); + return feature; + } + + @Override + public Object readHeader(LineReader reader) { + return null; + } + + @Override + public Class getFeatureType() { + return 
RefSeqFeature.class; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java index d12114f9a..c04ca8592 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/refseq/RefSeqFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java @@ -1,7 +1,6 @@ -package org.broadinstitute.sting.gatk.refdata.features.refseq; +package org.broadinstitute.sting.utils.codecs.refseq; import org.broad.tribble.Feature; -import org.broadinstitute.sting.gatk.refdata.Transcript; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java index b8a0868dd..3e8a4fb34 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/Transcript.java @@ -1,53 +1,53 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.HasGenomeLocation; - -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 22, 2009 - * Time: 5:22:30 PM - * To change this template use File | Settings | File Templates. 
- */ -public interface Transcript extends HasGenomeLocation { - - /** Returns id of the transcript (RefSeq NM_* id) */ - public String getTranscriptId(); - /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ - public int getStrand(); - /** Returns transcript's full genomic interval (includes all exons with UTRs) */ - public GenomeLoc getLocation(); - /** Returns genomic interval of the coding sequence (does not include - * UTRs, but still includes introns, since it's a single interval on the DNA) - */ - public GenomeLoc getCodingLocation(); - /** Name of the gene this transcript corresponds to (typically NOT gene id such as Entrez etc, - * but the implementation can decide otherwise) - */ - public String getGeneName(); - /** Number of exons in this transcript */ - public int getNumExons(); - /** Genomic location of the n-th exon; expected to throw an exception (runtime) if n is out of bounds */ - public GenomeLoc getExonLocation(int n); - - /** Returns the list of all exons in this transcript, as genomic intervals */ - public List getExons(); - - /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ - public boolean overlapsP (GenomeLoc that); - - /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. - * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, - * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. 
- * @see #overlapsExonP - */ - public boolean overlapsCodingP (GenomeLoc that); - - /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ - public boolean overlapsExonP (GenomeLoc that); - - -} +package org.broadinstitute.sting.utils.codecs.refseq; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Sep 22, 2009 + * Time: 5:22:30 PM + * To change this template use File | Settings | File Templates. + */ +public interface Transcript extends HasGenomeLocation { + + /** Returns id of the transcript (RefSeq NM_* id) */ + public String getTranscriptId(); + /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ + public int getStrand(); + /** Returns transcript's full genomic interval (includes all exons with UTRs) */ + public GenomeLoc getLocation(); + /** Returns genomic interval of the coding sequence (does not include + * UTRs, but still includes introns, since it's a single interval on the DNA) + */ + public GenomeLoc getCodingLocation(); + /** Name of the gene this transcript corresponds to (typically NOT gene id such as Entrez etc, + * but the implementation can decide otherwise) + */ + public String getGeneName(); + /** Number of exons in this transcript */ + public int getNumExons(); + /** Genomic location of the n-th exon; expected to throw an exception (runtime) if n is out of bounds */ + public GenomeLoc getExonLocation(int n); + + /** Returns the list of all exons in this transcript, as genomic intervals */ + public List getExons(); + + /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ + public boolean overlapsP (GenomeLoc that); + + /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this 
transcript. + * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, + * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. + * @see #overlapsExonP + */ + public boolean overlapsCodingP (GenomeLoc that); + + /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ + public boolean overlapsExonP (GenomeLoc that); + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java similarity index 86% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java index 43e2c3ff5..f4633b2ce 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.sampileup; +package org.broadinstitute.sting.utils.codecs.sampileup; import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; @@ -35,13 +35,46 @@ import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; -import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature.VariantType; +import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; /** - * A Tribble encoder / decoder for SAM pileup data. + * Decoder for SAM pileup data. For GATK validation purposes only * - * @author mhanna - * @version 0.1 + *

    + * Pileup format was first used by Tony Cox and Zemin Ning at the Sanger Institute. + * It describes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eye. + *

    + *

    + * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

    + * + *

    + *
    See also: @see SAMTools project
    + *
    See also: @see Pileup format
    + *

    + * + *

    File format example

    + *
    + *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
    + *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
    + *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
    + *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
    + *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
    + *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
    + *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
    + *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
    + * 
    + * + * @author Matt Hanna + * @since 2009 */ public class SAMPileupCodec implements FeatureCodec { // the number of tokens we expect to parse from a pileup line diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java index 378f26934..eb33243e3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.sampileup; +package org.broadinstitute.sting.utils.codecs.sampileup; import net.sf.samtools.util.StringUtil; import org.broad.tribble.Feature; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java similarity index 89% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java index 039b8adde..d4bdb5aa9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.refdata.features.samread; +package org.broadinstitute.sting.utils.codecs.samread; import net.sf.samtools.Cigar; import net.sf.samtools.TextCigarCodec; @@ -36,8 +36,21 @@ import org.broad.tribble.util.ParsingUtils; /** * Decodes a simple SAM text string. * - * @author mhanna - * @version 0.1 + *

    + * Reads in the SAM text version of a BAM file as a ROD. For testing only + *

    + * + *

    + * See also: @see SAMTools for format specification + *

    + * + *

    File format example

    + *
    + *     SL-XBC:1:10:628:923#0	16	Escherichia_coli_K12	1	37	76M	=	1	0	AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA	B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB
    + * 
    + * + * @author Matt Hanna + * @since 2009 */ public class SAMReadCodec implements FeatureCodec { /* SL-XBC:1:10:628:923#0 16 Escherichia_coli_K12 1 37 76M = 1 0 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA B@>87<;A@?@957:>>@AA@B>@A9AB@B>@A@@@@@A;=AAB@BBBBBCBBBB@>A>:ABB@BAABCB=CA@CB */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java index 7f12b2b2f..fc1bf89af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/samread/SAMReadFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadFeature.java @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.features.samread; +package org.broadinstitute.sting.utils.codecs.samread; import org.broad.tribble.Feature; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java deleted file mode 100755 index e169dbdfc..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/soapsnp/SoapSNPCodec.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.utils.codecs.soapsnp; - -import org.broad.tribble.Feature; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.NameAwareCodec; -import org.broad.tribble.TribbleException; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; - -/** - * @author depristo - *

    - * a codec for parsing soapsnp files (see http://soap.genomics.org.cn/soapsnp.html#usage2) - *

    - * - * A simple text file format with the following whitespace separated fields: - * -1) Chromosome ID -2) Coordinate on chromosome, start from 1 -3) Reference genotype -4) Consensus genotype -5) Quality score of consensus genotype -6) Best base -7) Average quality score of best base -8) Count of uniquely mapped best base -9) Count of all mapped best base -10) Second best bases -11) Average quality score of second best base -12) Count of uniquely mapped second best base -13) Count of all mapped second best base -14) Sequencing depth of the site -15) Rank sum test p_value -16) Average copy number of nearby region -17) Whether the site is a dbSNP. - */ -public class SoapSNPCodec implements FeatureCodec, NameAwareCodec { - private String[] parts; - - // we store a name to give to each of the variant contexts we emit - private String name = "Unknown"; - - public Feature decodeLoc(String line) { - return decode(line); - } - - /** - * Decode a line as a Feature. - * - * @param line - * - * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is - * a comment) - */ - public Feature decode(String line) { - try { - // parse into lines - parts = line.trim().split("\\s+"); - - // check that we got the correct number of tokens in the split - if (parts.length != 18) - throw new CodecLineParsingException("Invalid SoapSNP row found -- incorrect element count. 
Expected 18, got " + parts.length + " line = " + line); - - String contig = parts[0]; - long start = Long.valueOf(parts[1]); - AlleleAndGenotype allelesAndGenotype = parseAlleles(parts[2], parts[3], line); - - double negLog10PError = Integer.valueOf(parts[4]) / 10.0; - - Map attributes = new HashMap(); - attributes.put("BestBaseQ", parts[6]); - attributes.put("SecondBestBaseQ", parts[10]); - attributes.put("RankSumP", parts[15]); - // add info to keys - - //System.out.printf("Alleles = " + allelesAndGenotype.alleles); - //System.out.printf("genotype = " + allelesAndGenotype.genotype); - - VariantContext vc = new VariantContext(name, contig, start, start, allelesAndGenotype.alleles, allelesAndGenotype.genotype, negLog10PError, VariantContext.PASSES_FILTERS, attributes); - - //System.out.printf("line = %s%n", line); - //System.out.printf("vc = %s%n", vc); - - return vc; - } catch (CodecLineParsingException e) { - throw new TribbleException("Unable to parse line " + line,e); - } catch (NumberFormatException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - throw new TribbleException("Unable to parse line " + line,e); - } - } - - private static class AlleleAndGenotype { - Collection alleles; - Collection genotype; - - public AlleleAndGenotype(Collection alleles, Genotype genotype) { - this.alleles = alleles; - this.genotype = new HashSet(); - this.genotype.add(genotype); - } - } - - private AlleleAndGenotype parseAlleles(String ref, String consensusGenotype, String line) { - /* A Adenine - C Cytosine - G Guanine - T (or U) Thymine (or Uracil) - R A or G - Y C or T - S G or C - W A or T - K G or T - M A or C - B C or G or T - D A or G or T - H A or C or T - V A or C or G - N any base - . 
or - gap - */ - if ( ref.equals(consensusGenotype) ) - throw new TribbleException.InternalCodecException("Ref base and consensus genotype are the same " + ref); - - Allele refAllele = Allele.create(ref, true); - List genotypeAlleles = null; - - char base = consensusGenotype.charAt(0); - - switch ( base ) { - case 'A': case 'C': case 'G': case 'T': - Allele a = Allele.create(consensusGenotype); - genotypeAlleles = Arrays.asList(a, a); - break; - case 'R': case 'Y': case 'S': case 'W': case 'K': case 'M': - genotypeAlleles = determineAlt(refAllele, ref.charAt(0), base); - break; - default: - throw new TribbleException("Unexpected consensus genotype " + consensusGenotype + " at line = " + line); - } - - - Collection alleles = new HashSet(genotypeAlleles); - alleles.add(refAllele); - Genotype genotype = new Genotype("unknown", genotypeAlleles); // todo -- probably should include genotype quality - - return new AlleleAndGenotype( alleles, genotype ); - } - - private static final Map IUPAC_SNPS = new HashMap(); - static { - IUPAC_SNPS.put('R', "AG"); - IUPAC_SNPS.put('Y', "CT"); - IUPAC_SNPS.put('S', "GC"); - IUPAC_SNPS.put('W', "AT"); - IUPAC_SNPS.put('K', "GT"); - IUPAC_SNPS.put('M', "AC"); - } - - private List determineAlt(Allele ref, char refbase, char alt) { - String alts = IUPAC_SNPS.get(alt); - if ( alts == null ) - throw new IllegalStateException("BUG: unexpected consensus genotype " + alt); - - Allele a1 = alts.charAt(0) == refbase ? ref : Allele.create((byte)alts.charAt(0)); - Allele a2 = alts.charAt(1) == refbase ? 
ref : Allele.create((byte)alts.charAt(1)); - - //if ( a1 != ref && a2 != ref ) - // throw new IllegalStateException("BUG: unexpected consensus genotype " + alt + " does not contain the reference base " + ref); - - return Arrays.asList(a1, a2); - } - - /** - * @return VariantContext - */ - public Class getFeatureType() { - return VariantContext.class; - } - - public Object readHeader(LineReader reader) { - - return null; // we don't have a meaningful header - } - - /** - * get the name of this codec - * @return our set name - */ - public String getName() { - return name; - } - - /** - * set the name of this codec - * @param name new name - */ - public void setName(String name) { - this.name = name; - } - - public static void main(String[] args) { - System.out.printf("Testing " + args[0]); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java similarity index 73% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java index 745ccdd9f..fdcc8ed10 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/BedTableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/BedTableCodec.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.table; +package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; @@ -6,14 +6,19 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import java.util.Arrays; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 3/28/11 - * Time: 2:47 PM - * To change this template use File | Settings | File Templates. 
- */ -/** - * The standard table codec with a slightly different parsing convention (expects loci as contig start stop, not contig:start-stop) + * The standard table codec that expects loci as contig start stop, not contig:start-stop + * + *

    + * The standard table codec with a slightly different parsing convention + * (expects loci as contig start stop, not contig:start-stop) + *

    + * + *

    + * See also: TableCodec + *

    + * + * @author Chris Hartl + * @since 2010 */ public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java similarity index 64% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index ab1ac59d8..1919ccbf0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.table; +package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.Feature; import org.broad.tribble.readers.LineReader; @@ -11,13 +11,40 @@ import java.util.ArrayList; import java.util.Arrays; /** - * implementation of a simple table (tab or comma delimited format) input files + * Reads tab delimited tabular text files + * + *

    + *

      + *
    • Header: must begin with line HEADER or track (for IGV), followed by any number of column names, + * separated by whitespace.
    • + *
    • Comment lines starting with # are ignored
    • + *
    • Each non-header and non-comment line is split into parts by whitespace, + * and these parts are assigned as a map to their corresponding column name in the header. + * Note that the first element (corresponding to the HEADER column) must be a valid genome loc + * such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome. TableCodec + * requires that there be one value for each column in the header, and no more, on all lines.
    • + *
    + *

    + * + *

    + * + *

    File format example

    + *
    + *     HEADER a b c
    + *     1:1  1   2   3
    + *     1:2  4   5   6
    + *     1:3  7   8   9
    + * 
    + * + * @author Mark DePristo + * @since 2009 */ public class TableCodec implements ReferenceDependentFeatureCodec { - protected String delimiterRegex = "\\s+"; - protected String headerDelimiter = "HEADER"; - protected String igvHeaderDelimiter = "track"; - protected String commentDelimiter = "#"; + final static protected String delimiterRegex = "\\s+"; + final static protected String headerDelimiter = "HEADER"; + final static protected String igvHeaderDelimiter = "track"; + final static protected String commentDelimiter = "#"; + protected ArrayList header = new ArrayList(); /** @@ -51,7 +78,7 @@ public class TableCodec implements ReferenceDependentFeatureCodec { } @Override - public Class getFeatureType() { + public Class getFeatureType() { return TableFeature.class; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java index ca73ee960..a85849f0b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.refdata.features.table; +package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.Feature; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 710127f7a..bb212e128 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -7,16 +7,19 @@ 
import org.broad.tribble.NameAwareCodec; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.*; import java.util.*; +import java.util.zip.GZIPInputStream; -public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser { +public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser, SelfScopingFeatureCodec { protected final static Logger log = Logger.getLogger(VCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column @@ -151,9 +154,45 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { - return reallyDecode(line); + String[] locParts = new String[6]; + ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true); + + // get our alleles (because the end position depends on them) + String ref = getCachedString(locParts[3].toUpperCase()); + String alts = getCachedString(locParts[4].toUpperCase()); + List alleles = parseAlleles(ref, alts, lineNo); + + // find out our location + int start = Integer.valueOf(locParts[1]); + int stop = start; + + // ref alleles don't need to be single bases for monomorphic sites + if ( alleles.size() == 1 ) { + stop = start + alleles.get(0).length() - 1; + } else if ( !isSingleNucleotideEvent(alleles) ) { + stop = clipAlleles(start, ref, alleles, null, lineNo); + } + + return new 
VCFLocFeature(locParts[0], start, stop); } + private final static class VCFLocFeature implements Feature { + + final String chr; + final int start, stop; + + private VCFLocFeature(String chr, int start, int stop) { + this.chr = chr; + this.start = start; + this.stop = stop; + } + + public String getChr() { return chr; } + public int getStart() { return start; } + public int getEnd() { return stop; } + } + + /** * decode the line into a feature (VariantContext) * @param line the line @@ -204,7 +243,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // parse out the required fields String contig = getCachedString(parts[0]); - long pos = Long.valueOf(parts[1]); + int pos = Integer.valueOf(parts[1]); String id = null; if ( parts[2].length() == 0 ) generateException("The VCF specification requires a valid ID field"); @@ -224,7 +263,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, Map attributes = parseInfo(info, id); // find out our current location, and clip the alleles down to their minimum length - long loc = pos; + int loc = pos; // ref alleles don't need to be single bases for monomorphic sites if ( alleles.size() == 1 ) { loc = pos + alleles.get(0).length() - 1; @@ -242,7 +281,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VariantContext vc = null; try { - vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes); + vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]); } catch (Exception e) { generateException(e.getMessage()); } @@ -251,15 +290,14 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, if ( !header.samplesWereAlreadySorted() ) vc.getGenotypes(); - // Trim bases of all alleles if necessary - return createVariantContextWithTrimmedAlleles(vc); + return vc; } /** * * @return the type of record */ - public Class getFeatureType() { + public Class 
getFeatureType() { return VariantContext.class; } @@ -477,25 +515,44 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, return true; } - private static int computeForwardClipping(List unclippedAlleles, String ref) { + public static int computeForwardClipping(List unclippedAlleles, String ref) { boolean clipping = true; - // Note that the computation of forward clipping here is meant only to see whether there is a common - // base to all alleles, and to correctly compute reverse clipping, - // but it is not used for actually changing alleles - this is done in function - // createVariantContextWithTrimmedAlleles() below. - for (Allele a : unclippedAlleles) { - if (a.isSymbolic()) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) continue; - } - if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) { + + if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) { clipping = false; + break; } } - return (clipping) ? 1 : 0; + return (clipping) ? 
1 : 0; } + protected static int computeReverseClipping(List unclippedAlleles, String ref, int forwardClipping, int lineNo) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 ) + stillClipping = false; + else if ( ref.length() == clipping ) + generateException("bad alleles encountered", lineNo); + else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] ) + stillClipping = false; + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } /** * clip the alleles, based on the reference * @@ -503,118 +560,50 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, * @param ref the reference string * @param unclippedAlleles the list of unclipped alleles * @param clippedAlleles output list of clipped alleles - * @return a list of alleles, clipped to the reference + * @param lineNo the current line number in the file + * @return the new reference end position of this event */ - protected static long clipAlleles(long position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { - - // Note that the computation of forward clipping here is meant only to see whether there is a common - // base to all alleles, and to correctly compute reverse clipping, - // but it is not used for actually changing alleles - this is done in function - // createVariantContextWithTrimmedAlleles() below. 
+ protected static int clipAlleles(int position, String ref, List unclippedAlleles, List clippedAlleles, int lineNo) { int forwardClipping = computeForwardClipping(unclippedAlleles, ref); + int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo); - int reverseClipped = 0; - boolean clipping = true; - while (clipping) { - for (Allele a : unclippedAlleles) { - if (a.isSymbolic()) { - continue; + if ( clippedAlleles != null ) { + for ( Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) { + clippedAlleles.add(a); + } else { + clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference())); } - if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0) - clipping = false; - else if (ref.length() == reverseClipped) - generateException("bad alleles encountered", lineNo); - else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) - clipping = false; - } - if (clipping) reverseClipped++; - } - - for (Allele a : unclippedAlleles) { - if (a.isSymbolic()) { - clippedAlleles.add(a); - } else { - clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference())); } } // the new reference length - int refLength = ref.length() - reverseClipped; + int refLength = ref.length() - reverseClipping; return position+Math.max(refLength - 1,0); } - public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { - // see if we need to trim common reference base from all alleles - boolean trimVC; - - // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles - Allele refAllele = inputVC.getReference(); - if (!inputVC.isVariant()) - trimVC = false; - else if (refAllele.isNull()) - trimVC = false; - else { - trimVC = (computeForwardClipping(new 
ArrayList(inputVC.getAlternateAlleles()), - inputVC.getReference().getDisplayString()) > 0); - } - - // nothing to do if we don't need to trim bases - if (trimVC) { - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - // set the reference base for indels in the attributes - Map attributes = new TreeMap(inputVC.getAttributes()); - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(inputVC.getReference().getBases()[0])); - - Map originalToTrimmedAlleleMap = new HashMap(); - - for (Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); - Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // detect case where we're trimming bases but resulting vc doesn't have any null allele. 
In that case, we keep original representation - // example: mixed records such as {TA*,TGA,TG} - boolean hasNullAlleles = false; - - for (Allele a: originalToTrimmedAlleleMap.values()) { - if (a.isNull()) - hasNullAlleles = true; - if (a.isReference()) - refAllele = a; - } - - if (!hasNullAlleles) - return inputVC; - // now we can recreate new genotypes with trimmed alleles - for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { - - List originalAlleles = sample.getValue().getAlleles(); - List trimmedAlleles = new ArrayList(); - for ( Allele a : originalAlleles ) { - if ( a.isCalled() ) - trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); - else - trimmedAlleles.add(Allele.NO_CALL); - } - genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); - - } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes); - + public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) { + try { + return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || + isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); + } catch ( FileNotFoundException e ) { + return false; + } catch ( IOException e ) { + return false; } + } - return inputVC; + private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { + try { + byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; + stream.read(buff, 0, MAGIC_HEADER_LINE.length()); + String firstLine = new String(buff); + stream.close(); + return firstLine.startsWith(MAGIC_HEADER_LINE); + } catch ( IOException e ) { + return false; + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java 
b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java index 311aaecf7..c299511db 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java @@ -105,9 +105,8 @@ public abstract class SortingVCFWriterBase implements VCFWriter { * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base */ - public void add(VariantContext vc, byte refBase) { + public void add(VariantContext vc) { /* Note that the code below does not prevent the successive add()-ing of: (chr1, 10), (chr20, 200), (chr15, 100) since there is no implicit ordering of chromosomes: */ @@ -122,7 +121,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { noteCurrentRecord(vc); // possibly overwritten - queue.add(new VCFRecord(vc, refBase)); + queue.add(new VCFRecord(vc)); emitSafeRecords(); } @@ -133,7 +132,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { // No need to wait, waiting for nothing, or before what we're waiting for: if (emitUnsafe || mostUpstreamWritableLoc == null || firstRec.vc.getStart() <= mostUpstreamWritableLoc) { queue.poll(); - innerWriter.add(firstRec.vc, firstRec.refBase); + innerWriter.add(firstRec.vc); } else { break; @@ -143,7 +142,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { /** * Gets a string representation of this object. 
- * @return + * @return a string representation of this object */ @Override public String toString() { @@ -158,11 +157,9 @@ public abstract class SortingVCFWriterBase implements VCFWriter { private static class VCFRecord { public VariantContext vc; - public byte refBase; - public VCFRecord(VariantContext vc, byte refBase) { + public VCFRecord(VariantContext vc) { this.vc = vc; - this.refBase = refBase; } } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index b7f4be39a..d3705813c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -202,20 +202,18 @@ public class StandardVCFWriter implements VCFWriter { * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base used for indels */ - public void add(VariantContext vc, byte refBase) { - add(vc, refBase, false); + public void add(VariantContext vc) { + add(vc, false); } /** * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base used for indels * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD) */ - public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { + public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString()); @@ -223,7 +221,7 @@ public class StandardVCFWriter implements VCFWriter { vc = VariantContext.modifyGenotypes(vc, null); try { - vc = 
VariantContext.createVariantContextWithPaddedAlleles(vc, refBase, refBaseShouldBeAppliedToEndOfAlleles); + vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles); // if we are doing on the fly indexing, add the record ***before*** we write any bytes if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition()); @@ -285,7 +283,7 @@ public class StandardVCFWriter implements VCFWriter { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) + if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) continue; String outputValue = formatVCFField(field.getValue()); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index c29f2ba8b..e5b1a2de5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -7,15 +7,31 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; +import java.io.FileReader; import java.io.IOException; import java.util.*; /** - * a feature codec for the VCF 3 specification. Our aim is to read in the records and convert to VariantContext as - * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + * A feature codec for the VCF3 specification, to read older VCF files. 
VCF3 has been + * deprecated in favor of VCF4 (See VCF codec for the latest information) + * + *

    + * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example) + *

    + * + *

    + * See also: @see VCF specification
    + * See also: @see VCF spec. publication + *

    + * + * @author Mark DePristo + * @since 2010 */ public class VCF3Codec extends AbstractVCFCodec { + public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; + /** * @param reader the line reader to take header lines from @@ -178,4 +194,8 @@ public class VCF3Codec extends AbstractVCFCodec { return genotypes; } + @Override + public boolean canDecode(final File potentialInput) { + return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 05fff5d9e..fa030ef5f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -7,15 +7,52 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; +import java.io.FileReader; import java.io.IOException; import java.util.*; - /** - * a feature codec for the VCF 4 specification. Our aim is to read in the records and convert to VariantContext as - * quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + * A feature codec for the VCF 4 specification + * + *

    + * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a + * header line, and then data lines each containing information about a position in the genome. + *

    + *

    One of the main uses of next-generation sequencing is to discover variation amongst large populations + * of related samples. Recently the format for storing next-generation read alignments has been + * standardised by the SAM/BAM file format specification. This has significantly improved the + * interoperability of next-generation tools for alignment, visualisation, and variant calling. + * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent + * types of sequence variation, including SNPs, indels and larger structural variants, together + * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for + * fast data retrieval of variants from a range of positions on the reference genome. + * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects + * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements + * various utilities for processing VCF files, including validation, merging and comparing, + * and also provides a general Perl and Python API. + * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.

    + * + *

    + * See also: @see VCF specification
    + * See also: @see VCF spec. publication + *

    + * + *

    File format example

    + *
    + *     ##fileformat=VCFv4.0
    + *     #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
    + *     chr1    109     .       A       T       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:610,327:308:-316.30,-95.47,-803.03:99
    + *     chr1    147     .       C       A       0       PASS  AC=1    GT:AD:DP:GL:GQ  0/1:294,49:118:-57.87,-34.96,-338.46:99
    + * 
    + * + * @author Mark DePristo + * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + + public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /** * @param reader the line reader to take header lines from @@ -184,5 +221,8 @@ public class VCFCodec extends AbstractVCFCodec { return genotypes; } - + @Override + public boolean canDecode(final File potentialInput) { + return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index eb01e5dca..fd1c74993 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -24,6 +24,7 @@ public class VCFHeader { private final Set mMetaData; private final Map mInfoMetaData = new HashMap(); private final Map mFormatMetaData = new HashMap(); + private final Map mOtherMetaData = new HashMap(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -110,6 +111,9 @@ public class VCFHeader { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; mFormatMetaData.put(formatLine.getName(), formatLine); } + else { + mOtherMetaData.put(line.getKey(), line); + } } } @@ -185,6 +189,14 @@ public class VCFHeader { public VCFFormatHeaderLine getFormatHeaderLine(String key) { return mFormatMetaData.get(key); } + + /** + * @param key the header key name + * @return the meta data line, or null if there is none + */ + public VCFHeaderLine getOtherHeaderLine(String key) { + return mOtherMetaData.get(key); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java 
b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index f43891e77..2d8421507 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -116,10 +116,26 @@ public class VCFUtils { return fields; } - + /** Only displays a warning if a logger is provided and an identical warning hasn't been already issued */ + private static final class HeaderConflictWarner { + Logger logger; + Set alreadyIssued = new HashSet(); + + private HeaderConflictWarner(final Logger logger) { + this.logger = logger; + } + + public void warn(final VCFHeaderLine line, final String msg) { + if ( logger != null && ! alreadyIssued.contains(line.getKey()) ) { + alreadyIssued.add(line.getKey()); + logger.warn(msg); + } + } + } public static Set smartMergeHeaders(Collection headers, Logger logger) throws IllegalStateException { HashMap map = new HashMap(); // from KEY.NAME -> line + HeaderConflictWarner conflictWarner = new HeaderConflictWarner(logger); // todo -- needs to remove all version headers from sources and add its own VCF version line for ( VCFHeader source : headers ) { @@ -152,24 +168,24 @@ public class VCFUtils { // number, then this value should be 1. However, if the INFO field describes a pair // of numbers, then this value should be 2 and so on. If the number of possible // values varies, is unknown, or is unbounded, then this value should be '.'. - if ( logger != null ) logger.warn("Promoting header field Number to . due to number differences in header lines: " + line + " " + other); + conflictWarner.warn(line, "Promoting header field Number to . 
due to number differences in header lines: " + line + " " + other); compOther.setNumberToUnbounded(); } else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) { // promote key to Float - if ( logger != null ) logger.warn("Promoting Integer to Float in header: " + compOther); + conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); map.put(key, compOther); } else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) { // promote key to Float - if ( logger != null ) logger.warn("Promoting Integer to Float in header: " + compOther); + conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); } else { throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); } } if ( ! compLine.getDescription().equals(compOther) ) - if ( logger != null ) logger.warn("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); + conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); } else { // we are not equal, but we're not anything special either - if ( logger != null ) logger.warn("Ignoring header line already in map: this header line = " + line + " already present header = " + other); + conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); } } else { map.put(key, line); @@ -180,4 +196,19 @@ public class VCFUtils { return new HashSet(map.values()); } + + public static String rsIDOfFirstRealVariant(List VCs, VariantContext.Type type) { + if ( VCs == null ) + return null; + + String rsID = null; + for ( VariantContext vc : VCs ) { + if ( vc.getType() == type ) { + rsID = vc.getID(); + break; + } + } + + return rsID; + } } \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java index 0d23fe455..55749d26e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java @@ -14,5 +14,5 @@ public interface VCFWriter { */ public void close(); - public void add(VariantContext vc, byte refBase); + public void add(VariantContext vc); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 7eab6f6c9..274c64f42 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -43,6 +44,9 @@ import java.util.Arrays; * Date: Sep 3, 2010 * Time: 2:24:09 PM */ +@DocumentedGATKFeature( + groupName = "User exceptions", + summary = "Exceptions caused by incorrect user behavior, such as bad files, bad arguments, etc." 
) public class UserException extends ReviewedStingException { public UserException(String msg) { super(msg); } public UserException(String msg, Throwable e) { super(msg, e); } @@ -83,6 +87,13 @@ public class UserException extends ReviewedStingException { } } + public static class UnknownTribbleType extends CommandLineException { + public UnknownTribbleType(String type, String message) { + super(String.format("Unknown tribble type %s: %s", type, message)); + } + } + + public static class BadTmpDir extends UserException { public BadTmpDir(String message) { super(String.format("Failure working with the tmp directory %s. Override with -Djava.io.tmpdir=X on the command line to a bigger/better file system. Exact error was %s", System.getProperties().get("java.io.tmpdir"), message)); @@ -159,7 +170,7 @@ public class UserException extends ReviewedStingException { } public MalformedVCF(String message, int lineNo) { - super(String.format("The provided VCF file is malformed at line nmber %d: %s", lineNo, message)); + super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message)); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java deleted file mode 100644 index 65c332048..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.tools.doclets.Taglet; - -import java.util.Map; - -/** - * Provide an alternate description for the given help system. - * - * @author mhanna - * @version 0.1 - */ -public class DescriptionTaglet extends HelpTaglet { - /** - * The key tag for this taglet. - */ - public static final String NAME = "help.description"; - - /** - * Return the name of this custom tag. 
- */ - @Override - public String getName() { - return NAME; - } - - /** - * Will return false since overviews are always named - * by the @WalkerName tag. - * @return false always - */ - @Override - public boolean inOverview() { - return true; - } - - /** - * Will return true to indicate that packages can be given useful - * description. - * @return true always - */ - @Override - public boolean inPackage() { - return true; - } - - /** - * Register this Taglet. - * @param tagletMap the map to register this tag to. - */ - public static void register(Map tagletMap) { - DescriptionTaglet tag = new DescriptionTaglet(); - Taglet t = (Taglet)tagletMap.get(tag.getName()); - if (t != null) { - tagletMap.remove(tag.getName()); - } - tagletMap.put(tag.getName(), tag); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java deleted file mode 100644 index 6c6dad736..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.tools.doclets.Taglet; - -import java.util.Map; - -/** - * Provide a display name in the help for packages - * - * @author mhanna - * @version 0.1 - */ -public class DisplayNameTaglet extends HelpTaglet { - /** - * The display name for this taglet. - */ - public static final String NAME = "help.display.name"; - - /** - * Return the name of this custom tag. - */ - @Override - public String getName() { - return NAME; - } - - /** - * Will return true to indicate that packages can be given useful - * display text. - * @return true always - */ - @Override - public boolean inPackage() { - return true; - } - - /** - * Register this Taglet. - * @param tagletMap the map to register this tag to. 
- */ - public static void register(Map tagletMap) { - DisplayNameTaglet tag = new DisplayNameTaglet(); - Taglet t = (Taglet)tagletMap.get(tag.getName()); - if (t != null) { - tagletMap.remove(tag.getName()); - } - tagletMap.put(tag.getName(), tag); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java new file mode 100644 index 000000000..5bbe3f91e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.help; + +import java.lang.annotation.*; + +/** + * An annotation to identify a class as a GATK capability for documentation + * + * @author depristo + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DocumentedGATKFeature { + /** Should we actually document this feature, even though it's annotated? */ + public boolean enable() default true; + /** The overall group name (walkers, readfilters) this feature is associated with */ + public String groupName(); + /** A human readable summary of the purpose of this group of features */ + public String summary() default ""; + /** Are there links to other docs that we should include? CommandLineGATK.class for walkers, for example? */ + public Class[] extraDocs() default {}; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java new file mode 100644 index 000000000..87926d2e3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; + +import java.io.*; +import java.util.Set; + +/** + * Extend this class to provide a documentation handler for GATKdocs + */ +public abstract class DocumentedGATKFeatureHandler { + private GATKDoclet doclet; + + /** + * @return the javadoc RootDoc of this javadoc run + */ + protected RootDoc getRootDoc() { + return this.doclet.rootDoc; + } + + /** Set the master doclet driving this handler */ + public void setDoclet(GATKDoclet doclet) { + this.doclet = doclet; + } + + /** + * @return the GATKDoclet driving this documentation run + */ + public GATKDoclet getDoclet() { + return doclet; + } + + /** + * Should return false iff this handler wants GATKDoclet to skip documenting + * this ClassDoc. + * @param doc that is being considered for inclusion in the docs + * @return true if the doclet should document ClassDoc doc + */ + public boolean includeInDocs(ClassDoc doc) { return true; } + + /** + * Return the flat filename (no paths) that the handler would like the Doclet to + * write out the documentation for ClassDoc doc and its associated Class clazz + * @param doc the javadoc ClassDoc being documented + * @param clazz the Java class associated with doc + * @return the html filename for clazz, as computed by GATKDocUtils.htmlFilenameForClass + */ + public String getDestinationFilename(ClassDoc doc, Class clazz) { + return GATKDocUtils.htmlFilenameForClass(clazz); + } + + /** + * Return the name of the FreeMarker template we will use to process ClassDoc doc.
+ * + * Note this is a flat filename relative to settings/helpTemplates in the GATK source tree + * @param doc the ClassDoc that will be processed with the template + * @return the flat filename of the FreeMarker template + * @throws IOException + */ + public abstract String getTemplateName(ClassDoc doc) throws IOException; + + /** + * Actually generate the documentation map associated with toProcess + * + * Can use all to provide references and rootDoc for additional information, if necessary. + * Implementing methods should end with a call to setHandlerContent on toProcess, as in: + * + * toProcess.setHandlerContent(summary, rootMap); + * + * @param toProcess the work unit to generate documentation for + */ + public abstract void processOne(GATKDocWorkUnit toProcess); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java new file mode 100644 index 000000000..6c8b0a475 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureObject.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +/** + * Documentation unit. Effectively a class version of the DocumentedGATKFeature. + * Immutable data structure, apart from the extraDocs array, which is stored and returned without copying. + * + * @author depristo + */ +class DocumentedGATKFeatureObject { + /** Which class are we documenting. Specific to each class being documented */ + private final Class classToDoc; + /** Should this feature actually be documented? */ + private final boolean enable; + private final String groupName, summary; + private final Class[] extraDocs; + + public DocumentedGATKFeatureObject(Class classToDoc, final boolean enable, final String groupName, final String summary, final Class[] extraDocs) { + this.classToDoc = classToDoc; + this.enable = enable; + this.groupName = groupName; + this.summary = summary; + this.extraDocs = extraDocs; + } + + public DocumentedGATKFeatureObject(Class classToDoc, final String groupName, final String summary) { + this(classToDoc, true, groupName, summary, new Class[]{}); + } + + public Class getClassToDoc() { return classToDoc; } + public boolean enable() { return enable; } + public String groupName() { return groupName; } + public String summary() { return summary; } + public Class[] extraDocs() { return extraDocs; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java new file mode 100644 index 000000000..cd645943b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the
"Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +public class GATKDocUtils { + /** The URL root for RELEASED GATKDOC units */ + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gsa/gatkdocs/release/"; + /** The URL root for STABLE GATKDOC units */ + public final static String URL_ROOT_FOR_STABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/stable/"; + /** The URL root for UNSTABLE GATKDOC units */ + public final static String URL_ROOT_FOR_UNSTABLE_GATKDOCS = "http://iwww.broadinstitute.org/gsa/gatkdocs/unstable/"; + + /** + * Return the filename of the GATKDoc HTML that would be generated for Class. This + * does not guarantee that the docs exist, or that docs would actually be generated + * for class (might not be annotated for documentation, for example). But if + * this class is documented, GATKDocs will write the docs to a file named as returned + * by this function.
+ * + * @param c the class whose documentation filename is wanted + * @return the fully qualified name of c with every "." replaced by "_", followed by ".html" + */ + public static String htmlFilenameForClass(Class c) { + return c.getName().replace(".", "_") + ".html"; + } + + /** + * Returns a full URL http://etc/ linking to the documentation for class (assuming it + * exists). Currently points to the RELEASE doc path only. + * @param c the class whose documentation we want to link to + * @return URL_ROOT_FOR_RELEASE_GATKDOCS followed by the html filename for c + */ + public static String helpLinksToGATKDocs(Class c) { + String classPath = htmlFilenameForClass(c); + StringBuilder b = new StringBuilder(); + b.append(URL_ROOT_FOR_RELEASE_GATKDOCS).append(classPath); + //b.append("stable version: ").append(URL_ROOT_FOR_STABLE_GATKDOCS).append(classPath).append("\n"); + //b.append("unstable version: ").append(URL_ROOT_FOR_UNSTABLE_GATKDOCS).append(classPath).append("\n"); + return b.toString(); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java new file mode 100644 index 000000000..41c855329 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; + +import java.util.HashMap; +import java.util.Map; + +/** + * Simple collection of all relevant information about something the GATKDoclet can document + * + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/24/11 + * Time: 7:59 PM + */ +class GATKDocWorkUnit implements Comparable { + /** The class that's being documented */ + final Class clazz; + /** The name of the thing we are documenting */ + final String name; + /** the filename where we will be writing the docs for this class */ + final String filename; + /** The name of the documentation group (e.g., walkers, read filters) class belongs to */ + final String group; + /** The documentation handler for this class */ + final DocumentedGATKFeatureHandler handler; + /** The javadoc documentation for clazz */ + final ClassDoc classDoc; + /** The annotation that led to this Class being in GATKDoc */ + final DocumentedGATKFeatureObject annotation; + /** When was this walker built, and what's the absolute version number */ + final String buildTimestamp, absoluteVersion; + + // set by the handler + String summary; + Map forTemplate; + + public GATKDocWorkUnit(String name, String filename, String group, + DocumentedGATKFeatureObject annotation, DocumentedGATKFeatureHandler handler, + ClassDoc classDoc, Class clazz, + String buildTimestamp, String absoluteVersion) { + this.annotation = annotation; + this.name = name; + this.filename = filename; + this.group = group; + this.handler = handler; + this.classDoc = classDoc; + this.clazz = clazz; + this.buildTimestamp = buildTimestamp; + this.absoluteVersion = absoluteVersion; + } + + /** + * Called by the
GATKDoclet to set handler provided context for this work unit + * @param summary + * @param forTemplate + */ + public void setHandlerContent(String summary, Map forTemplate) { + this.summary = summary; + this.forTemplate = forTemplate; + } + + /** + * Return a String -> String map suitable for FreeMarker to create an index to this WorkUnit + * @return a new map holding the name, summary, filename, and group of this work unit + */ + public Map indexDataMap() { + Map data = new HashMap(); + data.put("name", name); + data.put("summary", summary); + data.put("filename", filename); + data.put("group", group); + return data; + } + + /** + * Sort in order of the name of this WorkUnit + * @param other the work unit to compare this one against + * @return the result of comparing the name of this unit with the name of other + */ + public int compareTo(GATKDocWorkUnit other) { + return this.name.compareTo(other.name); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java new file mode 100644 index 000000000..7f26f22f5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; +import freemarker.template.Configuration; +import freemarker.template.DefaultObjectWrapper; +import freemarker.template.Template; +import freemarker.template.TemplateException; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.walkers.qc.DocumentationTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.*; +import java.util.*; + +/** + * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker + * templates to produce html formatted GATKDocs for walkers + * and other classes. + * + * This document has the following workflow: + * + * 1 -- walk the javadoc hierarchy, looking for classes that have the + * DocumentedGATKFeature annotation or are in the type hierarchy in the + * static list of things to document, and are to be documented + * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete + * set of things to document + * 3 -- for each unit, actually generate an html page documenting it + * as well as links to related features via their units. Writing + * of a specific class HTML is accomplished by a generate DocumentationHandler + * 4 -- write out an index of all units, organized by group + * + * The documented classes are restricted to only those with @DocumentedGATKFeature + * annotation or are in the STATIC_DOCS list.
+ */ +public class GATKDoclet { + final protected static Logger logger = Logger.getLogger(GATKDoclet.class); + + /** Where we find the help FreeMarker templates */ + final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); + + /** Where we write the GATKDoc html directory */ + final protected static File DESTINATION_DIR = new File("gatkdocs"); + + // ---------------------------------------------------------------------- + // + // Global variables that are set on the command line by javadoc + // + // ---------------------------------------------------------------------- + protected static String buildTimestamp = null, absoluteVersion = null; + protected static boolean showHiddenFeatures = false; + + protected static boolean testOnly = false; + + /** + * Any class that's in this list will be included in the documentation + * when the -test argument is provided. Useful for debugging. + */ + private static final List> testOnlyKeepers = Arrays.asList( + DocumentationTest.class, CommandLineGATK.class, UserException.class); + + /** The javadoc root doc */ + RootDoc rootDoc; + + /** The set of all things we are going to document */ + Set myWorkUnits; + + /** + * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends + * one of the DocumentedGATKFeatureObjects.clazz of this collection will also + * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. Useful + * when you want to document things that implement an interface (annotations on java + * interfaces aren't inherited) or whose base class isn't under your control (tribble + * codecs). + */ + final static Collection STATIC_DOCS = new ArrayList(); + static { + STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, + "Reference ordered data (ROD) codecs", + "Tribble codecs for reading reference ordered data such as VCF or BED files")); + } + + + /** + * Doclet entry point: reads the doclet options and writes the GATKDocs html pages.
+ * @param rootDoc The documentation root. + * @return Whether the JavaDoc run succeeded. + * @throws java.io.IOException if output can't be written. + */ + public static boolean start(RootDoc rootDoc) throws IOException { + logger.setLevel(Level.INFO); + + // load arguments + for(String[] options: rootDoc.options()) { + if(options[0].equals("-build-timestamp")) + buildTimestamp = options[1]; + if (options[0].equals("-absolute-version")) + absoluteVersion = options[1]; + if (options[0].equals("-include-hidden")) + showHiddenFeatures = true; + if (options[0].equals("-test")) + testOnly = true; + } + + // process the docs + new GATKDoclet().processDocs(rootDoc); + + return true; + } + + /** + * Validate the given options against options supported by this doclet. + * @param option Option to validate. + * @return Number of command-line tokens the option consumes, including the option itself; 0 if not supported. + */ + public static int optionLength(String option) { + if(option.equals("-build-timestamp") || + option.equals("-absolute-version") || + option.equals("-include-hidden")) { + return 2; + } else if ( option.equals("-test") ) + return 1; + else + return 0; + } + + /** + * Are we supposed to include @Hidden annotations in our documented output? + * @return true if hidden features should be included in the docs + */ + public boolean showHiddenFeatures() { + return showHiddenFeatures; + } + + /** + * Generates the page for every work unit and then the index page + * @param rootDoc the javadoc root doc of this run + */ + private void processDocs(RootDoc rootDoc) { + // setup the global access to the root + this.rootDoc = rootDoc; + + try { + // basic setup + DESTINATION_DIR.mkdirs(); + FileUtils.copyFile(new File(SETTINGS_DIR + "/style.css"), new File(DESTINATION_DIR + "/style.css")); + + /* ------------------------------------------------------------------- */ + /* You should do this ONLY ONCE in the whole application life-cycle: */ + + Configuration cfg = new Configuration(); + // Specify the data source where the template files come from. + cfg.setDirectoryForTemplateLoading(SETTINGS_DIR); + // Specify how templates will see the data-model.
This is an advanced topic... + cfg.setObjectWrapper(new DefaultObjectWrapper()); + + myWorkUnits = computeWorkUnits(); + for ( GATKDocWorkUnit workUnit : myWorkUnits ) { + processDocWorkUnit(cfg, workUnit); + } + + processIndex(cfg, new ArrayList(myWorkUnits)); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + /** + * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. + * @return the work units, held in a TreeSet and so ordered by GATKDocWorkUnit.compareTo + */ + private Set computeWorkUnits() { + TreeSet m = new TreeSet(); + + for ( ClassDoc doc : rootDoc.classes() ) { + //logger.debug("Considering " + doc); + Class clazz = getClassForClassDoc(doc); + + // don't add anything that's not DocumentationTest if we are in test mode + if ( clazz != null && testOnly && ! testOnlyKeepers.contains(clazz) ) + continue; + + //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) + // logger.debug("foo"); + + DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); + DocumentedGATKFeatureHandler handler = createHandler(doc, feature); + if ( handler != null && handler.includeInDocs(doc) ) { + logger.info("Generating documentation for class " + doc); + String filename = handler.getDestinationFilename(doc, clazz); + GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), + filename, feature.groupName(), + feature, handler, doc, clazz, + buildTimestamp, absoluteVersion); + m.add(unit); + } + } + + return m; + } + + /** + * Create a handler capable of documenting the class doc according to feature. Returns + * null if no appropriate handler is found or doc shouldn't be documented at all.
+ * @param doc the class being considered for documentation + * @param feature the feature description for doc, or null if it has none + * @return a GenericDocumentationHandler bound to this doclet, or null if feature is null or disabled + */ + private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { + if ( feature != null ) { + if ( feature.enable() ) { + DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); + handler.setDoclet(this); + return handler; + } else { + logger.info("Skipping disabled Documentation for " + doc); + } + } + + return null; + } + + /** + * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc + * structure we will apply to Doc. + * + * @param doc the class being considered for documentation + * @return null if this proves inappropriate or doc shouldn't be documented + */ + private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { + Class docClass = getClassForClassDoc(doc); + + if ( docClass == null ) + return null; // the class couldn't be loaded, so it can't be documented + + if ( docClass.isAnnotationPresent(DocumentedGATKFeature.class) ) { + DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); + return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs()); + } else { + for ( DocumentedGATKFeatureObject staticDocs : STATIC_DOCS ) { + if ( staticDocs.getClassToDoc().isAssignableFrom(docClass) ) { + return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs()); + } + } + return null; + } + } + + /** + * Return the Java class described by the ClassDoc doc + * @param doc the javadoc description of the class + * @return the class, or null if it could not be loaded + */ + private Class getClassForClassDoc(ClassDoc doc) { + try { + // todo -- what do I need the ? extends Object to pass the compiler? + return (Class)HelpUtils.getClassForDoc(doc); + } catch ( ClassNotFoundException e) { + //logger.warn("Couldn't find class for ClassDoc " + doc); + // we got a classdoc for a class we can't find.
Maybe in a library or something + return null; + } catch ( NoClassDefFoundError e ) { + return null; + } catch ( UnsatisfiedLinkError e) { + return null; // naughty BWA bindings + } + } + + /** + * Create the html index listing all of the GATKDocs features + * @param cfg the FreeMarker configuration used to load the index template + * @param indexData the work units to list in the index + * @throws IOException + */ + private void processIndex(Configuration cfg, List indexData) throws IOException { + /* Get or create a template */ + Template temp = cfg.getTemplate("generic.index.template.html"); + + /* Merge data-model with template */ + Writer out = new OutputStreamWriter(new FileOutputStream(new File(DESTINATION_DIR + "/index.html"))); + try { + temp.process(groupIndexData(indexData), out); + out.flush(); + } catch ( TemplateException e ) { + throw new ReviewedStingException("Failed to create GATK documentation", e); + } finally { out.close(); /* always release the index.html file handle */ } + } + + /** + * Helpful function to create the html index. Given all of the already run GATKDocWorkUnits, + * create the high-level grouping data listing individual features by group. + * @param indexData the work units to index; sorted in place + * @return the root data model with the keys data, groups, timestamp, and version + */ + private Map groupIndexData(List indexData) { + // + // root -> data -> { summary -> y, filename -> z }, etc + // -> groups -> group1, group2, etc. + Map root = new HashMap(); + + Collections.sort(indexData); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for ( GATKDocWorkUnit workUnit : indexData ) { + data.add(workUnit.indexDataMap()); + if ( !
seenDocumentationFeatures.contains(workUnit.annotation.groupName()) ) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + root.put("data", data); + root.put("groups", groups); + root.put("timestamp", buildTimestamp); + root.put("version", absoluteVersion); + + return root; + } + + /** + * Trivial helper routine that returns the map of name and summary given the annotation + * @param annotation the feature whose group name and summary are wanted + * @return a new map with the keys name and summary + */ + private static final Map toMap(DocumentedGATKFeatureObject annotation) { + Map root = new HashMap(); + root.put("name", annotation.groupName()); + root.put("summary", annotation.summary()); + return root; + } + + /** + * Helper function that finds the GATKDocWorkUnit associated with class from among all of the work units + * @param c the class we are looking for + * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found + */ + public final GATKDocWorkUnit findWorkUnitForClass(Class c) { + for ( final GATKDocWorkUnit unit : this.myWorkUnits ) + if ( unit.clazz.equals(c) ) + return unit; + return null; + } + + /** + * Return the ClassDoc associated with clazz + * @param clazz the class to look up by name in the root doc + * @return the result of rootDoc.classNamed for the name of clazz + */ + public ClassDoc getClassDocForClass(Class clazz) { + return rootDoc.classNamed(clazz.getName()); + } + + /** + * High-level function that processes a single DocWorkUnit unit using its handler + * + * @param cfg the FreeMarker configuration used to load the template of the unit + * @param unit the work unit to process and write out + * @throws IOException + */ + private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit) + throws IOException { + //System.out.printf("Processing documentation for class %s%n", unit.classDoc); + + unit.handler.processOne(unit); + + // Get or create a template + Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); + + // Merge data-model with template + File outputPath = new File(DESTINATION_DIR + "/" + unit.filename); + Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); + try { +
temp.process(unit.forTemplate, out); + out.flush(); + } catch ( TemplateException e ) { + throw new ReviewedStingException("Failed to create GATK documentation", e); + } finally { out.close(); /* always release the output file handle */ } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java new file mode 100644 index 000000000..4f1e95499 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +package org.broadinstitute.sting.utils.help; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.RootDoc; +import com.sun.javadoc.Tag; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broad.tribble.bed.FullBEDFeature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.io.*; +import java.lang.reflect.*; +import java.util.*; + +/** + * + */ +public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { + private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); + + /** + * The max. length of the longest of --fullName -shortName argument name + * before we prefer the shorter option. 
+ */ + private static final int MAX_DISPLAY_NAME = 30; + + /** The Class we are documenting */ + private GATKDocWorkUnit toProcess; + + @Override + public boolean includeInDocs(ClassDoc doc) { + try { + Class type = HelpUtils.getClassForDoc(doc); + return JVMUtils.isConcrete(type); + } catch ( ClassNotFoundException e ) { + return false; + } + } + + + @Override + public String getTemplateName(ClassDoc doc) throws IOException { + return "generic.template.html"; + } + + @Override + public void processOne(GATKDocWorkUnit toProcessArg) { + this.toProcess = toProcessArg; + + //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); + Map root = new HashMap(); + + addHighLevelBindings(root); + addArgumentBindings(root); + addRelatedBindings(root); + + toProcess.setHandlerContent((String)root.get("summary"), root); + } + + /** + * Add high-level summary information about toProcess to root, such as its + * name, summary, description, version, etc. + * + * @param root + */ + protected void addHighLevelBindings(Map root) { + root.put("name", toProcess.classDoc.name()); + + // Extract overrides from the doc tags. 
+ StringBuilder summaryBuilder = new StringBuilder(); + for(Tag tag: toProcess.classDoc.firstSentenceTags()) + summaryBuilder.append(tag.text()); + root.put("summary", summaryBuilder.toString()); + root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); + root.put("timestamp", toProcess.buildTimestamp); + root.put("version", toProcess.absoluteVersion); + + for(Tag tag: toProcess.classDoc.tags()) { + root.put(tag.name(), tag.text()); + } + } + + /** + * Add bindings describing related GATK capabilites to toProcess + * @param root + */ + protected void addRelatedBindings(Map root) { + List> extraDocsData = new ArrayList>(); + + // add in all of the explicitly related items + for ( final Class extraDocClass : toProcess.annotation.extraDocs() ) { + final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); + if ( otherUnit == null ) + throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); + extraDocsData.add( + new HashMap(){{ + put("filename", otherUnit.filename); + put("name", otherUnit.name);}}); + + } + root.put("extradocs", extraDocsData); + } + + /** + * Add information about all of the arguments available to toProcess to root + * + * @param root + */ + protected void addArgumentBindings(Map root) { + ParsingEngine parsingEngine = createStandardGATKParsingEngine(); + + Map>> args = createArgumentMap(); + root.put("arguments", args); + try { + // loop over all of the arguments according to the parsing engine + for ( final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(HelpUtils.getClassForDoc(toProcess.classDoc)) ) { + // todo -- why can you have multiple ones? + ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); + FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); + Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); + if ( ! 
argumentSource.isHidden() || getDoclet().showHiddenFeatures() ) { + final String kind = docKindOfArg(argumentSource); + + final Object value = argumentValue(toProcess.clazz, argumentSource); + if ( value != null ) + argBindings.put("defaultValue", prettyPrintValueString(value)); + + args.get(kind).add(argBindings); + args.get("all").add(argBindings); + } + } + + // sort the arguments + for (Map.Entry>> entry : args.entrySet()) { + entry.setValue(sortArguments(entry.getValue())); + } + } catch ( ClassNotFoundException e ) { + throw new RuntimeException(e); + } + } + + /** + * Return the argument kind (required, advanced, hidden, etc) of this argumentSource + * @param argumentSource + * @return + */ + @Requires("argumentSource != null") + @Ensures("result != null") + private String docKindOfArg(ArgumentSource argumentSource) { + if ( argumentSource.isRequired() ) return "required"; + else if ( argumentSource.isAdvanced() ) return "advanced"; + else if ( argumentSource.isHidden() ) return "hidden"; + else if ( argumentSource.isDeprecated() ) return "depreciated"; + else return "optional"; + } + + /** + * Attempts to determine the value of argumentSource in an instantiated version of c + * @param c + * @param argumentSource + * @return value of argumentSource, or null if this isn't possible + */ + @Requires({"c != null", "argumentSource != null"}) + private Object argumentValue(Class c, ArgumentSource argumentSource) { + // get the value of the field + // attempt to instantiate the class + final Object instance = makeInstanceIfPossible(toProcess.clazz); + if ( instance != null ) { + final Object value = getFieldValue(instance, argumentSource.field.getName()); + if ( value != null ) + return value; + + if ( argumentSource.createsTypeDefault() ) { + try { // handle the case where there's an implicit default + return argumentSource.typeDefaultDocString(); + } catch (ReviewedStingException e) { + ; // failed to create type default, don't worry about it + } + } + } + + 
return null; + } + + /** + * Create the argument map for holding class arguments + * @return + */ + private Map>> createArgumentMap() { + Map>> args = new HashMap>>(); + args.put("all", new ArrayList>()); + args.put("required", new ArrayList>()); + args.put("optional", new ArrayList>()); + args.put("advanced", new ArrayList>()); + args.put("hidden", new ArrayList>()); + args.put("depreciated", new ArrayList>()); + return args; + } + + + /** + * Sorts the individual argument list in unsorted according to CompareArgumentsByName + * @param unsorted + * @return + */ + private List> sortArguments(List> unsorted) { + Collections.sort(unsorted, new CompareArgumentsByName()); + return unsorted; + } + + /** + * Sort arguments by case-insensitive comparison ignoring the -- and - prefixes + */ + private class CompareArgumentsByName implements Comparator> { + public int compare(Map x, Map y) { + return elt(x).compareTo(elt(y)); + } + + private String elt(Map m) { + String v = m.get("name").toString().toLowerCase(); + if ( v.startsWith("--") ) + return v.substring(2); + else if ( v.startsWith("-") ) + return v.substring(1); + else + throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); + } + } + + /** + * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in + * instance of class c. + * + * @param instance the object to query for the field value + * @param fieldName the name of the field we are looking for in instance + * @return The value assigned to field in the ArgumentCollection, otherwise null + */ + private Object getFieldValue(Object instance, String fieldName) { + // + // subtle note. If you have a field named X that is an ArgumentCollection that + // contains a field X as well, you need only consider fields in the argumentCollection, not + // matching the argument itself. 
+ // + // @ArgumentCollection + // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + // + for ( Field field : JVMUtils.getAllFields(instance.getClass()) ) { + if ( field.isAnnotationPresent(ArgumentCollection.class) ) { + //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); + Object fieldValue = JVMUtils.getFieldValue(field, instance); + Object value = getFieldValue(fieldValue, fieldName); + if ( value != null ) + return value; + } else if ( field.getName().equals(fieldName) ) { + return JVMUtils.getFieldValue(field, instance); + } + } + + return null; + } + + /** + * Pretty prints value + * + * Assumes value != null + * @param value + * @return + */ + private Object prettyPrintValueString(Object value) { + if ( value.getClass().isArray() ) { + Class type = value.getClass().getComponentType(); + if ( boolean.class.isAssignableFrom(type) ) + return Arrays.toString((boolean[])value); + if ( byte.class.isAssignableFrom(type) ) + return Arrays.toString((byte[])value); + if ( char.class.isAssignableFrom(type) ) + return Arrays.toString((char[])value); + if ( double.class.isAssignableFrom(type) ) + return Arrays.toString((double[])value); + if ( float.class.isAssignableFrom(type) ) + return Arrays.toString((float[])value); + if ( int.class.isAssignableFrom(type) ) + return Arrays.toString((int[])value); + if ( long.class.isAssignableFrom(type) ) + return Arrays.toString((long[])value); + if ( short.class.isAssignableFrom(type) ) + return Arrays.toString((short[])value); + if ( Object.class.isAssignableFrom(type) ) + return Arrays.toString((Object[])value); + else + throw new RuntimeException("Unexpected array type in prettyPrintValue. 
Value was " + value + " type is " + type); + } else if ( RodBinding.class.isAssignableFrom(value.getClass() ) ) + // annoying special case to handle the UnBound() constructor + return "none"; + + return value.toString(); + } + + /** + * Attempt to instantiate class c, if possible. Returns null if this proves impossible. + * @param c + * @return + */ + private Object makeInstanceIfPossible(Class c) { + Object instance = null; + try { + // don't try to make something where we will obviously fail + if (! c.isEnum() && ! c.isAnnotation() && ! c.isAnonymousClass() && + ! c.isArray() && ! c.isPrimitive() & JVMUtils.isConcrete(c) ) { + instance = c.newInstance(); + //System.out.printf("Created object of class %s => %s%n", c, instance); + return instance; + } else + return null; + } + catch (IllegalAccessException e ) { } + catch (InstantiationException e ) { } + catch (ExceptionInInitializerError e ) { } + catch (SecurityException e ) { } + // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions + // and rethrow then as RuntimeExceptions + catch (RuntimeException e) {} + + return instance; + } + + + /** + * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet + * @return + */ + private ParsingEngine createStandardGATKParsingEngine() { + CommandLineProgram clp = new CommandLineGATK(); + try { + CommandLineProgram.start(clp, new String[]{}, true); + return clp.parser; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Gets the javadocs associated with field name in classDoc. Throws a + * runtime exception if this proves impossible. 
+ * + * @param classDoc + * @param name + * @return + */ + private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { + return getFieldDoc(classDoc, name, true); + } + + /** + * Recursive helper routine to getFieldDoc() + * @param classDoc + * @param name + * @param primary + * @return + */ + private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { + //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); + for ( FieldDoc fieldDoc : classDoc.fields(false) ) { + //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); + if ( fieldDoc.name().equals(name) ) + return fieldDoc; + + Field field = HelpUtils.getFieldForFieldDoc(fieldDoc); + if ( field == null ) + throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); + if ( field.isAnnotationPresent(ArgumentCollection.class) ) { + ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); + if ( typeDoc == null ) + throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); + else { + FieldDoc result = getFieldDoc(typeDoc, name, false); + if ( result != null ) + return result; + // else keep searching + } + } + } + + // if we didn't find it here, wander up to the superclass to find the field + if ( classDoc.superclass() != null ) { + return getFieldDoc(classDoc.superclass(), name, false); + } + + if ( primary ) + throw new RuntimeException("No field found for expected field " + name); + else + return null; + } + + /** + * Returns a Pair of (main, synonym) names for argument with fullName s1 and + * shortName s2. The main is selected to be the longest of the two, provided + * it doesn't exceed MAX_DISPLAY_NAME, in which case the shorter is taken. 
+ * @param s1 + * @param s2 + * @return + */ + Pair displayNames(String s1, String s2) { + if ( s1 == null ) return new Pair(s2, null); + if ( s2 == null ) return new Pair(s1, null); + + String l = s1.length() > s2.length() ? s1 : s2; + String s = s1.length() > s2.length() ? s2 : s1; + + if ( l.length() > MAX_DISPLAY_NAME ) + return new Pair(s, l); + else + return new Pair(l, s); + } + + /** + * Returns a human readable string that describes the Type type of a GATK argument. + * + * This will include parameterized types, so that Set{T} shows up as Set(T) and not + * just Set in the docs. + * + * @param type + * @return + */ + protected String argumentTypeString(Type type) { + if (type instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)type; + List subs = new ArrayList(); + for (Type actualType: parameterizedType.getActualTypeArguments()) + subs.add(argumentTypeString(actualType)); + return argumentTypeString(((ParameterizedType)type).getRawType()) + "[" + Utils.join(",", subs) + "]"; + } else if (type instanceof GenericArrayType) { + return argumentTypeString(((GenericArrayType)type).getGenericComponentType()) + "[]"; + } else if (type instanceof WildcardType) { + throw new RuntimeException("We don't support wildcards in arguments: " + type); + } else if (type instanceof Class) { + return ((Class) type).getSimpleName(); + } else { + throw new StingException("Unknown type: " + type); + } + } + + /** + * Helper routine that returns the Feature.class required by a RodBinding, + * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if + * the Type doesn't fit either model. 
+ * @param type + * @return + */ + protected Class getFeatureTypeIfPossible(Type type) { + if ( type instanceof ParameterizedType) { + ParameterizedType paramType = (ParameterizedType)type; + if ( RodBinding.class.isAssignableFrom((Class)paramType.getRawType()) ) { + return (Class)JVMUtils.getParameterizedTypeClass(type); + } else { + for ( Type paramtype : paramType.getActualTypeArguments() ) { + Class x = getFeatureTypeIfPossible(paramtype); + if ( x != null ) + return x; + } + } + } + + return null; + } + + /** + * High-level entry point for creating a FreeMarker map describing the GATK argument + * source with definition def, with associated javadoc fieldDoc. + * @param fieldDoc + * @param source + * @param def + * @return a non-null Map binding argument keys with their values + */ + protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { + Map root = new HashMap(); + Pair names = displayNames("-" + def.shortName, "--" + def.fullName); + + root.put("name", names.getFirst() ); + + if ( names.getSecond() != null ) + root.put("synonyms", names.getSecond()); + + root.put("required", def.required ? "yes" : "no"); + + // type of the field + root.put("type", argumentTypeString(source.field.getGenericType())); + + Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); + if ( featureClass != null ) { + // deal with the allowable types + FeatureManager manager = new FeatureManager(); + List rodTypes = new ArrayList(); + for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass) ) { + rodTypes.add(String.format("%s", + GATKDocUtils.htmlFilenameForClass(descriptor.getCodecClass()), + descriptor.getName())); + } + + root.put("rodTypes", Utils.join(", ", rodTypes)); + } + + // summary and fulltext + root.put("summary", def.doc != null ? def.doc : ""); + root.put("fulltext", fieldDoc.commentText()); + + // What are our enum options? 
+ if ( def.validOptions != null ) + root.put("options", docForEnumArgument(source.field.getType())); + + // general attributes + List attributes = new ArrayList(); + if ( def.required ) attributes.add("required"); + if ( source.isDeprecated() ) attributes.add("depreciated"); + if ( attributes.size() > 0 ) + root.put("attributes", Utils.join(", ", attributes)); + + return root; + } + + /** + * Helper routine that provides a FreeMarker map for an enumClass, grabbing the + * values of the enum and their associated javadoc documentation. + * @param enumClass + * @return + */ + @Requires("enumClass.isEnum()") + private List> docForEnumArgument(Class enumClass) { + ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); + if ( doc == null ) // || ! doc.isEnum() ) + throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got instead: " + doc); + + List> bindings = new ArrayList>(); + for (final FieldDoc field : doc.fields(false) ) { + bindings.add( + new HashMap(){{ + put("name", field.name()); + put("summary", field.commentText());}}); + } + + return bindings; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java deleted file mode 100644 index b350b1a29..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.javadoc.Tag; -import com.sun.tools.doclets.Taglet; - -/** - * Basic functionality for the help taglet. - * - * @author mhanna - * @version 0.1 - */ -public abstract class HelpTaglet implements Taglet { - /** - * Return the name of this custom tag. - */ - public abstract String getName(); - - /** - * Will return false since this tag cannot be applied - * to a field. - * @return false since this tag cannot be applied to a field. 
- */ - public boolean inField() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to a constructor. - * @return false since by default, help tags cannot be applied to a constructor. - */ - public boolean inConstructor() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to a method. - * @return false since by default, this tag cannot be applied to a method. - */ - public boolean inMethod() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to an overview. - * @return false since by default, help tags cannot be applied to an overview. - */ - public boolean inOverview() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to a package. - * description. - * @return false since by default, help tags cannot be applied to a package. - */ - public boolean inPackage() { - return false; - } - - /** - * Will return false since help tags are by default not inline. - * @return false since help tags are by default not inline. - */ - public boolean inType() { - return false; - } - - /** - * Will return false since help tags are by default not inline. - * @return false since help tags are by default not inline. - */ - public boolean isInlineTag() { - return false; - } - - /** - * Create a string representation of this tag. Since this tag is only - * used by the help system, don't output any HTML. - */ - public String toString(Tag tag) { - return null; - } - - /** - * Create a string representation of this tag. Since this tag is only - * used by the help system, don't output any HTML. 
- */ - public String toString(Tag[] tags) { - return null; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java new file mode 100644 index 000000000..645ab34c1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.PackageDoc; +import com.sun.javadoc.ProgramElementDoc; +import org.broadinstitute.sting.utils.classloader.JVMUtils; + +import java.lang.reflect.Field; + +public class HelpUtils { + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { + try { + Class type = getClassForDoc(classDoc); + return lhsClass.isAssignableFrom(type) && (!requireConcrete || JVMUtils.isConcrete(type)); + } catch (Throwable t) { + // Ignore errors. + return false; + } + } + + protected static Class getClassForDoc(ProgramElementDoc doc) throws ClassNotFoundException { + return Class.forName(getClassName(doc)); + } + + protected static Field getFieldForFieldDoc(FieldDoc fieldDoc) { + try { + Class clazz = getClassForDoc(fieldDoc.containingClass()); + return JVMUtils.findField(clazz, fieldDoc.name()); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Reconstitute the class name from the given class JavaDoc object. + * + * @param doc the Javadoc model for the given class. + * @return The (string) class name of the given class. + */ + protected static String getClassName(ProgramElementDoc doc) { + PackageDoc containingPackage = doc.containingPackage(); + return containingPackage.name().length() > 0 ? 
+ String.format("%s.%s", containingPackage.name(), doc.name()) : + String.format("%s", doc.name()); + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java index 6ee12d42e..a28a7bcee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java @@ -28,14 +28,9 @@ package org.broadinstitute.sting.utils.help; import com.sun.javadoc.*; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.*; -import java.util.HashSet; -import java.util.Properties; -import java.util.Scanner; -import java.util.Set; +import java.util.*; /** * Extracts certain types of javadoc (specifically package and class descriptions) and makes them available @@ -48,17 +43,21 @@ public class ResourceBundleExtractorDoclet { /** * Taglet for the particular version number. */ - private static final String VERSION_TAGLET_NAME = "version"; + public static final String VERSION_TAGLET_NAME = "version"; + public static final String SUMMARY_TAGLET_NAME = "help.summary"; + public static final String DESCRIPTION_TAGLET_NAME = "help.description"; /** * Maintains a collection of resources in memory as they're accumulated. */ - private static final Properties resourceText = new Properties(); + protected final Properties resourceText = new Properties(); /** * Maintains a collection of classes that should really be documented. 
*/ - private static final Set undocumentedWalkers = new HashSet(); + protected final Set undocumentedWalkers = new HashSet(); + + protected String buildTimestamp = null, absoluteVersion = null; /** * Extracts the contents of certain types of javadoc and adds them to an XML file. @@ -67,26 +66,38 @@ public class ResourceBundleExtractorDoclet { * @throws IOException if output can't be written. */ public static boolean start(RootDoc rootDoc) throws IOException { + ResourceBundleExtractorDoclet doclet = new ResourceBundleExtractorDoclet(); + PrintStream out = doclet.loadData(rootDoc, true); + doclet.processDocs(rootDoc, out); + return true; + } + + protected PrintStream loadData(RootDoc rootDoc, boolean overwriteResourcesFile) { PrintStream out = System.out; - String buildTimestamp = null, versionPrefix = null, versionSuffix = null, absoluteVersion = null; for(String[] options: rootDoc.options()) { if(options[0].equals("-out")) { - loadExistingResourceFile(options[1], rootDoc); - out = new PrintStream(options[1]); + try { + loadExistingResourceFile(options[1], rootDoc); + if ( overwriteResourcesFile ) + out = new PrintStream(options[1]); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } catch ( IOException e ) { + throw new RuntimeException(e); + } } if(options[0].equals("-build-timestamp")) buildTimestamp = options[1]; - if(options[0].equals("-version-prefix")) - versionPrefix = options[1]; - if(options[0].equals("-version-suffix")) - versionSuffix = options[1]; if (options[0].equals("-absolute-version")) absoluteVersion = options[1]; } resourceText.setProperty("build.timestamp",buildTimestamp); + return out; + } + protected void processDocs(RootDoc rootDoc, PrintStream out) { // Cache packages as we see them, since there's no direct way to iterate over packages. 
Set packages = new HashSet(); @@ -97,13 +108,19 @@ public class ResourceBundleExtractorDoclet { if(isRequiredJavadocMissing(currentClass) && isWalker(currentClass)) undocumentedWalkers.add(currentClass.name()); - renderHelpText(getClassName(currentClass),currentClass,versionPrefix,versionSuffix,absoluteVersion); + renderHelpText(HelpUtils.getClassName(currentClass),currentClass); } for(PackageDoc currentPackage: packages) - renderHelpText(currentPackage.name(),currentPackage,versionPrefix,versionSuffix,absoluteVersion); + renderHelpText(currentPackage.name(),currentPackage); - resourceText.store(out,"Strings displayed by the Sting help system"); + try { + resourceText.store(out,"Strings displayed by the Sting help system"); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } catch ( IOException e ) { + throw new RuntimeException(e); + } // ASCII codes for making text blink final String blink = "\u001B\u005B\u0035\u006D"; @@ -111,8 +128,6 @@ public class ResourceBundleExtractorDoclet { if(undocumentedWalkers.size() > 0) Utils.warnUser(String.format("The following walkers are currently undocumented: %s%s%s", blink, Utils.join(" ",undocumentedWalkers), reset)); - - return true; } /** @@ -121,7 +136,7 @@ public class ResourceBundleExtractorDoclet { * @return Number of potential parameters; 0 if not supported. */ public static int optionLength(String option) { - if(option.equals("-build-timestamp") || option.equals("-version-prefix") || option.equals("-version-suffix") || option.equals("-out") || option.equals("-absolute-version") ) { + if(option.equals("-build-timestamp") || option.equals("-out") || option.equals("-absolute-version") ) { return 2; } return 0; @@ -137,7 +152,7 @@ public class ResourceBundleExtractorDoclet { * @throws IOException if there is an I/O-related error other than FileNotFoundException * while attempting to read the resource file. 
*/ - private static void loadExistingResourceFile( String resourceFileName, RootDoc rootDoc ) throws IOException { + private void loadExistingResourceFile( String resourceFileName, RootDoc rootDoc ) throws IOException { try { BufferedReader resourceFile = new BufferedReader(new FileReader(resourceFileName)); try { @@ -157,27 +172,8 @@ public class ResourceBundleExtractorDoclet { * @param classDoc the type of the given class. * @return True if the class of the given name is a walker. False otherwise. */ - private static boolean isWalker(ClassDoc classDoc) { - try { - Class type = Class.forName(getClassName(classDoc)); - return Walker.class.isAssignableFrom(type) && JVMUtils.isConcrete(type); - } - catch(Throwable t) { - // Ignore errors. - return false; - } - } - - /** - * Reconstitute the class name from the given class JavaDoc object. - * @param classDoc the Javadoc model for the given class. - * @return The (string) class name of the given class. - */ - private static String getClassName(ClassDoc classDoc) { - PackageDoc containingPackage = classDoc.containingPackage(); - return containingPackage.name().length() > 0 ? - String.format("%s.%s",containingPackage.name(),classDoc.name()) : - String.format("%s",classDoc.name()); + protected static boolean isWalker(ClassDoc classDoc) { + return HelpUtils.assignableToClass(classDoc, Walker.class, true); } /** @@ -186,8 +182,6 @@ public class ResourceBundleExtractorDoclet { * @return True if the JavaDoc is missing. False otherwise. */ private static boolean isRequiredJavadocMissing(ClassDoc classDoc) { - if(classDoc.containingPackage().name().contains("oneoffprojects")) - return false; return classDoc.commentText().length() == 0 || classDoc.commentText().contains("Created by IntelliJ"); } @@ -195,53 +189,23 @@ public class ResourceBundleExtractorDoclet { * Renders all the help text required for a given name. * @param elementName element name to use as the key * @param element Doc element to process. 
- * @param versionPrefix Text to add to the start of the version string. - * @param versionSuffix Text to add to the end of the version string. */ - private static void renderHelpText(String elementName, Doc element, String versionPrefix, String versionSuffix, String absoluteVersion) { - // Extract overrides from the doc tags. - String name = null; - String version = null; + private void renderHelpText(String elementName, Doc element) { StringBuilder summaryBuilder = new StringBuilder(); for(Tag tag: element.firstSentenceTags()) summaryBuilder.append(tag.text()); String summary = summaryBuilder.toString(); String description = element.commentText(); - for(Tag tag: element.tags()) { - if(tag.name().equals("@"+DisplayNameTaglet.NAME)) { - if(name != null) - throw new ReviewedStingException("Only one display name tag can be used per package / walker."); - name = tag.text(); - } - else if(tag.name().equals("@"+VERSION_TAGLET_NAME)) { - if ( absoluteVersion != null ) { - version = absoluteVersion; - } - else { - version = String.format("%s%s%s", (versionPrefix != null) ? versionPrefix : "", - tag.text(), - (versionSuffix != null) ? versionSuffix : ""); - } - } - else if(tag.name().equals("@"+SummaryTaglet.NAME)) - summary = tag.text(); - else if(tag.name().equals("@"+DescriptionTaglet.NAME)) - description = tag.text(); - } - - // Write out an alternate element name, if exists. - if(name != null) - resourceText.setProperty(String.format("%s.%s",elementName,DisplayNameTaglet.NAME),name); - - if(version != null) - resourceText.setProperty(String.format("%s.%s",elementName,VERSION_TAGLET_NAME),version); + // this might seem unnecessary, but the GATK command line program uses this tag to determine the version when running + if(absoluteVersion != null) + resourceText.setProperty(String.format("%s.%s",elementName,VERSION_TAGLET_NAME),absoluteVersion); // Write out an alternate element summary, if exists. 
- resourceText.setProperty(String.format("%s.%s",elementName,SummaryTaglet.NAME),formatText(summary)); + resourceText.setProperty(String.format("%s.%s",elementName,SUMMARY_TAGLET_NAME),formatText(summary)); // Write out an alternate description, if present. - resourceText.setProperty(String.format("%s.%s",elementName,DescriptionTaglet.NAME),formatText(description)); + resourceText.setProperty(String.format("%s.%s",elementName,DESCRIPTION_TAGLET_NAME),formatText(description)); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java deleted file mode 100644 index db8b55940..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java +++ /dev/null @@ -1,58 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.tools.doclets.Taglet; - -import java.util.Map; - -/** - * Provide an alternate brief summary for this walker / package. - * Acts as an alternative to the first sentence employed by default. - * @author mhanna - * @version 0.1 - */ -public class SummaryTaglet extends HelpTaglet { - /** - * The key tag for this taglet. - */ - public static final String NAME = "help.summary"; - - /** - * Return the name of this custom tag. - */ - @Override - public String getName() { - return NAME; - } - - /** - * Will return false since overviews are always named - * by the @WalkerName tag. - * @return false always - */ - @Override - public boolean inOverview() { - return true; - } - - /** - * Will return true to indicate that packages can be given useful summary. - * @return true always - */ - @Override - public boolean inPackage() { - return true; - } - - /** - * Register this Taglet. - * @param tagletMap the map to register this tag to. 
- */ - public static void register(Map tagletMap) { - SummaryTaglet tag = new SummaryTaglet(); - Taglet t = (Taglet)tagletMap.get(tag.getName()); - if (t != null) { - tagletMap.remove(tag.getName()); - } - tagletMap.put(tag.getName(), tag); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java index 988240ef9..2bc3fa284 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.utils.interval; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.gatk.refdata.utils.StringToGenomeLocIteratorAdapter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java b/public/java/src/org/broadinstitute/sting/utils/interval/StringToGenomeLocIteratorAdapter.java similarity index 95% rename from public/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java rename to public/java/src/org/broadinstitute/sting/utils/interval/StringToGenomeLocIteratorAdapter.java index fc7f7c58f..659260345 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/StringToGenomeLocIteratorAdapter.java @@ -23,7 +23,7 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.refdata.utils; +package org.broadinstitute.sting.utils.interval; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 66e1afecb..12899e898 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -82,7 +82,7 @@ public class PileupElement { // -------------------------------------------------------------------------- private Integer getReducedReadQualityTagValue() { - return (Integer)getRead().getAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); + return getRead().getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); } public boolean isReducedRead() { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 517f9f75d..c55a462f1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -41,6 +41,10 @@ public class GATKSAMRecord extends SAMRecord { // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; + /** A private cache for the reduced read quality. Null indicates the value hasn't been fetched yet or isn't available */ + private boolean lookedUpReducedReadQuality = false; + private Integer reducedReadQuality; + // These temporary attributes were added here to make life easier for // certain algorithms by providing a way to label or attach arbitrary data to // individual GATKSAMRecords.
@@ -338,7 +342,17 @@ public class GATKSAMRecord extends SAMRecord { public Object getAttribute(final String tag) { return mRecord.getAttribute(tag); } - public Integer getIntegerAttribute(final String tag) { return mRecord.getIntegerAttribute(tag); } + public Integer getIntegerAttribute(final String tag) { + if ( tag == ReadUtils.REDUCED_READ_QUALITY_TAG ) { + if ( ! lookedUpReducedReadQuality ) { + lookedUpReducedReadQuality = true; + reducedReadQuality = mRecord.getIntegerAttribute(tag); + } + return reducedReadQuality; + } else { + return mRecord.getIntegerAttribute(tag); + } + } public Short getShortAttribute(final String tag) { return mRecord.getShortAttribute(tag); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 6c15910b1..62bbb0307 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -112,7 +113,42 @@ public class ReadUtils { * @version 0.1 */ - public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR } + public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR} + + /** + * This enum represents all the different ways in which a read can overlap an interval. + * + * NO_OVERLAP: + * the read does not overlap the interval. 
+ * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_LEFT: + * the read starts before the beginning of the interval but ends inside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_RIGHT: + * the read starts inside the interval but ends outside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_LEFT_AND_RIGHT: + * the read starts before the interval and ends after the interval + * + * |-----------| (interval) + * <-------------------> (read) + * + * OVERLAP_CONTAINED: + * the read starts and ends inside the interval + * + * |----------------| (interval) + * <--------> (read) + */ + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * God, there's a huge information asymmetry in SAM format: @@ -396,16 +432,34 @@ public class ReadUtils { keepEnd = rec.getReadLength() - l - 1; newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); break; - case H: - // TODO -- must be handled specially - throw new ReviewedStingException("BUG: tell mark he forgot to implement this"); + default: newCigarElements.add(ce); break; } } - return hardClipBases(rec, keepStart, keepEnd, newCigarElements); + // Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S + // this will happen if you soft clip a read that has been hard clipped before + // like: 5H20S => 5H20H + List mergedCigarElements = new LinkedList(); + Iterator cigarElementIterator = newCigarElements.iterator(); + CigarOperator currentOperator = null; + int currentOperatorLength = 0; + while (cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + if (currentOperator != cigarElement.getOperator()) { + if (currentOperator != null) + mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + currentOperator = cigarElement.getOperator(); +
currentOperatorLength = cigarElement.getLength(); + } + else + currentOperatorLength += cigarElement.getLength(); + } + mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + + return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements); } /** @@ -424,8 +478,7 @@ public class ReadUtils { "keepEnd < rec.getReadLength()", "rec.getReadUnmappedFlag() || newCigarElements != null"}) @Ensures("result != null") - public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, - List newCigarElements) { + public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List newCigarElements) { int newLength = keepEnd - keepStart + 1; if ( newLength != rec.getReadLength() ) { try { @@ -569,7 +622,149 @@ public class ReadUtils { return 0; } + /** + * Determines what is the position of the read in relation to the interval. + * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. + * @param read the read + * @param interval the interval + * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) + */ + public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { + int start = getRefCoordSoftUnclippedStart(read); + int stop = getRefCoordSoftUnclippedEnd(read); + + if ( !read.getReferenceName().equals(interval.getContig()) ) + return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; + + else if ( stop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; + + else if ( start > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; + + else if ( (start >= interval.getStart()) && + (stop <= interval.getStop()) ) + return ReadAndIntervalOverlap.OVERLAP_CONTAINED; + + else if ( (start < interval.getStart()) && + (stop > interval.getStop()) ) + return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; + + else if ( (start < interval.getStart()) ) + return ReadAndIntervalOverlap.OVERLAP_LEFT; + + else + 
return ReadAndIntervalOverlap.OVERLAP_RIGHT; + } + + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + public static int getRefCoordSoftUnclippedStart(SAMRecord read) { + int start = read.getUnclippedStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + return start; + } + + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) + public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { + int stop = read.getUnclippedStart(); + int shift = 0; + CigarOperator lastOperator = null; + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + stop += shift; + lastOperator = cigarElement.getOperator(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) + shift = cigarElement.getLength(); + else + shift = 0; + } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; + } + + /** + * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before + * the alignment start of the read. + * + * @param read + * @param refCoord + * @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before) + */ + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord < read.getAlignmentStart()"}) + private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(SAMRecord read, int refCoord) { + if (getRefCoordSoftUnclippedStart(read) <= refCoord) + return refCoord - getRefCoordSoftUnclippedStart(read) + 1; + return -1; + } + + + /** + * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region after + * the alignment end of the read. 
+ * + * @param read + * @param refCoord + * @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before) + */ + @Requires({"refCoord <= read.getUnclippedEnd()", "refCoord > read.getAlignmentEnd()"}) + private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(SAMRecord read, int refCoord) { + if (getRefCoordSoftUnclippedEnd(read) >= refCoord) + return refCoord - getRefCoordSoftUnclippedStart(read) + 1; + return -1; + } + + + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Ensures({"result >= 0", "result < read.getReadLength()"}) + public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { + int readBases = 0; + int refBases = 0; + + if (refCoord < read.getAlignmentStart()) { + readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord); + if (readBases < 0) + throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. No equivalent read coordinate."); + } + else if (refCoord > read.getAlignmentEnd()) { + readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(read, refCoord); + if (readBases < 0) + throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. 
No equivalent read coordinate."); + } + else { + int goal = refCoord - read.getAlignmentStart(); // The goal is to move this many reference bases + boolean goalReached = refBases == goal; + + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + + if (cigarElement.getOperator().consumesReferenceBases()) { + if (refBases + cigarElement.getLength() < goal) { + shift = cigarElement.getLength(); + } + else { + shift = goal - refBases; + } + refBases += shift; + } + goalReached = refBases == goal; + + if (cigarElement.getOperator().consumesReadBases()) { + readBases += goalReached ? shift : cigarElement.getLength(); + } + } + + if (!goalReached) + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + + return readBases; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index f6aa882ad..9d4b23a8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -25,10 +25,11 @@ package org.broadinstitute.sting.utils.text; import org.broadinstitute.sting.commandline.ParsingEngine; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; @@ -92,7 +93,9 @@ public class ListFileUtils { * @param RODBindings a text equivale * @return a list 
of expanded, bound RODs. */ - public static Collection unpackRODBindings(final List RODBindings, final String dbSNPFile, final ParsingEngine parser) { + @Deprecated + public static Collection unpackRODBindingsOldStyle(final Collection RODBindings, final ParsingEngine parser) { + // todo -- this is a strange home for this code. Move into ROD system Collection rodBindings = new ArrayList(); for (String fileName: RODBindings) { @@ -120,21 +123,54 @@ public class ListFileUtils { rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags)); } - if (dbSNPFile != null) { - if(dbSNPFile.toLowerCase().contains("vcf")) - throw new UserException("--DBSNP (-D) argument currently does not support VCF. To use dbSNP in VCF format, please use -B:dbsnp,vcf ."); + return rodBindings; + } - final Tags tags = parser.getTags(dbSNPFile); - String fileName = expandFileName(dbSNPFile); - RMDTriplet.RMDStorageType storageType = fileName.toLowerCase().endsWith("stdin") ? RMDTriplet.RMDStorageType.STREAM : RMDTriplet.RMDStorageType.FILE; + /** + * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. + * @param RODBindings the RodBinding objects to convert, one RMDTriplet is produced per binding + * @return a list of expanded, bound RODs. + */ + public static Collection unpackRODBindings(final Collection RODBindings, final ParsingEngine parser) { + // todo -- this is a strange home for this code.
Move into ROD system + Collection rodBindings = new ArrayList(); + FeatureManager builderForValidation = new FeatureManager(); - rodBindings.add(new RMDTriplet(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME,"dbsnp",fileName,storageType,tags)); + for (RodBinding rodBinding: RODBindings) { + String argValue = rodBinding.getSource(); + String fileName = expandFileName(argValue); + String name = rodBinding.getName(); + String type = rodBinding.getTribbleType(); + + RMDTriplet.RMDStorageType storageType = null; + if(rodBinding.getTags().getValue("storage") != null) + storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage")); + else if(fileName.toLowerCase().endsWith("stdin")) + storageType = RMDTriplet.RMDStorageType.STREAM; + else + storageType = RMDTriplet.RMDStorageType.FILE; + + RMDTriplet triplet = new RMDTriplet(name,type,fileName,storageType,rodBinding.getTags()); + + // validate triplet type + FeatureManager.FeatureDescriptor descriptor = builderForValidation.getByTriplet(triplet); + if ( descriptor == null ) + throw new UserException.UnknownTribbleType(rodBinding.getTribbleType(), + String.format("Field %s had provided type %s but there's no such Tribble type. The compatible types are: %n%s", + rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); + if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) ) + throw new UserException.BadArgumentValue(rodBinding.getName(), + String.format("Field %s expects Features of type %s, but the input file produces Features of type %s. The compatible types are: %n%s", + rodBinding.getName(), rodBinding.getType().getSimpleName(), descriptor.getSimpleFeatureName(), + builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); + + + rodBindings.add(triplet); } return rodBindings; } - /** * Expand any special characters that appear in the filename. 
Right now, '-' is expanded to * '/dev/stdin' only, but in the future, special characters like '~' and '*' that are passed diff --git a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java index 1d4251542..d6bf72ec4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java @@ -104,7 +104,7 @@ public class TextFormattingUtils { bundle = ResourceBundle.getBundle(bundleName); } catch(MissingResourceException ex) { - logger.warn("Unable to load help text. Help output will be sparse."); + //logger.warn("Unable to load help text. Help output will be sparse."); // Generate an empty resource bundle. try { bundle = new PropertyResourceBundle(new StringReader("")); @@ -116,4 +116,57 @@ public class TextFormattingUtils { return bundle; } + + /** + * Returns the word starting positions within line, excluding the first position 0. + * The returned list is compatible with splitFixedWidth. + * @param line Text to parse. + * @return the word starting positions within line, excluding the first position 0. + */ + public static List getWordStarts(String line) { + if (line == null) + throw new ReviewedStingException("line is null"); + List starts = new ArrayList(); + int stop = line.length(); + for (int i = 1; i < stop; i++) + if (Character.isWhitespace(line.charAt(i-1))) + if(!Character.isWhitespace(line.charAt(i))) + starts.add(i); + return starts; + } + + /** + * Parses a fixed width line of text. + * @param line Text to parse. + * @param columnStarts the column starting positions within line, excluding the first position 0. + * @return The parsed string array with each entry trimmed. 
+ */ + public static String[] splitFixedWidth(String line, List columnStarts) { + if (line == null) + throw new ReviewedStingException("line is null"); + if (columnStarts == null) + throw new ReviewedStingException("columnStarts is null"); + int startCount = columnStarts.size(); + String[] row = new String[startCount + 1]; + if (startCount == 0) { + row[0] = line.trim(); + } else { + row[0] = line.substring(0, columnStarts.get(0)).trim(); + for (int i = 1; i < startCount; i++) + row[i] = line.substring(columnStarts.get(i - 1), columnStarts.get(i)).trim(); + row[startCount] = line.substring(columnStarts.get(startCount - 1)).trim(); + } + return row; + } + + /** + * Parses a line of text by whitespace. + * @param line Text to parse. + * @return The parsed string array. + */ + public static String[] splitWhiteSpace(String line) { + if (line == null) + throw new ReviewedStingException("line is null"); + return line.trim().split("\\s+"); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java deleted file mode 100644 index d16c19130..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java +++ /dev/null @@ -1,16 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.util.concurrent.locks.ReentrantLock; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 1/19/11 - * Time: 9:50 AM - * - * Simple extension of a ReentrantLock that supports a close method. 
- */ -public class ClosableReentrantLock extends ReentrantLock { - public boolean ownsLock() { return super.isHeldByCurrentThread(); } - public void close() {} -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java deleted file mode 100644 index 3763ec67d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java +++ /dev/null @@ -1,114 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.*; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * Keeps a copy of the processing locks in a file - */ -public class FileBackedGenomeLocProcessingTracker extends GenomeLocProcessingTracker { - private static final Logger logger = Logger.getLogger(FileBackedGenomeLocProcessingTracker.class); - private static final boolean DEBUG = false; - private static final String READ_MODE = "r"; - private static final String WRITE_MODE = "rws"; - - private final File sharedFile; - private final GenomeLocParser parser; - private long lastReadPosition = 0; - - public FileBackedGenomeLocProcessingTracker(File sharedFile, GenomeLocParser parser, ClosableReentrantLock lock, PrintStream status) { - super(lock, status); - - this.sharedFile = sharedFile; - this.parser = parser; - } - - private RandomAccessFile openFile(String mode) { - try { - return new RandomAccessFile(sharedFile, mode); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } - } - - private void closeFile(RandomAccessFile raFile) { - try { - if ( raFile != null ) 
raFile.close(); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } - } - - @Override - protected List readNewLocs() { - List newPLocs = new ArrayList(); // todo -- gratitous object creation - - if ( sharedFile.exists() ) { - RandomAccessFile raFile = null; - try { - raFile = openFile(READ_MODE); - //logger.warn(String.format("Reading new locs at: file.length=%d last=%d", raFile.length(), lastReadPosition)); - if ( raFile.length() > lastReadPosition ) { - raFile.seek(lastReadPosition); - - int counter = 0; - String line = raFile.readLine(); // Read another line - while ( line != null ) { - String[] parts = line.split(" "); - if ( parts.length != 2 ) throw new ReviewedStingException("BUG: bad sharedFile line '" + line + "' at " + raFile.getFilePointer()); - ProcessingLoc ploc = new ProcessingLoc(parser.parseGenomeLoc(parts[0]), parts[1]); - //logger.warn(" Read " + ploc); - newPLocs.add(ploc); - line = raFile.readLine(); - counter++; - } - lastReadPosition = raFile.getFilePointer(); - if ( DEBUG ) logger.warn(String.format("Read %s locs from file, current pos is %d, # read new locs is %d", - counter, lastReadPosition, newPLocs.size())); - } - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(sharedFile, e); - } catch (IOException e) { - throw new ReviewedStingException("Couldn't read sharedFile " + sharedFile, e); - } finally { - closeFile(raFile); - } - } - - return newPLocs; - } - - @Override - protected void registerNewLocs(Collection plocs) { - RandomAccessFile raFile = null; - - try { - raFile = openFile(WRITE_MODE); - long startPos = raFile.getFilePointer(); - raFile.seek(raFile.length()); - StringBuffer bytes = new StringBuffer(); - for ( ProcessingLoc ploc : plocs ) { - String packet = String.format("%s %s%n", ploc.getLocation(), ploc.getOwner()); - bytes.append(packet); - if ( DEBUG ) logger.warn(String.format("Wrote loc %s to file: %d + %d bytes ending at %d", ploc, startPos, 
packet.length(), raFile.getFilePointer())); - } - raFile.write(bytes.toString().getBytes()); - //raFile.getChannel().force(true); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } finally { - closeFile(raFile); - } - } -} - - diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java deleted file mode 100644 index e97a73fb8..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java +++ /dev/null @@ -1,486 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.text.SimpleDateFormat; -import java.util.*; - -/** - * Abstract base class to coordinating data processing by a collecting for processes / threads. - * - * Conceptually, the genome is viewed as a collection of non-overlapping genome location: - * - * chr1:1-10 - * chr1:11-20 - * chr1:21-30 - * etc. - * - * This class, and it's concrete derived classes, provide the ability to claim individual locations - * as "mine", and exclude other processes / threads from processing them. 
At the lowest-level this - * is implemented by the claimOwnership(loc, name) function, that returns true if loc free (unclaimed) - * and makes name the owner of loc. High-level, and more efficient operations provide claiming - * iterators over streams of objects implementing the HasGenomeLocation interface, so that you can - * write code that looks like: - * - * for ( GenomeLoc ownedLoc : onlyOwned(allLocsToProcess.iterator) ) { - * doSomeWork(ownedLoc) - * - * Much of the code in this class is actually surrounding debugging and performance metrics code. - * The actual synchronization code is separated out into the ClosableReentrantLock() system - * and the two abstract functions: - * - * protected abstract void registerNewLocs(Collection plocs); - * protected abstract Collection readNewLocs(); - * - * That maintain the state of the tracker. - * - * That is, the ProcessingTracker is made of two components: a thread / process locking system and - * a subclass that implements the methods to record new claimed state changes and to read out updates - * that may have occurred by another thread or process. - * - * NOTE: this class assumes that all threads / processes are working with the same set of potential - * GenomeLocs to own. Claiming chr1:1-10 and then chr1:5-6 is allowed by the system. Basically, - * you only can stake claim to GenomeLocs that are .equal(). 
- */ -public abstract class GenomeLocProcessingTracker { - private final static Logger logger = Logger.getLogger(FileBackedGenomeLocProcessingTracker.class); - private final static SimpleDateFormat STATUS_FORMAT = new SimpleDateFormat("HH:mm:ss,SSS"); - private final static int DEFAULT_OWNERSHIP_ITERATOR_SIZE = 1; - - /** - * Useful state strings for printing status - */ - private final static String GOING_FOR_LOCK = "going_for_lock"; - private final static String RELEASING_LOCK = "releasing_lock"; - private final static String HAVE_LOCK = "have_lock"; - private final static String RUNNING = "running"; - - /** - * A map, for efficiency, that allows quick lookup of the processing loc for a - * given GenomeLoc. The map points from loc -> loc / owner as a ProcessingLoc - */ - private final Map processingLocs; - - /** - * The locking object used to protect data from simulatanous access by multiple - * threads or processes. - */ - private final ClosableReentrantLock lock; - - /** A stream for writing status messages. Can be null if we aren't writing status */ - private final PrintStream status; - - // - // Timers for recording performance information - // Note -- these cannot be used because this class isn't thread safe, and neither are the - // timers, so they result in invalid operations w.r.t. 
the SimpleTimer contract - // -// protected final SimpleTimer writeTimer = new SimpleTimer("writeTimer"); -// protected final SimpleTimer readTimer = new SimpleTimer("readTimer"); -// protected final SimpleTimer lockWaitTimer = new SimpleTimer("lockWaitTimer"); - protected final SimpleTimer timer = new SimpleTimer(); - protected long nLocks = 0, nWrites = 0, nReads = 0; - - // -------------------------------------------------------------------------------- - // - // Creating ProcessingTrackers - // - // -------------------------------------------------------------------------------- - public GenomeLocProcessingTracker(ClosableReentrantLock lock, PrintStream status) { - this.processingLocs = new HashMap(); - this.status = status; - this.lock = lock; - printStatusHeader(); - } - - // -------------------------------------------------------------------------------- - // - // Code to override to change the dynamics of the the GenomeLocProcessingTracker - // - // -------------------------------------------------------------------------------- - - protected void close() { - lock.close(); - if ( status != null ) status.close(); - } - - /** - * Takes a collection of newly claimed (i.e., previous unclaimed) genome locs - * and the name of their owner and "registers" this data in some persistent way that's - * visible to all threads / processes communicating via this GenomeLocProcessingTracker. - * - * Could be a in-memory data structure (a list) if we are restricting ourselves to intra-memory - * parallelism, a locked file on a shared file system, or a server we communicate with. - * - * @param plocs - */ - protected abstract void registerNewLocs(Collection plocs); - - /** - * The inverse of the registerNewLocs() function. Looks at the persistent data store - * shared by all threads / processes and returns the ones that have appeared since the last - * call to readNewLocs(). 
Note that we expect the pair of registerNewLocs and readNewLocs to - * include everything, even locs registered by this thread / process. For example: - * - * readNewLocs() => List() - * registerNewLocs(List(x, y,)) => void - * readNewLocs() => List(x,y)) - * - * even for this thread or process. - * @return - */ - protected abstract Collection readNewLocs(); - - - // -------------------------------------------------------------------------------- - // - // Code to claim intervals for processing and query for their ownership - // - // -------------------------------------------------------------------------------- - - /** - * Queries the current database if a location is owned. Does not guarantee that the - * loc can be owned in a future call, though. - * - * @param loc - * @return - */ - public final boolean locIsOwned(GenomeLoc loc, String id) { - return findOwner(loc, id) != null; - } - - /** - * The workhorse routine. Attempt to claim processing ownership of loc, with my name. - * This is an atomic operation -- other threads / processes will wait until this function - * returns. The return result is the ProcessingLoc object describing who owns this - * location. If the location isn't already claimed and we now own the location, the pl owner - * will be myName. Otherwise, the name of the owner can found in the pl. 
- * - * @param loc - * @param myName - * @return - */ - public final ProcessingLoc claimOwnership(final GenomeLoc loc, final String myName) { - // processingLocs is a shared memory synchronized object, and this - // method is synchronized, so we can just do our processing - return new WithLock(myName) { - public ProcessingLoc doBody() { - ProcessingLoc owner = findOwner(loc, myName); - if ( owner == null ) { // we are unowned - owner = new ProcessingLoc(loc, myName); - registerNewLocsWithTimers(Arrays.asList(owner), myName); - } - return owner; - } - }.run(); - } - - - // -------------------------------------------------------------------------------- - // - // High-level iterator-style interface to claiming ownership - // - // -------------------------------------------------------------------------------- - - /** - * A higher-level, and more efficient, interface to obtain the next location we own. Takes an - * iterator producing objects that support the getLocation() interface, and returns the next - * object in that stream that we can claim ownership of. Returns null if we run out of elements - * during the iteration. 
- * - * Can be more efficiently implemented in subclasses to avoid multiple unlocking - * - * @param iterator - * @param myName - * @return - */ - public final T claimOwnershipOfNextAvailable(Iterator iterator, String myName) { - OwnershipIterator myIt = new OwnershipIterator(iterator, myName, 1); - return myIt.next(); - } - - public final Iterable onlyOwned(Iterator iterator, String myName) { - return new OwnershipIterator(iterator, myName); - } - - private final class OwnershipIterator implements Iterator, Iterable { - private final Iterator subit; - private final String myName; - private final Queue cache; - private final int cacheSize; - - public OwnershipIterator(Iterator subit, String myName) { - this(subit, myName, DEFAULT_OWNERSHIP_ITERATOR_SIZE); - } - - public OwnershipIterator(Iterator subit, String myName, int cacheSize) { - this.subit = subit; - this.myName = myName; - cache = new LinkedList(); - this.cacheSize = cacheSize; - } - - /** - * Will return true for all elements of subit, even if we can't get ownership of some of the future - * elements and so will return null there - * @return - */ - public final boolean hasNext() { - return cache.peek() != null || subit.hasNext(); - } - - /** - * High performance iterator that only locks and unlocks once per claimed object found. 
Avoids - * locking / unlocking for each query - * - * @return an object of type T owned by this thread, or null if none of the remaining object could be claimed - */ - public final T next() { - if ( cache.peek() != null) - return cache.poll(); - else { - // cache is empty, we need to fill up the cache and return the first element of the queue - return new WithLock(myName) { - public T doBody() { - // read once the database of owners at the start - updateAndGetProcessingLocs(myName); - - boolean done = false; - Queue pwns = new LinkedList(); // ;-) - while ( !done && cache.size() < cacheSize && subit.hasNext() ) { - final T elt = subit.next(); - GenomeLoc loc = elt.getLocation(); - - ProcessingLoc owner = processingLocs.get(loc); - - if ( owner == null ) { // we are unowned - owner = new ProcessingLoc(loc, myName); - pwns.offer(owner); - if ( ! cache.offer(elt) ) throw new ReviewedStingException("Cache offer unexpectedly failed"); - if ( GenomeLoc.isUnmapped(loc) ) done = true; - } - // if not, we continue our search - } - - registerNewLocsWithTimers(pwns, myName); - - // we've either filled up the cache or run out of elements. Either way we return - // the first element of the cache. If the cache is empty, we return null here. - return cache.poll(); - } - }.run(); - } - } - - public final void remove() { - throw new UnsupportedOperationException(); - } - - public final Iterator iterator() { - return this; - } - } - - // -------------------------------------------------------------------------------- - // - // private / protected low-level accessors / manipulators and utility functions - // - // -------------------------------------------------------------------------------- - - /** - * Useful debugging function that returns the ProcessingLoc who owns loc. 
ID - * is provided for debugging purposes - * @param loc - * @param id - * @return - */ - protected final ProcessingLoc findOwner(GenomeLoc loc, String id) { - // fast path to check if we already have the existing genome loc in memory for ownership claims - // getProcessingLocs() may be expensive [reading from disk, for example] so we shouldn't call it - // unless necessary - ProcessingLoc x = processingLocs.get(loc); - return x == null ? updateAndGetProcessingLocs(id).get(loc) : x; - } - - /** - * Returns the list of currently owned locations, updating the database as necessary. - * DO NOT MODIFY THIS MAP! As with all parallelizing data structures, the list may be - * out of date immediately after the call returns, or may be updating on the fly. - * @return - */ - protected final Map updateAndGetProcessingLocs(String myName) { - return new WithLock>(myName) { - public Map doBody() { -// readTimer.restart(); - for ( ProcessingLoc p : readNewLocs() ) - processingLocs.put(p.getLocation(), p); -// readTimer.stop(); - nReads++; - return processingLocs; - } - }.run(); - } - - /** - * Wrapper around registerNewLocs that also times the operation - * - * @param plocs - * @param myName - */ - protected final void registerNewLocsWithTimers(Collection plocs, String myName) { -// writeTimer.restart(); - registerNewLocs(plocs); - nWrites++; -// writeTimer.stop(); - } - - private final void printStatusHeader() { - if ( status != null ) status.printf("process.id\thr.time\ttime\tstate%n"); - } - - private final void printStatus(String id, long machineTime, String state) { - // prints a line like processID human-readable-time machine-time state - if ( status != null ) { - status.printf("%s\t%s\t%d\t%s%n", id, STATUS_FORMAT.format(machineTime), machineTime, state); - status.flush(); - } - } - - - /** - * Lock the data structure, preventing other threads / processes from reading and writing to the - * common store - * @param id the name of the process doing the locking - */ - private 
final void lock(String id) { - //lockWaitTimer.restart(); - boolean hadLock = lock.ownsLock(); - if ( ! hadLock ) { - nLocks++; - //printStatus(id, lockWaitTimer.currentTime(), GOING_FOR_LOCK); - } - lock.lock(); - //lockWaitTimer.stop(); - //if ( ! hadLock ) printStatus(id, lockWaitTimer.currentTime(), HAVE_LOCK); - } - - /** - * Unlock the data structure, allowing other threads / processes to read and write to the common store - * @param id the name of the process doing the unlocking - */ - private final void unlock(String id) { - if ( lock.getHoldCount() == 1 ) printStatus(id, timer.currentTime(), RELEASING_LOCK); - lock.unlock(); - if ( ! lock.ownsLock() ) printStatus(id, timer.currentTime(), RUNNING); - } - - // useful code for getting - public final long getNLocks() { return nLocks; } - public final long getNReads() { return nReads; } - public final long getNWrites() { return nWrites; } -// public final double getTimePerLock() { return lockWaitTimer.getElapsedTime() / Math.max(nLocks, 1); } -// public final double getTimePerRead() { return readTimer.getElapsedTime() / Math.max(nReads,1); } -// public final double getTimePerWrite() { return writeTimer.getElapsedTime() / Math.max(nWrites,1); } - - // -------------------------------------------------------------------------------- - // - // Java-style functional form for with lock do { x }; - // - // -------------------------------------------------------------------------------- - - /** - * Private utility class that executes doBody() method with the lock() acquired and - * handles property unlock()ing the system, even if an error occurs. 
Allows one to write - * clean code like: - * - * new WithLock(name) { - * public Integer doBody() { doSomething(); return 1; } - * }.run() - * - * @param the return type of the doBody() method - */ - private abstract class WithLock { - private final String myName; - - public WithLock(String myName) { - this.myName = myName; - } - - protected abstract T doBody(); - - public T run() { - boolean locked = false; - try { - lock(myName); - locked = true; - return doBody(); - } finally { - if (locked) unlock(myName); - } - } - } - - // -------------------------------------------------------------------------------- - // - // main function for testing performance - // - // -------------------------------------------------------------------------------- - public static void main(String[] args) { - //BasicConfigurator.configure(); - - final String ref = args[0]; - final File file = new File(args[1]); - final int cycles = Integer.valueOf(args[2]); - - File referenceFile = new File(ref); - try { - final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(referenceFile); - final String chr1 = fasta.getSequenceDictionary().getSequence(1).getSequenceName(); - final GenomeLocParser genomeLocParser = new GenomeLocParser(fasta); - - final class MyTest { - String name; - GenomeLocProcessingTracker tracker; - - MyTest(String name, GenomeLocProcessingTracker tracker) { - this.name = name; - this.tracker = tracker; - } - - public void execute(int cycles) { - SimpleTimer delta = new SimpleTimer("delta"); - SimpleTimer timer = new SimpleTimer("none"); - - if ( file.exists() ) file.delete(); - timer.start(); - delta.start(); - for ( int i = 1; i < cycles; i++ ) { - tracker.claimOwnership(genomeLocParser.createGenomeLoc(chr1, i, i+1), "ABCDEFGHIJKL"); - if ( i % 1000 == 0 ) { - System.out.printf("%s\t%d\t%d\t%.4f\t%.4f%n", name, i, timer.currentTime(), timer.getElapsedTime(), delta.getElapsedTime() ); - delta.restart(); - } - } - } - } - - 
System.out.printf("name\tcycle\tcurrent.time\telapsed.time\tdelta%n"); - new MyTest("in-memory", new SharedMemoryGenomeLocProcessingTracker(new ClosableReentrantLock())).execute(cycles); - new MyTest("nio", new FileBackedGenomeLocProcessingTracker(file, genomeLocParser, new ClosableReentrantLock(), null)).execute(cycles); - new MyTest("nio-file-lock", new FileBackedGenomeLocProcessingTracker(file, genomeLocParser, new SharedFileThreadSafeLock(file,1), null)).execute(cycles); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java deleted file mode 100644 index ad2a6d31b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -/** - * Base class, and null tracker. Always says that a GenomeLoc is ready for processing. It is - * critical that this class already return that a loc is owned, no matter if it's been seen before, - * etc. 
ReadShards can differ in their contents but have the same "unmapped" genome loc - */ -public class NoOpGenomeLocProcessingTracker extends GenomeLocProcessingTracker { - public NoOpGenomeLocProcessingTracker() { - super(new ClosableReentrantLock(), null); - } - - @Override - protected void registerNewLocs(Collection loc) { - ; - } - - @Override - protected List readNewLocs() { - return Collections.emptyList(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java b/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java deleted file mode 100644 index ee2283dcf..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 1/19/11 - * Time: 8:06 AM - * - * Information about processing locations and their owners. Contains two basic data, associated - * together. The first is a genome loc, and the second is the name of the owner, as a string. - * - * chr1:1-10 Mark - * chr2:11-20 DePristo - * - * would be two ProcessingLocs that first indicate that the first 10 bp of chr1 are owned by Mark, - * and the second is owned by DePristo. 
- */ -public class ProcessingLoc implements HasGenomeLocation { - private final GenomeLoc loc; - private final String owner; - - /** - * Create a loc that's already owned - * @param loc - * @param owner - */ - public ProcessingLoc(GenomeLoc loc, String owner) { - if ( loc == null || owner == null ) { - throw new ReviewedStingException("BUG: invalid ProcessingLoc detected: " + loc + " owner " + owner); - } - - this.loc = loc; - this.owner = owner.intern(); // reduce memory consumption by interning the string - } - - public GenomeLoc getLocation() { - return loc; - } - - public String getOwner() { - return owner; - } - - /** - * Returns true iff the owner of this processing loc is name. Can be used to determine - * the owner of this processing location. - * - * @param name - * @return - */ - public boolean isOwnedBy(String name) { - return getOwner().equals(name); - } - - public String toString() { return String.format("ProcessingLoc(%s,%s)", loc, owner); } - - public boolean equals(Object other) { - if (other instanceof ProcessingLoc ) - return this.loc.equals(((ProcessingLoc)other).loc) && this.owner.equals(((ProcessingLoc)other).owner); - else - return false; - } - - public int compareTo(ProcessingLoc other) { - return this.getLocation().compareTo(other.getLocation()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java deleted file mode 100644 index 0f47da413..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java +++ /dev/null @@ -1,171 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Logger; -import org.apache.lucene.store.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; - -/** - * User: depristo - * Date: 1/19/11 - * 
Time: 8:24 AM - * - * A reentrant lock for a shared file common file in the file system. Relies on a a Lucene SimpleFSLock - * to manage on disk file locking. - */ -public class SharedFileLock extends ClosableReentrantLock { // todo -- kinda gross inheritance. The super lock is never used - private static Logger logger = Logger.getLogger(SharedFileLock.class); - - private static final String VERIFY_HOST = System.getProperty("verify.host", "gsa1"); - private static final boolean VERIFY = false; - private static final int VERIFY_PORT = 5050; - - // 5 minutes => 360 seconds of trying -> failure - protected static final int DEFAULT_N_TRIES = 1000; - protected static final long DEFAULT_MILLISECONDS_PER_TRY = 360; - - /** The file we are locking */ - private final File file; - - private final LockFactory lockFactory; - private Lock fileLock = null; - - /** - * A counter that indicates the number of 'locks' on this file. - * If locks == 2, then two unlocks are required - * before any resources are freed. - */ - int fileLockReentrantCounter = 0; - - // type of locking - private final int nRetries; - private final long milliSecPerTry; - - /** - * Create a SharedFileThreadSafeLock object locking the file - * @param file - */ - public SharedFileLock(File file, int nRetries, long milliSecPerTry, int ID) { - super(); - this.file = file; - this.nRetries = nRetries; - this.milliSecPerTry = milliSecPerTry; - - File lockDir = new File(file.getParent() == null ? 
"./" : file.getParent()); - try { - LockFactory factory = new SimpleFSLockFactory(lockDir); - if ( VERIFY ) { // don't forget to start up the VerifyLockServer - this.lockFactory = new VerifyingLockFactory((byte)ID, factory, VERIFY_HOST, VERIFY_PORT); - } else { - this.lockFactory = factory; - } - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(lockDir, "Could not create coordination file locking directory " + lockDir, e); - } - } - - public SharedFileLock(File file, int ID) { - this(file, DEFAULT_N_TRIES, DEFAULT_MILLISECONDS_PER_TRY, ID); - } - - @Override - public void close() { - if ( ownsLock() ) throw new ReviewedStingException("closing SharedFileLock while still owned: ownership count " + fileLockReentrantCounter); - } - - @Override - public int getHoldCount() { - return fileLockReentrantCounter; - } - - @Override - public boolean ownsLock() { - return fileLockReentrantCounter > 0; - } - - // ------------------------------------------------------------------------------------------ - // - // workhorse routines -- acquiring file locks - // - // ------------------------------------------------------------------------------------------ - - private boolean obtainFileLock() throws IOException { - // annoying bug work around for verifylockserver - if ( VERIFY ) - try { - return fileLock.obtain(1); - } catch ( LockObtainFailedException e ) { - return false; - } - else - return fileLock.obtain(); - } - - /** - * Two stage [threading then file] locking mechanism. Reenterant in that multiple lock calls will be - * unwound appropriately. Uses file channel lock *after* thread locking. 
- */ - @Override - public void lock() { - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" lock() " + Thread.currentThread().getName() + ", fileLockReentrantCounter = " + fileLockReentrantCounter); - if ( fileLockReentrantCounter++ == 0 ) { - // Precondition -- lock is always null while we don't have a lock - if ( fileLock != null ) - throw new ReviewedStingException("BUG: lock() function called when a lock already is owned!"); - - int i = 1; - fileLock = lockFactory.makeLock(file.getName() + ".lock"); - try { - boolean obtained = obtainFileLock(); // todo -- maybe use intrinsic lock features - for ( ; ! obtained && i < nRetries; i++ ) { - try { - //logger.warn("tryLock failed on try " + i + ", waiting " + milliSecPerTry + " millseconds for retry"); - Thread.sleep(milliSecPerTry); - } catch ( InterruptedException e ) { - throw new UserException("SharedFileThreadSafeLock interrupted during wait for file lock", e); - } - obtained = obtainFileLock(); // gross workaround for error in verify server - } - - if ( i > 1 ) logger.warn("tryLock required " + i + " tries before completing, waited " + i * milliSecPerTry + " millseconds"); - - if ( ! obtained ) { - fileLock = null; - // filelock == null -> we never managed to acquire the lock! 
- throw new UserException("SharedFileThreadSafeLock failed to obtain the lock after " + nRetries + " attempts"); - } - - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" lock() " + Thread.currentThread().getName() + ", obtained = " + obtained + ", tries = " + i); - } catch (IOException e) { - fileLock = null; - throw new ReviewedStingException("Coordination file could not be created because a lock could not be obtained.", e); - } - } - } - - @Override - public void unlock() { - // update for reentrant unlocking - if ( fileLock == null ) throw new ReviewedStingException("BUG: file lock is null -- file lock was not obtained"); - if ( fileLockReentrantCounter <= 0 ) throw new ReviewedStingException("BUG: file lock counter < 0"); - - // this unlock counts as 1 unlock. If this is our last unlock, actually do something - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" unlock() " + Thread.currentThread().getName() + ", count = " + fileLockReentrantCounter); - if ( --fileLockReentrantCounter == 0 ) { - try { - if ( ! 
fileLock.isLocked() ) throw new ReviewedStingException("BUG: call to unlock() when we don't have a valid lock!"); - fileLock.release(); - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" unlock() " + Thread.currentThread().getName() + ", actually releasing"); - } catch ( IOException e ) { - throw new ReviewedStingException("Could not free file lock on file " + file, e); - } finally { // make sure we null out the filelock, regardless of our state - fileLock = null; - } - } else { - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" unlock() " + Thread.currentThread().getName() + ", skipping, count = " + fileLockReentrantCounter); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java deleted file mode 100644 index d70879a0a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.File; - -/** - * User: depristo - * Date: 1/19/11 - * Time: 8:24 AM - * - * A reentrant lock that supports multi-threaded locking as well as a shared file lock on a common - * file in the file system. It itself a shared memory reenterant lock to managed thread safety and - * contains a SharedFileLock to handle the file integrity. 
- */ -public class SharedFileThreadSafeLock extends ClosableReentrantLock { - private static Logger logger = Logger.getLogger(SharedFileThreadSafeLock.class); - protected static final boolean DEBUG = false; - - private final SharedFileLock fileLock; - - /** - * Create a SharedFileThreadSafeLock object locking the file - * @param file - */ - public SharedFileThreadSafeLock(File file, int nRetries, long milliSecPerTry, int ID) { - super(); - this.fileLock = new SharedFileLock(file, nRetries, milliSecPerTry, ID); - } - - public SharedFileThreadSafeLock(File file, int ID) { - this(file, SharedFileLock.DEFAULT_N_TRIES, SharedFileLock.DEFAULT_MILLISECONDS_PER_TRY, ID); - } - - @Override - public void close() { - super.close(); - fileLock.close(); - } - - @Override - public int getHoldCount() { - if ( super.getHoldCount() != fileLock.getHoldCount() ) - throw new ReviewedStingException("BUG: unequal hold counts. threadlock = " + super.getHoldCount() + ", filelock = " + fileLock.getHoldCount()); - return super.getHoldCount(); - } - - @Override - public boolean ownsLock() { - return super.isHeldByCurrentThread() && fileLock.ownsLock(); - } - - /** - * Two stage [threading then file] locking mechanism. Reenterant in that multiple lock calls will be - * unwound appropriately. Uses file channel lock *after* thread locking. - */ - @Override - public void lock() { - if ( DEBUG ) logger.warn("Attempting SharedFileThreadSafe lock: " + Thread.currentThread().getName()); - if ( DEBUG ) logger.warn(" going for thread lock: " + Thread.currentThread().getName()); - super.lock(); - if ( DEBUG ) logger.warn(" going for file lock: " + Thread.currentThread().getName()); - fileLock.lock(); // todo -- should this be in a try? 
- } - - @Override - public void unlock() { - if ( DEBUG ) logger.warn(" releasing filelock: " + Thread.currentThread().getName()); - fileLock.unlock(); - if ( DEBUG ) logger.warn(" releasing threadlock: " + Thread.currentThread().getName()); - super.unlock(); - if ( DEBUG ) logger.warn(" unlock() complete: " + Thread.currentThread().getName()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java deleted file mode 100644 index 9bf8b58b1..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * Thread-safe shared memory only implementation. Uses a simple list to manage the newly - * added processing locations. 
- */ -public class SharedMemoryGenomeLocProcessingTracker extends GenomeLocProcessingTracker { - private List newPLocs = new ArrayList(); - - protected SharedMemoryGenomeLocProcessingTracker(ClosableReentrantLock lock) { - super(lock, null); - } - - protected SharedMemoryGenomeLocProcessingTracker(ClosableReentrantLock lock, PrintStream status) { - super(lock, status); - } - - @Override - protected void registerNewLocs(Collection plocs) { - newPLocs.addAll(plocs); - } - - @Override - protected List readNewLocs() { - List r = newPLocs; - newPLocs = new ArrayList(); - return r; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index 0b5976c3c..fdf3d97db 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -57,6 +57,13 @@ public class Genotype { return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); } + public static Genotype removePLs(Genotype g) { + Map attrs = new HashMap(g.getAttributes()); + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); + return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); + } + public static Genotype modifyAlleles(Genotype g, List alleles) { return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? 
g.getFilters() : null, g.getAttributes(), g.isPhased()); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java index a191670a4..a752f4a1b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java @@ -27,15 +27,15 @@ public class MutableVariantContext extends VariantContext { } public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } public MutableVariantContext(VariantContext parent) { - this(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes()); + super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index eab392c4d..cfd59b504 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -5,6 +5,7 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -163,11 +164,12 @@ import java.util.*; public class VariantContext implements Feature { // to enable tribble intergration protected InferredGeneticContext commonInfo = null; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - public final static String REFERENCE_BASE_FOR_INDEL_KEY = "_REFERENCE_BASE_FOR_INDEL_"; public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; public final static String ID_KEY = "ID"; + private final Byte REFERENCE_BASE_FOR_INDEL; + public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ @@ -205,6 +207,25 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- + /** + * the complete constructor. Makes a complete VariantContext from its arguments + * This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES. 
+ * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes map + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + * @param referenceBaseForIndel padded reference base + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); + } + /** * the complete constructor. Makes a complete VariantContext from its arguments * @@ -219,7 +240,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); } /** @@ -237,9 +258,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param negLog10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes + * @param referenceBaseForIndel padded reference base */ - public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, true); + public VariantContext(String source, String contig, long start, long stop, Collection 
alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true); } /** @@ -256,7 +278,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, false); + this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); } /** @@ -269,11 +291,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param alleles alleles */ public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, false); + this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); } /** - * Create a new variant context without genotypes and no Perror, no filters, and no attributes + * Create a new variant context with genotypes but without Perror, filters, and attributes * * @param source source * @param contig the contig @@ -292,7 +314,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param other the VariantContext to copy */ public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? 
other.getFilters() : null, other.getAttributes(), false); + this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false); } /** @@ -307,8 +329,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param negLog10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes + * @param referenceBaseForIndel padded reference base + * @param genotypesAreUnparsed true if the genotypes have not yet been parsed */ - private VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, boolean genotypesAreUnparsed) { + private VariantContext(String source, String contig, long start, long stop, + Collection alleles, Map genotypes, + double negLog10PError, Set filters, Map attributes, + Byte referenceBaseForIndel, boolean genotypesAreUnparsed) { if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } this.contig = contig; this.start = start; @@ -323,6 +350,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes); filtersWereAppliedToContext = filters != null; + REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel; if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } @@ -355,23 +383,27 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - return new VariantContext(vc.getSource(), vc.getChr(), 
vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), false); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false); } public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), true); + return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true); + } + + public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) { + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true); } public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true); } // --------------------------------------------------------------------------------------------------------- @@ -414,7 +446,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return vc subcontext */ public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes()); + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? 
getFilters() : null, getAttributes(), getReferenceBaseForIndel()); } @@ -553,24 +585,24 @@ public class VariantContext implements Feature { // to enable tribble intergrati /** * @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null) */ - public boolean isInsertion() { - // can't just call !isDeletion() because of complex indels - return getType() == Type.INDEL && getReference().isNull(); + public boolean isSimpleInsertion() { + // can't just call !isSimpleDeletion() because of complex indels + return getType() == Type.INDEL && getReference().isNull() && isBiallelic(); } /** * @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null) */ - public boolean isDeletion() { - // can't just call !isInsertion() because of complex indels - return getType() == Type.INDEL && getAlternateAllele(0).isNull(); + public boolean isSimpleDeletion() { + // can't just call !isSimpleInsertion() because of complex indels + return getType() == Type.INDEL && getAlternateAllele(0).isNull() && isBiallelic(); } /** * @return true if the alleles indicate neither a simple deletion nor a simple insertion */ public boolean isComplexIndel() { - return isIndel() && !isDeletion() && !isInsertion(); + return isIndel() && !isSimpleDeletion() && !isSimpleInsertion(); } public boolean isSymbolic() { @@ -603,6 +635,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati return (String)commonInfo.getAttribute(ID_KEY); } + public boolean hasReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL != null; + } + + // the indel base that gets stripped off for indels + public Byte getReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL; + } + // --------------------------------------------------------------------------------------------------------- // // get routines to access context info fields @@ -776,6 +817,28 @@ public class VariantContext implements Feature { // to enable tribble 
intergrati throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this); } + /** + * @param other VariantContext whose alternate alleles to compare against + * @return true if this VariantContext has the same alternate alleles as other, + * regardless of ordering. Otherwise returns false. + */ + public boolean hasSameAlternateAllelesAs ( VariantContext other ) { + Set thisAlternateAlleles = getAlternateAlleles(); + Set otherAlternateAlleles = other.getAlternateAlleles(); + + if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { + return false; + } + + for ( Allele allele : thisAlternateAlleles ) { + if ( ! otherAlternateAlleles.contains(allele) ) { + return false; + } + } + + return true; + } + // --------------------------------------------------------------------------------------------------------- // // Working with genotypes @@ -783,8 +846,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- private void loadGenotypes() { - if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) + if ( !hasAttribute(UNPARSED_GENOTYPE_MAP_KEY) ) { + if ( genotypes == null ) + genotypes = NO_GENOTYPES; return; + } Object parserObj = getAttribute(UNPARSED_GENOTYPE_PARSER_KEY); if ( parserObj == null || !(parserObj instanceof VCFParser) ) @@ -939,7 +1005,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return true if it's monomorphic */ public boolean isMonomorphic() { - return ! isVariant() || getChromosomeCount(getReference()) == getChromosomeCount(); + return ! 
isVariant() || (hasGenotypes() && getHomRefCount() + getNoCallCount() == getNSamples()); } /** @@ -1020,11 +1086,12 @@ public class VariantContext implements Feature { // to enable tribble intergrati * Run all extra-strict validation tests on a Variant Context object * * @param reference the true reference allele + * @param paddedRefBase the reference base used for padding indels * @param rsIDs the true dbSNP IDs */ - public void extraStrictValidation(Allele reference, Set rsIDs) { + public void extraStrictValidation(Allele reference, Byte paddedRefBase, Set rsIDs) { // validate the reference - validateReferenceBases(reference); + validateReferenceBases(reference, paddedRefBase); // validate the RS IDs validateRSIDs(rsIDs); @@ -1039,10 +1106,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati //checkReferenceTrack(); } - public void validateReferenceBases(Allele reference) { - // don't validate if we're an insertion - if ( !reference.isNull() && !reference.basesMatch(getReference()) ) { - throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); + public void validateReferenceBases(Allele reference, Byte paddedRefBase) { + // don't validate if we're a complex event + if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) { + throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. 
VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); + } + + // we also need to validate the padding base for simple indels + if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) { + throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), (char)paddedRefBase.byteValue(), (char)getReferenceBaseForIndel().byteValue())); } } @@ -1151,6 +1223,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati private boolean validate(boolean throwException) { try { + validateReferencePadding(); validateAlleles(); validateGenotypes(); } catch ( IllegalArgumentException e ) { @@ -1163,6 +1236,16 @@ public class VariantContext implements Feature { // to enable tribble intergrati return true; } + private void validateReferencePadding() { + if (hasSymbolicAlleles()) // symbolic alleles don't need padding... 
+ return; + + boolean needsPadding = (getReference().length() == getEnd() - getStart()); // off by one because padded base was removed + + if ( needsPadding && !hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + getStart() + "; no padded reference base was provided."); + } + private void validateAlleles() { // check alleles boolean alreadySeenRef = false, alreadySeenNull = false; @@ -1221,16 +1304,6 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- - // the indel base that gets stripped off for indels - public boolean hasReferenceBaseForIndel() { - return hasAttribute(REFERENCE_BASE_FOR_INDEL_KEY); - } - - // the indel base that gets stripped off for indels - public byte getReferenceBaseForIndel() { - return hasReferenceBaseForIndel() ? (Byte)getAttribute(REFERENCE_BASE_FOR_INDEL_KEY) : (byte)'N'; - } - private void determineType() { if ( type == null ) { switch ( getNAlleles() ) { @@ -1357,8 +1430,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati return false; } - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, byte inputRefBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { - Allele refAllele = inputVC.getReference(); + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { // see if we need to pad common reference base from all alleles boolean padVC; @@ -1368,31 +1440,20 @@ public class VariantContext implements Feature { // to enable tribble intergrati long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; if (inputVC.hasSymbolicAlleles()) padVC = true; - else if (refAllele.length() == locLength) + else if (inputVC.getReference().length() == locLength) padVC = false; - else 
if (refAllele.length() == locLength-1) + else if (inputVC.getReference().length() == locLength-1) padVC = true; else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); - // nothing to do if we don't need to pad bases if (padVC) { - Byte refByte; - Map attributes = inputVC.getAttributes(); + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - // upper-case for consistency; note that we can safely make these casts because the input is constrained to be a byte - inputRefBase = (byte)Character.toUpperCase((char)inputRefBase); - if (attributes.containsKey(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) - refByte = (Byte)attributes.get(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY); - else if (inputRefBase == 'A' || inputRefBase == 'T' || inputRefBase == 'C' || inputRefBase == 'G' || inputRefBase == 'N') - refByte = inputRefBase; - else - throw new IllegalArgumentException("Error when trying to pad Variant Context at location " + String.valueOf(inputVC.getStart()) - + " in contig " + inputVC.getChr() + - ". Either input reference base ("+(char)inputRefBase+ - ", ascii code="+inputRefBase+") must be a regular base, or input VC must contain reference base key"); + Byte refByte = inputVC.getReferenceBaseForIndel(); List alleles = new ArrayList(); Map genotypes = new TreeMap(); @@ -1444,11 +1505,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati // Do not change the filter state if filters were not applied to this context Set inputVCFilters = inputVC.filtersWereAppliedToContext ? 
inputVC.getFilters() : null; - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), - inputVCFilters, attributes); - - - + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes()); } else return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 212600360..986d6305c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -295,10 +295,7 @@ public class VariantContextUtils { @Requires("vc != null") @Ensures("result != null") public static VariantContext sitesOnlyVariantContext(VariantContext vc) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), - vc.getAlleles(), vc.getNegLog10PError(), - vc.filtersWereApplied() ? vc.getFilters() : null, - vc.getAttributes()); + return VariantContext.modifyGenotypes(vc, null); } /** @@ -348,11 +345,33 @@ public class VariantContextUtils { } public enum GenotypeMergeType { - UNIQUIFY, PRIORITIZE, UNSORTED, REQUIRE_UNIQUE + /** + * Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD. + */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. + */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE } public enum FilteredRecordMergeType { - KEEP_IF_ANY_UNFILTERED, KEEP_IF_ALL_UNFILTERED + /** + * Union - leaves the record if any record is unfiltered. 
+ */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED } /** @@ -449,7 +468,7 @@ public class VariantContextUtils { FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, boolean annotateOrigin, boolean printMessages, byte inputRefBase ) { - return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false, false); + return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, "set", false, false); } /** @@ -464,7 +483,6 @@ public class VariantContextUtils { * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? - * @param inputRefBase the ref base * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? @@ -472,7 +490,7 @@ public class VariantContextUtils { */ public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, List priorityListOfVCs, FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, - boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey, + boolean annotateOrigin, boolean printMessages, String setKey, boolean filteredAreUncalled, boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -490,7 +508,7 @@ public class VariantContextUtils { for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc,inputRefBase,false)); + VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -592,6 +610,14 @@ public class VariantContextUtils { } } + // if we have more alternate alleles in the merged VC than in one or more of the original VCs, we need to strip out the GL/PLs (because they are no longer accurate) + for ( VariantContext vc : VCs ) { + if ( vc.alleles.size() != alleles.size() ) { + genotypes = stripPLs(genotypes); + break; + } + } + // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); @@ -631,12 +657,94 @@ public class VariantContextUtils { VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? 
attributesWithMaxAC : attributes) ); // Trim the padded bases of all alleles if necessary - merged = AbstractVCFCodec.createVariantContextWithTrimmedAlleles(merged); + merged = createVariantContextWithTrimmedAlleles(merged); if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } + public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { + // see if we need to trim common reference base from all alleles + boolean trimVC; + + // We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles + Allele refAllele = inputVC.getReference(); + if (!inputVC.isVariant()) + trimVC = false; + else if (refAllele.isNull()) + trimVC = false; + else { + trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList(inputVC.getAlternateAlleles()), + inputVC.getReference().getDisplayString()) > 0); + } + + // nothing to do if we don't need to trim bases + if (trimVC) { + List alleles = new ArrayList(); + Map genotypes = new TreeMap(); + + // set the reference base for indels in the attributes + Map attributes = new TreeMap(inputVC.getAttributes()); + + Map originalToTrimmedAlleleMap = new HashMap(); + + for (Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length()); + Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // detect case where we're trimming bases but resulting vc doesn't have any null allele. 
In that case, we keep original representation + // example: mixed records such as {TA*,TGA,TG} + boolean hasNullAlleles = false; + + for (Allele a: originalToTrimmedAlleleMap.values()) { + if (a.isNull()) + hasNullAlleles = true; + if (a.isReference()) + refAllele = a; + } + + if (!hasNullAlleles) + return inputVC; + // now we can recreate new genotypes with trimmed alleles + for ( Map.Entry sample : inputVC.getGenotypes().entrySet() ) { + + List originalAlleles = sample.getValue().getAlleles(); + List trimmedAlleles = new ArrayList(); + for ( Allele a : originalAlleles ) { + if ( a.isCalled() ) + trimmedAlleles.add(originalToTrimmedAlleleMap.get(a)); + else + trimmedAlleles.add(Allele.NO_CALL); + } + genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); + + } + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); + + } + + return inputVC; + } + + public static Map stripPLs(Map genotypes) { + Map newGs = new HashMap(genotypes.size()); + + for ( Map.Entry g : genotypes.entrySet() ) { + newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? 
Genotype.removePLs(g.getValue()) : g.getValue()); + } + + return newGs; + } + public static Map> separateVariantContextsByType(Collection VCs) { HashMap> mappedVCs = new HashMap>(); for ( VariantContext vc : VCs ) { diff --git a/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java b/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java index 27b76537f..cf0f9051e 100644 --- a/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java +++ b/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java @@ -27,7 +27,6 @@ package net.sf.picard.reference; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSourceProgressListener; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -40,7 +39,6 @@ import java.io.FileNotFoundException; public class FastaSequenceIndexBuilderUnitTest extends BaseTest { private FastaSequenceIndexBuilder builder; - private ReferenceDataSourceProgressListener progress; private File fastaFile; private FastaSequenceIndex controlIndex; @@ -58,7 +56,7 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { logger.warn("Executing unixFileTest"); fastaFile = new File(validationDataLocation + "exampleFASTA.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, progress); + builder = new FastaSequenceIndexBuilder(fastaFile, false); FastaSequenceIndex index = builder.createIndex(); controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0)); @@ -75,7 +73,7 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { logger.warn("Executing windowsFileTest"); fastaFile = new File(validationDataLocation + "exampleFASTA-windows.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, progress); + builder = new FastaSequenceIndexBuilder(fastaFile, false); FastaSequenceIndex index = builder.createIndex(); 
controlIndex.add(new FastaSequenceIndexEntry("chr2", 7, 29, 7, 9,0)); @@ -91,7 +89,7 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { logger.warn("Executing combinedWindowsUnix"); fastaFile = new File(validationDataLocation + "exampleFASTA-combined.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, progress); + builder = new FastaSequenceIndexBuilder(fastaFile, false); FastaSequenceIndex index = builder.createIndex(); controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0)); controlIndex.add(new FastaSequenceIndexEntry("chr2", 101680, 29, 7, 9,1)); @@ -108,7 +106,7 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { logger.warn("Executing threeVariableLengthContigs"); fastaFile = new File(validationDataLocation + "exampleFASTA-3contigs.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, progress); + builder = new FastaSequenceIndexBuilder(fastaFile, false); FastaSequenceIndex index = builder.createIndex(); controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 17, 5, 6,0)); controlIndex.add(new FastaSequenceIndexEntry("chr2", 35, 21, 7, 8,1)); diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index ef46d4bff..63faf1ab9 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -64,10 +64,10 @@ public abstract class BaseTest { public static final String b37Refseq = refseqAnnotationLocation + "refGene-big-table-b37.txt"; public static final String dbsnpDataLocation = GATKDataLocation; - public static final String hg18dbSNP129 = dbsnpDataLocation + "dbsnp_129_hg18.rod"; - public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.rod"; - public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.rod"; + public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.vcf"; + public static final 
String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; + public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index bea9eaec5..0194e114a 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -47,6 +47,7 @@ public class MD5DB { /** * Subdirectory under the ant build directory where we store integration test md5 results */ + private static final int MAX_RECORDS_TO_READ = 10000; public static final String LOCAL_MD5_DB_DIR = "integrationtests"; public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; @@ -78,8 +79,8 @@ public class MD5DB { * @return */ public static String getMD5FilePath(final String md5, final String valueIfNotFound) { - // we prefer the local db to the global DB, so match it first - for ( String dir : Arrays.asList(LOCAL_MD5_DB_DIR, GLOBAL_MD5_DB_DIR)) { + // we prefer the global db to the local DB, so match it first + for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { File f = getFileForMD5(md5, dir); if ( f.exists() && f.canRead() ) return f.getPath(); @@ -232,7 +233,7 @@ public class MD5DB { // inline differences DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(System.out, 20, 10, 0); - boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), params); + boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); if ( success ) 
System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", pathToExpectedMD5File, pathToFileMD5File); diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java index 54e3b35bc..f04731214 100755 --- a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java @@ -25,6 +25,9 @@ package org.broadinstitute.sting.commandline; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -37,11 +40,14 @@ import java.util.EnumSet; * Test suite for the parsing engine. 
*/ public class ParsingEngineUnitTest extends BaseTest { + /** we absolutely cannot have this file existing, or we'll fail the UnitTest */ + private final static String NON_EXISTANT_FILENAME_VCF = "this_file_should_not_exist_on_disk_123456789.vcf"; private ParsingEngine parsingEngine; @BeforeMethod public void setUp() { parsingEngine = new ParsingEngine(null); + RodBinding.resetNameCounter(); } private class InputFileArgProvider { @@ -62,7 +68,7 @@ public class ParsingEngineUnitTest extends BaseTest { Assert.assertEquals(argProvider.inputFile,"na12878.bam","Argument is not correctly initialized"); } - + @Test public void multiCharShortNameArgumentTest() { final String[] commandLine = new String[] {"-out","out.txt"}; @@ -211,7 +217,7 @@ public class ParsingEngineUnitTest extends BaseTest { Assert.assertEquals(argProvider.testEnum, TestEnum.ONE, "Enum value is not correct"); } - + @Test public void enumDefaultTest() { final String[] commandLine = new String[] {}; @@ -552,7 +558,7 @@ public class ParsingEngineUnitTest extends BaseTest { commandLine = new String[] {"--foo","5","--bar","6"}; parsingEngine.parse( commandLine ); - parsingEngine.validate(); + parsingEngine.validate(); } private class MutuallyExclusiveArgProvider { @@ -618,4 +624,317 @@ public class ParsingEngineUnitTest extends BaseTest { @ArgumentCollection RequiredArgProvider rap2 = new RequiredArgProvider(); } + + // -------------------------------------------------------------------------------- + // + // Tests of the RodBinding system + // + // -------------------------------------------------------------------------------- + + private class SingleRodBindingArgProvider { + @Input(fullName="binding", shortName="V", required=true) + public RodBinding binding; + } + + @Test + public void basicRodBindingArgumentTest() { + final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); + parsingEngine.parse( 
commandLine ); + parsingEngine.validate(); + + SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + private class ShortNameOnlyRodBindingArgProvider { + @Input(shortName="short", required=false) + public RodBinding binding; // = RodBinding.makeUnbound(Feature.class); + } + + @Test + public void shortNameOnlyRodBindingArgumentTest() { + final String[] commandLine = new String[] {"-short:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( ShortNameOnlyRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ShortNameOnlyRodBindingArgProvider argProvider = new ShortNameOnlyRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.isBound(), true, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + private class OptionalRodBindingArgProvider { + @Input(fullName="binding", shortName="V", 
required=false) + public RodBinding binding; + + @Input(fullName="bindingNull", shortName="VN", required=false) + public RodBinding bindingNull = null; + } + + @Test + public void optionalRodBindingArgumentTest() { + final String[] commandLine = new String[] {}; + + parsingEngine.addArgumentSource( OptionalRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + OptionalRodBindingArgProvider argProvider = new OptionalRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNotNull(argProvider.binding, "Default value not applied corrected to RodBinding"); + Assert.assertEquals(argProvider.binding.getName(), RodBinding.UNBOUND_VARIABLE_NAME, "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), RodBinding.UNBOUND_SOURCE, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.isBound(), false, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); + + Assert.assertNotNull(argProvider.bindingNull, "Default value not applied corrected to RodBinding"); + Assert.assertEquals(argProvider.bindingNull.getName(), RodBinding.UNBOUND_VARIABLE_NAME, "Name isn't set properly"); + Assert.assertEquals(argProvider.bindingNull.getSource(), RodBinding.UNBOUND_SOURCE, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.bindingNull.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.bindingNull.isBound(), false, "Bound() isn't returning its expected value"); + Assert.assertEquals(argProvider.bindingNull.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); + } + + @Test(expectedExceptions = UserException.class) + public void 
rodBindingArgumentTestMissingType() { + final String[] commandLine = new String[] {"-V",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + } + + @Test(expectedExceptions = UserException.class) + public void rodBindingArgumentTestTooManyTags() { + final String[] commandLine = new String[] {"-V:x,y,z",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( SingleRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + SingleRodBindingArgProvider argProvider = new SingleRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + } + + private class VariantContextRodBindingArgProvider { + @Input(fullName = "binding", shortName="V") + public RodBinding binding; + } + + @Test + public void variantContextBindingArgumentTest() { + final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test + public void variantContextBindingArgumentTestVCF3() { + final String[] commandLine = new 
String[] {"-V:vcf3",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + private class ListRodBindingArgProvider { + @Input(fullName = "binding", shortName="V", required=false) + public List> bindings; + } + + @Test + public void listRodBindingArgumentTest() { + final String[] commandLine = new String[] {"-V:vcf",NON_EXISTANT_FILENAME_VCF}; + + parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.bindings.size(), 1, "Unexpected number of bindings"); + RodBinding binding = argProvider.bindings.get(0); + Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test + public void listRodBindingArgumentTest2Args() { + final String[] commandLine = new String[] 
{"-V:vcf",NON_EXISTANT_FILENAME_VCF, "-V:vcf", "bar.vcf"}; + + parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.bindings.size(), 2, "Unexpected number of bindings"); + + RodBinding binding = argProvider.bindings.get(0); + Assert.assertEquals(binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(binding.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + + RodBinding binding2 = argProvider.bindings.get(1); + Assert.assertEquals(binding2.getName(), "binding2", "Name isn't set properly"); + Assert.assertEquals(binding2.getSource(), "bar.vcf", "Source isn't set to its expected value"); + Assert.assertEquals(binding2.getType(), Feature.class, "Type isn't set to its expected value"); + Assert.assertEquals(binding2.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test + public void listRodBindingArgumentTest0Args() { + final String[] commandLine = new String[] {}; + + parsingEngine.addArgumentSource( ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertNull(argProvider.bindings, "Bindings were not null"); + } + + @Test + public void listRodBindingArgumentTestExplicitlyNamed() { + final String[] commandLine = new String[] {"-V:foo,vcf",NON_EXISTANT_FILENAME_VCF, "-V:foo,vcf", "bar.vcf"}; + + parsingEngine.addArgumentSource( 
ListRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + ListRodBindingArgProvider argProvider = new ListRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.bindings.size(), 2, "Unexpected number of bindings"); + Assert.assertEquals(argProvider.bindings.get(0).getName(), "foo", "Name isn't set properly"); + Assert.assertEquals(argProvider.bindings.get(1).getName(), "foo2", "Name isn't set properly"); + } + + private final static String HISEQ_VCF = testDir + "HiSeq.10000.vcf"; + private final static String TRANCHES_FILE = testDir + "tranches.6.txt"; + + @Test + public void variantContextBindingTestDynamicTyping1() { + final String[] commandLine = new String[] {"-V", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 0, "Tags aren't correctly set"); + } + + @Test + public void variantContextBindingTestDynamicTypingNameAsSingleArgument() { + final String[] commandLine = new String[] {"-V:name", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + 
Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set"); + } + + @Test() + public void variantContextBindingTestDynamicTypingTwoTagsPassing() { + final String[] commandLine = new String[] {"-V:name,vcf", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + + Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 2, "Tags aren't correctly set"); + } + + @Test() + public void variantContextBindingTestDynamicTypingTwoTagsCausingTypeFailure() { + final String[] commandLine = new String[] {"-V:name,beagle", HISEQ_VCF}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.binding.getName(), "name", "Name isn't set properly"); + Assert.assertEquals(argProvider.binding.getSource(), HISEQ_VCF, "Source isn't set to its expected value"); + 
Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTribbleType(), "beagle", "Type isn't set to its expected value"); + Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 2, "Tags aren't correctly set"); + } + + @Test(expectedExceptions = UserException.class) + public void variantContextBindingTestDynamicTypingUnknownTribbleType() { + final String[] commandLine = new String[] {"-V", TRANCHES_FILE}; + + parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class ); + parsingEngine.parse( commandLine ); + parsingEngine.validate(); + + VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); + parsingEngine.loadArgumentsIntoObject( argProvider ); + } } diff --git a/public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java new file mode 100644 index 000000000..206f32532 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/RodBindingUnitTest.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.Test; +import org.testng.annotations.BeforeMethod; + +/** + * Test suite for the parsing engine. + */ +public class RodBindingUnitTest extends BaseTest { + Tags mytags = new Tags(); + + @BeforeMethod + public void setUp() { + RodBinding.resetNameCounter(); + } + + @Test + public void testStandardRodBinding() { + RodBinding b = new RodBinding(VariantContext.class, "b", "foo", "vcf", mytags); + Assert.assertEquals(b.getName(), "b"); + Assert.assertEquals(b.getType(), VariantContext.class); + Assert.assertEquals(b.getSource(), "foo"); + Assert.assertEquals(b.getTribbleType(), "vcf"); + Assert.assertEquals(b.isBound(), true); + } + + @Test + public void testUnboundRodBinding() { + RodBinding u = RodBinding.makeUnbound(VariantContext.class); + Assert.assertEquals(u.getName(), RodBinding.UNBOUND_VARIABLE_NAME); + Assert.assertEquals(u.getSource(), RodBinding.UNBOUND_SOURCE); + Assert.assertEquals(u.getType(), VariantContext.class); + Assert.assertEquals(u.getTribbleType(), RodBinding.UNBOUND_TRIBBLE_TYPE); + Assert.assertEquals(u.isBound(), false); + } + + @Test + public void testMultipleBindings() { + String name = "binding"; + RodBinding b1 = new RodBinding(VariantContext.class, name, "foo", "vcf", mytags); + Assert.assertEquals(b1.getName(), name); + 
Assert.assertEquals(b1.getType(), VariantContext.class); + Assert.assertEquals(b1.getSource(), "foo"); + Assert.assertEquals(b1.getTribbleType(), "vcf"); + Assert.assertEquals(b1.isBound(), true); + + RodBinding b2 = new RodBinding(VariantContext.class, name, "foo", "vcf", mytags); + Assert.assertEquals(b2.getName(), name + "2"); + Assert.assertEquals(b2.getType(), VariantContext.class); + Assert.assertEquals(b2.getSource(), "foo"); + Assert.assertEquals(b2.getTribbleType(), "vcf"); + Assert.assertEquals(b2.isBound(), true); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java new file mode 100644 index 000000000..5b5083ef3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +/** + * + */ +public class EngineFeaturesIntegrationTest extends WalkerTest { + private void testBadRODBindingInput(String type, String name, Class c) { + WalkerTestSpec spec = new WalkerTestSpec("-T SelectVariants -L 1:1 --variant:variant," + type + " " + + b37dbSNP132 + " -R " + b37KGReference + " -o %s", + 1, c); + executeTest(name, spec); + } + + @Test() private void testBadRODBindingInputType1() { + testBadRODBindingInput("beagle", "BEAGLE input to VCF expecting walker", UserException.BadArgumentValue.class); + } + + @Test() private void testBadRODBindingInputType2() { + testBadRODBindingInput("vcf3", "VCF3 input to VCF expecting walker", UserException.class); + } + + @Test() private void testBadRODBindingInputType3() { + testBadRODBindingInput("bed", "Bed input to VCF expecting walker", UserException.BadArgumentValue.class); + } + + @Test() private void testBadRODBindingInputTypeUnknownType() { + testBadRODBindingInput("bedXXX", "Unknown input to VCF expecting walker", UserException.UnknownTribbleType.class); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java index cd43927a4..6149a1e51 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/WalkerManagerUnitTest.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk; import org.testng.Assert; import 
org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.qc.CountLociWalker; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; @@ -64,7 +63,6 @@ public class WalkerManagerUnitTest { } @Hidden -@Requires(value={}) class UninstantiableWalker extends Walker { // Private constructor will generate uninstantiable message private UninstantiableWalker() {} diff --git a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java index 59edf934e..3a242cb13 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java @@ -81,7 +81,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest { collect.samFiles = input; collect.strictnessLevel = SAMFileReader.ValidationStringency.STRICT; collect.referenceFile = new File("referenceFile".toLowerCase()); - collect.DBSNPFile = "DBSNPFile".toLowerCase(); collect.unsafe = ValidationExclusion.TYPE.ALL; collect.downsampleFraction = null; collect.downsampleCoverage = null; @@ -89,14 +88,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest { collect.intervals.add("intervals".toLowerCase()); collect.excludeIntervals = new ArrayList(); collect.numberOfThreads = 1; - - // make some rod bindings up - ArrayList fakeBindings = new ArrayList(); - fakeBindings.add("Bind1"); - fakeBindings.add("Bind2"); - fakeBindings.add("Bind3"); - - collect.RODBindings = fakeBindings; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java 
index b32473b9d..85ae1e1f7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -1,15 +1,16 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -69,8 +70,8 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.emptyList()); ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10)); - Assert.assertEquals(tracker.getAllRods().size(), 0, "The tracker should not have produced any data"); + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10), null); + 
Assert.assertEquals(tracker.getValues(Feature.class).size(), 0, "The tracker should not have produced any data"); } /** @@ -87,8 +88,8 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.singletonList(dataSource)); ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); - TableFeature datum = tracker.lookup("tableTest",TableFeature.class); + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); + TableFeature datum = tracker.getFirstValue(TableFeature.class, "tableTest"); Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); @@ -113,14 +114,14 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Arrays.asList(dataSource1,dataSource2)); ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); - TableFeature datum1 = tracker.lookup("tableTest1",TableFeature.class); + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); + TableFeature datum1 = tracker.getFirstValue(TableFeature.class, "tableTest1"); Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); - TableFeature datum2 = 
tracker.lookup("tableTest2", TableFeature.class); + TableFeature datum2 = tracker.getFirstValue(TableFeature.class, "tableTest2"); Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java index 5b0d67e88..d45f6e667 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.datasources.rmd; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java new file mode 100644 index 000000000..1e39fd26f --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and 
associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata; + +import net.sf.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.*; +import java.util.*; +import java.util.List; + +public class 
RefMetaDataTrackerUnitTest { + final protected static Logger logger = Logger.getLogger(RefMetaDataTrackerUnitTest.class); + private static SAMFileHeader header; + private ReferenceContext context; + private GenomeLocParser genomeLocParser; + private GenomeLoc locus; + private final static int START_POS = 10; + Allele A,C,G,T; + VariantContext AC_SNP, AG_SNP, AT_SNP; + TableFeature span10_10, span1_20, span10_20; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + locus = genomeLocParser.createGenomeLoc("chr1", START_POS, START_POS); + context = new ReferenceContext(genomeLocParser, locus, (byte)'A'); + A = Allele.create("A", true); + C = Allele.create("C"); + G = Allele.create("G"); + T = Allele.create("T"); + AC_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)); + AG_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)); + AT_SNP = new VariantContext("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)); + span10_10 = makeSpan(10, 10); + span1_20 = makeSpan(1, 20); + span10_20 = makeSpan(10, 20); + } + + @BeforeMethod + public void reset() { + RodBinding.resetNameCounter(); + } + + private class MyTest extends BaseTest.TestDataProvider { + public RODRecordList AValues, BValues; + + private MyTest(Class c, final List AValues, final List BValues) { + super(c); + this.AValues = AValues == null ? null : makeRODRecord("A", AValues); + this.BValues = BValues == null ? null : makeRODRecord("B", BValues); + } + + private MyTest(final List AValues, final List BValues) { + super(MyTest.class); + this.AValues = AValues == null ? null : makeRODRecord("A", AValues); + this.BValues = BValues == null ? 
null : makeRODRecord("B", BValues); + } + + @Override + public String toString() { + return String.format("A=%s, B=%s", AValues, BValues); + } + + private final RODRecordList makeRODRecord(String name, List features) { + List x = new ArrayList(); + for ( Feature f : features ) + x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + return new RODRecordListImpl(name, x, locus); + } + + public List expected(String name) { + if ( name.equals("A+B") ) return allValues(); + if ( name.equals("A") ) return expectedAValues(); + if ( name.equals("B") ) return expectedBValues(); + throw new RuntimeException("FAIL"); + } + + public List allValues() { + List x = new ArrayList(); + x.addAll(expectedAValues()); + x.addAll(expectedBValues()); + return x; + } + + public List expectedAValues() { + return AValues == null ? Collections.emptyList() : AValues; + } + + public List expectedBValues() { + return BValues == null ? Collections.emptyList() : BValues; + } + + public RefMetaDataTracker makeTracker() { + List x = new ArrayList(); + if ( AValues != null ) x.add(AValues); + if ( BValues != null ) x.add(BValues); + return new RefMetaDataTracker(x, context); + } + + public int nBoundTracks() { + int n = 0; + if ( AValues != null ) n++; + if ( BValues != null ) n++; + return n; + } + } + + private final TableFeature makeSpan(int start, int stop) { + return new TableFeature(genomeLocParser.createGenomeLoc("chr1", start, stop), + Collections.emptyList(), Collections.emptyList()); + } + + @DataProvider(name = "tests") + public Object[][] createTests() { + new MyTest(null, null); + new MyTest(Arrays.asList(AC_SNP), null); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), null); + new MyTest(Arrays.asList(AC_SNP), Arrays.asList(AG_SNP)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(AG_SNP)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10, span10_20)); + new 
MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10, span10_20, span1_20)); + + // for requires starts + new MyTest(Arrays.asList(span1_20), null); + new MyTest(Arrays.asList(span10_10, span10_20), null); + new MyTest(Arrays.asList(span10_10, span10_20, span1_20), null); + + return MyTest.getTests(MyTest.class); + } + + @Test(enabled = true, dataProvider = "tests") + public void testRawBindings(MyTest test) { + logger.warn("Testing " + test + " for number of bound tracks"); + RefMetaDataTracker tracker = test.makeTracker(); + Assert.assertEquals(tracker.getNTracksWithBoundFeatures(), test.nBoundTracks()); + + testSimpleBindings("A", tracker, test.AValues); + testSimpleBindings("B", tracker, test.BValues); + } + + private void testSimpleBindings(String name, RefMetaDataTracker tracker, RODRecordList expected) { + List asValues = tracker.getValues(Feature.class, name); + + Assert.assertEquals(tracker.hasValues(name), expected != null); + Assert.assertEquals(asValues.size(), expected == null ? 0 : expected.size()); + + if ( expected != null ) { + for ( GATKFeature e : expected ) { + boolean foundValue = false; + for ( Feature f : asValues ) { + if ( e.getUnderlyingObject() == f ) foundValue = true; + } + Assert.assertTrue(foundValue, "Never found expected value of " + e.getUnderlyingObject() + " bound to " + name + " in " + tracker); + } + } + } + + @Test(enabled = true, dataProvider = "tests") + public void testGettersAsString(MyTest test) { + logger.warn("Testing " + test + " for get() methods"); + RefMetaDataTracker tracker = test.makeTracker(); + + for ( String name : Arrays.asList("A+B", "A", "B") ) { + List v1 = name.equals("A+B") ? tracker.getValues(Feature.class) : tracker.getValues(Feature.class, name); + testGetter(name, v1, test.expected(name), true, tracker); + + List v2 = name.equals("A+B") ? 
tracker.getValues(Feature.class, locus) : tracker.getValues(Feature.class, name, locus); + testGetter(name, v2, startingHere(test.expected(name)), true, tracker); + + Feature v3 = name.equals("A+B") ? tracker.getFirstValue(Feature.class) : tracker.getFirstValue(Feature.class, name); + testGetter(name, Arrays.asList(v3), test.expected(name), false, tracker); + + Feature v4 = name.equals("A+B") ? tracker.getFirstValue(Feature.class, locus) : tracker.getFirstValue(Feature.class, name, locus); + testGetter(name, Arrays.asList(v4), startingHere(test.expected(name)), false, tracker); + } + } + + @Test(enabled = true, dataProvider = "tests") + public void testGettersAsRodBindings(MyTest test) { + logger.warn("Testing " + test + " for get() methods as RodBindings"); + RefMetaDataTracker tracker = test.makeTracker(); + + for ( String nameAsString : Arrays.asList("A", "B") ) { + RodBinding binding = new RodBinding(Feature.class, nameAsString, "none", "vcf", new Tags()); + List v1 = tracker.getValues(binding); + testGetter(nameAsString, v1, test.expected(nameAsString), true, tracker); + + List v2 = tracker.getValues(binding, locus); + testGetter(nameAsString, v2, startingHere(test.expected(nameAsString)), true, tracker); + + Feature v3 = tracker.getFirstValue(binding); + testGetter(nameAsString, Arrays.asList(v3), test.expected(nameAsString), false, tracker); + + Feature v4 = tracker.getFirstValue(binding, locus); + testGetter(nameAsString, Arrays.asList(v4), startingHere(test.expected(nameAsString)), false, tracker); + } + } + + @Test(enabled = true, dataProvider = "tests") + public void testGettersAsListOfRodBindings(MyTest test) { + logger.warn("Testing " + test + " for get() methods for List"); + RefMetaDataTracker tracker = test.makeTracker(); + + String nameAsString = "A+B"; + RodBinding A = new RodBinding(Feature.class, "A", "none", "vcf", new Tags()); + RodBinding B = new RodBinding(Feature.class, "B", "none", "vcf", new Tags()); + List> binding = Arrays.asList(A, B); 
+ + List v1 = tracker.getValues(binding); + testGetter(nameAsString, v1, test.expected(nameAsString), true, tracker); + + List v2 = tracker.getValues(binding, locus); + testGetter(nameAsString, v2, startingHere(test.expected(nameAsString)), true, tracker); + + Feature v3 = tracker.getFirstValue(binding); + testGetter(nameAsString, Arrays.asList(v3), test.expected(nameAsString), false, tracker); + + Feature v4 = tracker.getFirstValue(binding, locus); + testGetter(nameAsString, Arrays.asList(v4), startingHere(test.expected(nameAsString)), false, tracker); + } + + private List startingHere(List l) { + List x = new ArrayList(); + for ( GATKFeature f : l ) if ( f.getStart() == locus.getStart() ) x.add(f); + return x; + } + + private void testGetter(String name, List got, List expected, boolean requireExact, RefMetaDataTracker tracker) { + if ( got.size() == 1 && got.get(0) == null ) + got = Collections.emptyList(); + + if ( requireExact ) + Assert.assertEquals(got.size(), expected.size()); + + boolean foundAny = false; + for ( GATKFeature e : expected ) { + boolean found1 = false; + for ( Feature got1 : got ) { + if ( e.getUnderlyingObject() == got1 ) + found1 = true; + } + if ( requireExact ) + Assert.assertTrue(found1, "Never found expected GATKFeature " + e + " bound to " + name + " in " + tracker); + foundAny = found1 || foundAny; + } + + if ( ! requireExact && ! 
expected.isEmpty() ) + Assert.assertTrue(foundAny, "Never found any got values matching one of the expected values bound to " + name + " in " + tracker); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java deleted file mode 100644 index fa20ea913..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDataUnitTest.java +++ /dev/null @@ -1,48 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; - -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; - - -/** - * - * @author aaron - * - * Class ReferenceOrderedDataUnitTest - * - * some functionality to test parts of the reference ordered data system that I've added. This is by NO MEANS - * a complete test suite, but additions would be extremely welcome - */ -public class ReferenceOrderedDataUnitTest extends BaseTest { - @Test - public void extractRodsFromFileTest() { - String file = validationDataLocation + "testRODFileImpl.csv"; - List lst = new ArrayList(); - ReferenceOrderedData.extractRodsFromFile(lst,file); - Assert.assertEquals(lst.size(), 6); - int index = 0; - for (String entry: lst) { - String first = entry.subSequence(0,entry.indexOf(",")).toString(); - Assert.assertTrue(first.equals("rod" + String.valueOf(++index))); - } - } - @Test - public void extractRodsFromMultiFileTest() { - String file = validationDataLocation + "testRODFileImpl.csv"; - String file2 = validationDataLocation + "testRODFileImpl2.csv"; - List lst = new ArrayList(); - ReferenceOrderedData.extractRodsFromFile(lst,file); - ReferenceOrderedData.extractRodsFromFile(lst,file2); - Assert.assertEquals(lst.size(), 12); - int index = 0; - for (String entry: lst) { - String first = entry.subSequence(0,entry.indexOf(",")).toString(); - 
Assert.assertTrue(first.equals("rod" + String.valueOf(++index))); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java new file mode 100644 index 000000000..bae8e99ed --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.refdata.tracks; + + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.codecs.table.BedTableCodec; +import org.broadinstitute.sting.utils.codecs.table.TableFeature; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.*; +import java.util.*; + + +/** + * @author depristo + * + * UnitTests for RMD FeatureManager + */ +public class FeatureManagerUnitTest extends BaseTest { + private static final File RANDOM_FILE = new File(validationDataLocation + "exampleGATKReport.eval"); + private static final File VCF3_FILE = new File(validationDataLocation + "vcfexample3.vcf"); + private static final File VCF4_FILE = new File(testDir + "HiSeq.10000.vcf"); + private static final File VCF4_FILE_GZ = new File(testDir + "HiSeq.10000.vcf.gz"); + + private FeatureManager manager; + private GenomeLocParser genomeLocParser; + + @BeforeMethod + public void setup() { + File referenceFile = new File(b36KGReference); + try { + IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + genomeLocParser = new GenomeLocParser(seq); + manager = new FeatureManager(); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(referenceFile,ex); + } + } + + @Test + public void testManagerCreation() { + 
Assert.assertTrue(manager.getFeatureDescriptors().size() > 0); + } + + private class FMTest extends BaseTest.TestDataProvider { + public Class codec; + public Class feature; + public String name; + public File associatedFile; + + private FMTest(final Class feature, final Class codec, final String name, final File file) { + super(FMTest.class); + this.codec = codec; + this.feature = feature; + this.name = name; + this.associatedFile = file; + } + + public void assertExpected(FeatureManager.FeatureDescriptor featureDescriptor) { + Assert.assertEquals(featureDescriptor.getCodecClass(), codec); + Assert.assertEquals(featureDescriptor.getFeatureClass(), feature); + Assert.assertEquals(featureDescriptor.getName().toLowerCase(), name.toLowerCase()); + } + + public String toString() { + return String.format("FMTest name=%s codec=%s feature=%s file=%s", + name, codec.getSimpleName(), feature.getSimpleName(), associatedFile); + } + } + + @DataProvider(name = "tests") + public Object[][] createTests() { + new FMTest(VariantContext.class, VCF3Codec.class, "VCF3", VCF3_FILE); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_GZ); + new FMTest(TableFeature.class, BedTableCodec.class, "bedtable", null); + return FMTest.getTests(FMTest.class); + } + + @Test(dataProvider = "tests") + public void testGetByFile(FMTest params) { + if ( params.associatedFile != null ) { + FeatureManager.FeatureDescriptor byFile = manager.getByFiletype(params.associatedFile); + Assert.assertNotNull(byFile, "Couldn't find any type associated with file " + params.associatedFile); + params.assertExpected(byFile); + } + } + + @Test + public void testGetByFileNoMatch() { + FeatureManager.FeatureDescriptor byFile = manager.getByFiletype(RANDOM_FILE); + Assert.assertNull(byFile, "Found type " + byFile + " associated with RANDOM, non-Tribble file " + RANDOM_FILE); + } + + @Test(dataProvider = "tests") + public void 
testGetters(FMTest params) { + params.assertExpected(manager.getByCodec(params.codec)); + params.assertExpected(manager.getByName(params.name)); + params.assertExpected(manager.getByName(params.name.toLowerCase())); + params.assertExpected(manager.getByName(params.name.toUpperCase())); + + Collection descriptors = manager.getByFeature(params.feature); + Assert.assertTrue(descriptors.size() > 0, "Look up by FeatureClass failed"); + } + + @Test + public void testUserFriendlyList() { + Assert.assertTrue(manager.userFriendlyListOfAvailableFeatures().length() > 0, "Expected at least one codec to be listed"); + Assert.assertTrue(manager.userFriendlyListOfAvailableFeatures().split(",").length > 0, "Expected at least two codecs, but only saw one"); + } + + @Test + public void testCodecCreation() { + FeatureManager.FeatureDescriptor descriptor = manager.getByName("vcf"); + Assert.assertNotNull(descriptor, "Couldn't find VCF feature descriptor!"); + + FeatureCodec c = manager.createCodec(descriptor, "foo", genomeLocParser); + Assert.assertNotNull(c, "Couldn't create codec"); + Assert.assertEquals(c.getClass(), descriptor.getCodecClass()); + Assert.assertEquals(c.getFeatureType(), descriptor.getFeatureClass()); + } + +} + diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java similarity index 96% rename from public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index e475e732d..ae218e898 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -1,5 +1,6 @@ /* - * Copyright (c) 2010. 
The Broad Institute + * Copyright (c) 2011, The Broad Institute + * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -11,7 +12,7 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT @@ -21,13 +22,14 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.refdata.tracks.builders; +package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -73,8 +75,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @Test public void testBuilder() { - Map classes = builder.getAvailableTrackNamesAndTypes(); - Assert.assertTrue(classes.size() > 0); + Assert.assertTrue(builder.getFeatureManager().getFeatureDescriptors().size() > 0); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java deleted file mode 100644 index cfd75c41a..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * 
Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.report; - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; - -public class GATKReportParserUnitTest extends BaseTest { - @Test - public void testParse() throws Exception { - GATKReportParser parser = new GATKReportParser(); - parser.parse(new File(validationDataLocation + "exampleGATKReport.eval")); - - Assert.assertEquals(parser.getValue("CountVariants", "none.eval.none.all", "nProcessedLoci"), "100000"); - Assert.assertEquals(parser.getValue("CountVariants", "none.eval.none.all", "nNoCalls"), "99872"); - - Assert.assertEquals(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2", "AC"), "2"); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2.bad", "AC")); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2", "AC.bad")); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics.bad", "none.eval.none.novel.ac2", "AC")); - - Assert.assertEquals(parser.getValue("ValidationReport", "none.eval.none.known", "sensitivity"), "NaN"); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java new file mode 100644 index 000000000..02e1ba99a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to 
the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class GATKReportUnitTest extends BaseTest { + @Test + public void testParse() throws Exception { + String reportPath = validationDataLocation + "exampleGATKReport.eval"; + GATKReport report = new GATKReport(reportPath); + + GATKReportTable countVariants = report.getTable("CountVariants"); + Assert.assertEquals(countVariants.getVersion(), GATKReportVersion.V0_1); + Object countVariantsPK = countVariants.getPrimaryKey("none.eval.none.all"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "100000"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "99872"); + + GATKReportTable validationReport = report.getTable("ValidationReport"); + Assert.assertEquals(validationReport.getVersion(), GATKReportVersion.V0_1); + Object validationReportPK = countVariants.getPrimaryKey("none.eval.none.known"); + Assert.assertEquals(validationReport.get(validationReportPK, "sensitivity"), "NaN"); + + GATKReportTable simpleMetricsByAC = report.getTable("SimpleMetricsByAC.metrics"); + Assert.assertEquals(simpleMetricsByAC.getVersion(), GATKReportVersion.V0_1); + Object simpleMetricsByACPK = 
simpleMetricsByAC.getPrimaryKey("none.eval.none.novel.ac2"); + Assert.assertEquals(simpleMetricsByAC.get(simpleMetricsByACPK, "AC"), "2"); + + Assert.assertFalse(simpleMetricsByAC.containsPrimaryKey("none.eval.none.novel.ac2.bad")); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index c0d32a05b..7f4d96add 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -127,6 +127,7 @@ public class TraverseReadsUnitTest extends BaseTest { Object accumulator = countReadWalker.reduceInit(); while (shardStrategy.hasNext()) { + traversalEngine.startTimersIfNecessary(); Shard shard = shardStrategy.next(); if (shard == null) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java new file mode 100644 index 000000000..b4a8498e1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/CNV/SymbolicAllelesIntegrationTest.java @@ -0,0 +1,39 @@ +package org.broadinstitute.sting.gatk.walkers.CNV; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SymbolicAllelesIntegrationTest extends WalkerTest { + + public static String baseTestString(String reference, String VCF) { + return "-T CombineVariants" + + " -R " + reference + + " --variant:vcf " + validationDataLocation + VCF + + " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + + " -genotypeMergeOptions REQUIRE_UNIQUE" + + " -setKey null" + + " -o %s" + + " -NO_HEADER"; + } + + + @Test + public void test1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(b36KGReference, "symbolic_alleles_1.vcf"), + 1, + 
Arrays.asList("89a1c56f264ac27a2a4be81072473b6f")); + executeTest("Test symbolic alleles", spec); + } + + @Test + public void test2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(b36KGReference, "symbolic_alleles_2.vcf"), + 1, + Arrays.asList("6645babc8c7d46be0da223477c7b1291")); + executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java index a129f8adf..1565c419b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java @@ -37,8 +37,8 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { "-R " + hg18Reference + " -T ClipReads " + "-I " + validationDataLocation + "clippingReadsTest.bam " + - "-o %s " + - "-ob %s " + args, + "-os %s " + + "-o %s " + args, 2, // just one output file Arrays.asList("tmp", "bam"), Arrays.asList(md51, md52)); @@ -72,9 +72,9 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { " -I " + validationDataLocation + "originalQuals.chr1.1-1K.bam" + " -L chr1:1-1,000" + " -OQ -QT 4 -CR WRITE_Q0S" + - " -o %s -ob %s", + " -o %s -os %s", 2, - Arrays.asList("55c01ccc2e84481b22d3632cdb06c8ba", "22db22749f811d30216215e047461621")); + Arrays.asList("22db22749f811d30216215e047461621", "55c01ccc2e84481b22d3632cdb06c8ba")); executeTest("clipOriginalQuals", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java new file mode 100644 index 000000000..462abeba1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 
2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.testng.Assert; +import org.testng.annotations.Test; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.SnpEffEffect; + +public class SnpEffUnitTest { + + @Test + public void testParseWellFormedEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); + } + + @Test + public void testParseInvalidEffectNameEffect() { + String effectName = "MADE_UP_EFFECT"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseInvalidEffectImpactEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseWrongNumberOfMetadataFieldsEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertFalse(effect.isWellFormed()); + } + + @Test + public void testParseSnpEffWarningEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + + SnpEffEffect effect 
= new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + } + + @Test + public void testParseSnpEffErrorEffect() { + String effectName = "NON_SYNONYMOUS_CODING"; + String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + + SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index e6300e6c9..08baae7a7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -14,7 +15,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, Arrays.asList("8a105fa5eebdfffe7326bc5b3d8ffd1c")); executeTest("test file has 
annotations, not asking for annotations, #1", spec); } @@ -22,7 +23,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsNotAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("964f1016ec9a3c55333f62dd834c14d6")); executeTest("test file has annotations, not asking for annotations, #2", spec); } @@ -30,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, Arrays.asList("8e7de435105499cd71ffc099e268a83e")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -38,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 
1:10,000,000-10,050,000", 1, Arrays.asList("64b6804cb1e27826e3a47089349be581")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -46,7 +47,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsNotAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, Arrays.asList("42ccee09fa9f8c58f4a0d4f1139c094f")); executeTest("test file doesn't have annotations, not asking for annotations, #1", spec); } @@ -54,7 +55,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsNotAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("f2ddfa8105c290b1f34b7a261a02a1ac")); executeTest("test file doesn't have annotations, not asking for annotations, #2", spec); } @@ -62,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -G \"Standard\" --variant:VCF3 " + 
validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, Arrays.asList("fd1ffb669800c2e07df1e2719aa38e49")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -70,7 +71,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -78,7 +79,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, + baseTestString() + " -G \"Standard\" --variant:VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); executeTest("test overwriting header", spec); } @@ -86,7 +87,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, + baseTestString() + " -G \"Standard\" 
--variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, Arrays.asList("16e3a1403fc376320d7c69492cad9345")); executeTest("not passing it any reads", spec); } @@ -94,7 +95,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -D " + GATKDataLocation + "dbsnp_129_b36.rod -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, + baseTestString() + " --dbsnp " + b36dbSNP129 + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); executeTest("getting DB tag with dbSNP", spec); } @@ -102,7 +103,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:compH3,VCF " + validationDataLocation + "fakeHM3.vcf -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, + baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); executeTest("getting DB tag with HM3", spec); } @@ -110,7 +111,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:foo,VCF " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" -B:variant,VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1, + baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1, 
Arrays.asList("e9c0d832dc6b4ed06c955060f830c140")); executeTest("using expression", spec); } @@ -120,9 +121,33 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { final String MD5 = "13269d5a2e16f06fd755cc0fb9271acf"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -A HomopolymerRun -B:variant,VCF " + validationDataLocation + "/" + file + " -BTI variant -NO_HEADER", 1, + baseTestString() + " -A HomopolymerRun --variant:VCF " + validationDataLocation + "/" + file + " -BTI variant -NO_HEADER", 1, Arrays.asList(MD5)); executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); } } + + @Test + public void testSnpEffAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000", + 1, + Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9") + ); + executeTest("Testing SnpEff annotations", spec); + } + + @Test + public void testSnpEffAnnotationsUnsupportedVersion() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + + "snpEff.AFR.unfiltered.unsupported.version.vcf -L 1:1-1,500,000", + 1, + UserException.class + ); + executeTest("Testing SnpEff annotations (unsupported version)", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java deleted file mode 100755 index c75a5b2dc..000000000 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java +++ /dev/null @@ -1,83 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - - -import java.util.Arrays; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -public class GenomicAnnotatorIntegrationTest extends WalkerTest { - String testFileWithIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.cleaned.indels.vcf"; - String testFileWithSNPsAndIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.variants.vcf"; - - @Test - public void testGenomicAnnotatorOnDbSNP() { - - /* - TODO put this test back in once it gets faster. - String[] md5 = {"d19d6d1eb52fb09e7493653dc645d92a"}; - WalkerTestSpec spec = new WalkerTestSpec( - "-T GenomicAnnotator -R " + b36KGReference + " " + - "-B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf " + - "-B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt " + - "-m " + //generate many records from one input record if necessary - "-o %s " + - "-BTI variant", - 1, - Arrays.asList(md5)); - executeTest("test with dbSNP", spec); - */ - - - String[] md5WithDashSArg = {"efba4ce1641cfa2ef88a64395f2ebce8"}; - WalkerTestSpec specWithSArg = new WalkerTestSpec( - "-T GenomicAnnotator -R " + b36KGReference + - " -B:variant,vcf3 /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" + - " -B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt" + - " -m" + //generate many records from one input record if necessary - " -o %s" + - " -BTI variant" + - " -s dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet" + - " -NO_HEADER", - 1, - Arrays.asList(md5WithDashSArg)); - executeTest("test with dbSNP and -s arg", 
specWithSArg); - - } - - @Test - public void testGenomicAnnotatorOnIndels() { - WalkerTestSpec testOnIndels = new WalkerTestSpec( - buildCommandLine( - "-T GenomicAnnotator", - "-R " + b37KGReference, - "-L 22:10000000-20000000", - "-B:refseq,AnnotatorInputTable " + b37Refseq, - "-B:variant,VCF " + testFileWithIndels, - "-NO_HEADER", - "-o %s" - ), - 1, - Arrays.asList("772fc3f43b70770ec6c6acbb8bbbd4c0") - ); - executeTest("testGenomicAnnotatorOnIndels", testOnIndels); - } - - @Test - public void testGenomicAnnotatorOnSNPsAndIndels() { - WalkerTestSpec testOnSNPsAndIndels = new WalkerTestSpec( - buildCommandLine( - "-T GenomicAnnotator", - "-R " + b37KGReference, - "-L 22:10000000-20000000", - "-B:refseq,AnnotatorInputTable " + b37Refseq, - "-B:variant,VCF " + testFileWithSNPsAndIndels, - "-NO_HEADER", - "-o %s" - ), - 1, - Arrays.asList("081ade7f3d2d3c5f19cb1e8651a626f3") - ); - executeTest("testGenomicAnnotatorOnSNPsAndIndels", testOnSNPsAndIndels); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index fef1b6e64..1a01ef8e8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -37,11 +37,11 @@ public class BeagleIntegrationTest extends WalkerTest { public void testBeagleOutput() { WalkerTestSpec spec = new WalkerTestSpec( "-T BeagleOutputToVCF -R " + hg19Reference + " " + - "-B:variant,VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " + - "-B:beagleR2,BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + - "-B:beagleProbs,BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + - "-B:beaglePhased,BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s -NO_HEADER", 1, Arrays.asList("3531451e84208264104040993889aaf4")); + 
"--variant:VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " + + "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + + "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847")); executeTest("test BeagleOutputToVCF", spec); } @@ -49,7 +49,7 @@ public class BeagleIntegrationTest extends WalkerTest { public void testBeagleInput() { WalkerTestSpec spec = new WalkerTestSpec( "-T ProduceBeagleInput -R " + hg19Reference + " " + - "-B:variant,VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " + + "--variant:VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " + "-o %s", 1, Arrays.asList("a01c704246f3dd1b9c65774007e51e69")); executeTest("test BeagleInput", spec); } @@ -57,8 +57,8 @@ public class BeagleIntegrationTest extends WalkerTest { @Test public void testBeagleInput2() { WalkerTestSpec spec = new WalkerTestSpec( - "-T ProduceBeagleInput -B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+ - "-B:validation,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+ + "-T ProduceBeagleInput --variant:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+ + "--validation:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+ "-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta -NO_HEADER ",2, Arrays.asList("660986891b30cdc937e0f2a3a5743faa","e96ddd51da9f4a797b2aa8c20e404166")); executeTest("test BeagleInputWithBootstrap",spec); @@ -68,11 +68,11 @@ public class BeagleIntegrationTest extends WalkerTest { public void testBeagleOutput2() { WalkerTestSpec spec = new WalkerTestSpec( "-T BeagleOutputToVCF -R "+hg19Reference+" "+ - "-B:variant,VCF 
/humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.vcf "+ - "-B:beagleR2,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ - "-B:beagleProbs,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ - "-B:beaglePhased,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("8dd6ec53994fb46c5c22af8535d22965")); + "--variant:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.vcf "+ + "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ + "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ + "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee")); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java index 4a32d6701..1ba7a5e85 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLociWalkerIntegrationTest.java @@ -30,7 +30,7 @@ import org.testng.annotations.Test; import java.util.Arrays; public class CompareCallableLociWalkerIntegrationTest extends WalkerTest { - final static String commonArgs = "-R " + hg18Reference + " -T CompareCallableLoci -B:comp1,Bed " + validationDataLocation + "1kg_slx.chr1_10mb.callable.bed -B:comp2,Bed " + validationDataLocation + "ga2_slx.chr1_10mb.callable.bed -o %s"; + final static String commonArgs = "-R " + hg18Reference + " -T CompareCallableLoci 
--comp1:Bed " + validationDataLocation + "1kg_slx.chr1_10mb.callable.bed --comp2:Bed " + validationDataLocation + "ga2_slx.chr1_10mb.callable.bed -o %s"; @Test public void testCompareCallableLociWalker1() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index 77159d9c2..1f11b5886 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -30,8 +30,6 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; -import java.util.Collections; -import java.util.List; public class DiffObjectsIntegrationTest extends WalkerTest { private class TestParams extends TestDataProvider { @@ -52,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "4d9f4636de05b93c354d05011264546e"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "37e6efd833b5cd6d860a9df3df9713fc"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dc1ca75c6ecf32641967d61e167acfff"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "df0fcb568a3a49fc74830103b2e26f6c"); return TestParams.getTests(TestParams.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java index 0c034eba9..9af39e92c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceIntegrationTest.java @@ -24,15 +24,15 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest { executeTest("testFastaReference", spec1b); WalkerTestSpec spec2 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -B:indels,VCF " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -B:snpmask,dbsnp " + GATKDataLocation + "dbsnp_129_b36.rod -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380;1:10,093,447-10,093,847;1:10,271,252-10,271,452 -o %s", 1, - Arrays.asList("3a48986c3832a768b478c3e95f994b0f")); + Arrays.asList("0567b32ebdc26604ddf2a390de4579ac")); executeTest("testFastaAlternateReferenceIndels", spec2); WalkerTestSpec spec3 = new WalkerTestSpec( - "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -B:snps,GeliText " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.geli.calls -B:snpmask,dbsnp " + GATKDataLocation + "dbsnp_129_b36.rod -L 1:10,023,400-10,023,500;1:10,029,200-10,029,500 -o %s", + "-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500;1:10,029,200-10,029,500 -o %s", 1, - Arrays.asList("82705a88f6fc25880dd2331183531d9a")); + Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641")); executeTest("testFastaAlternateReferenceSnps", spec3); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 7bec67d2e..1cb43ceb1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -15,7 +15,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testNoAction() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("8a105fa5eebdfffe7326bc5b3d8ffd1c")); executeTest("test no action", spec); } @@ -23,7 +23,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testClusteredSnps() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -window 10 -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -window 10 --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("27b13f179bb4920615dff3a32730d845")); executeTest("test clustered SNPs", spec); } @@ -31,17 +31,17 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testMasks() { WalkerTestSpec spec1 = new WalkerTestSpec( - baseTestString() + " -mask foo -B:mask,VCF3 " + validationDataLocation + "vcfexample2.vcf -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -maskName foo --mask:VCF3 " + validationDataLocation + "vcfexample2.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("578f9e774784c25871678e6464fd212b")); executeTest("test mask all", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( - baseTestString() + " -mask foo -B:mask,VCF " + validationDataLocation + "vcfMask.vcf -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -maskName foo 
--mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("bfa86a674aefca1b13d341cb14ab3c4f")); executeTest("test mask some", spec2); WalkerTestSpec spec3 = new WalkerTestSpec( - baseTestString() + " -mask foo -maskExtend 10 -B:mask,VCF " + validationDataLocation + "vcfMask.vcf -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + validationDataLocation + "vcfMask.vcf --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("5939f80d14b32d88587373532d7b90e5")); executeTest("test mask extend", spec3); } @@ -49,7 +49,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testFilter1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("45219dbcfb6f81bba2ea0c35f5bfd368")); executeTest("test filter #1", spec); } @@ -57,7 +57,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testFilter2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("c95845e817da7352b9b72bc9794f18fb")); executeTest("test filter #2", spec); } @@ 
-65,7 +65,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testFilterWithSeparateNames() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("b8cdd7f44ff1a395e0a9b06a87e1e530")); executeTest("test filter with separate names #2", spec); } @@ -73,12 +73,12 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testGenotypeFilters() { WalkerTestSpec spec1 = new WalkerTestSpec( - baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("96b61e4543a73fe725e433f007260039")); executeTest("test genotype filter #1", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( - baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo -B:variant,VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1, Arrays.asList("6c8112ab17ce39c8022c891ae73bf38e")); executeTest("test genotype filter #2", spec2); } @@ -86,7 +86,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { @Test public void testDeletions() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --filterExpression 'QUAL < 
100' --filterName foo -B:variant,VCF " + validationDataLocation + "twoDeletions.vcf", 1, + baseTestString() + " --filterExpression 'QUAL < 100' --filterName foo --variant:VCF " + validationDataLocation + "twoDeletions.vcf", 1, Arrays.asList("569546fd798afa0e65c5b61b440d07ac")); executeTest("test deletions", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 1f23d262e..41496bdf1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -16,8 +16,9 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH"; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL"; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- // @@ -28,38 +29,20 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - 
Arrays.asList("c97829259463d04b0159591bb6fb44af")); + Arrays.asList("e6639ea2dc81635c706e6c35921406d7")); executeTest("test MultiSample Pilot1", spec); } - // @Test - // todo - currently not working because when calling indels, using GENOTYPE_GIVEN_ALLELES yields a different result than in normal mode. To be fixed when extended events are removed. - public void testMultiSamplePilot2AndRecallingWithAlleles() { - String md5 = "b45636b29891f9df573ad2af6f507ee0"; - - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList(md5)); - List result = executeTest("test MultiSample Pilot2", spec1).getFirst(); - - GenomeAnalysisEngine.resetRandomGenerator(); - - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList(md5)); - executeTest("test MultiSample Pilot2 with alleles passed in", spec2); - } - @Test public void testWithAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("2b69667f4770e8c0c894066b7f27e440")); + baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("8de2602679ffc92388da0b6cb4325ef6")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( 
- baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("b77fe007c2a97fcd59dfd5eef94d8b95")); + baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, + Arrays.asList("ec43daadfb15b00b41aeb0017a45df0b")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -67,7 +50,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ee8a5e63ddd470726a749e69c0c20f60")); + Arrays.asList("d1cbd1fb9f3f7323941a95bc2def7e5a")); executeTest("test SingleSample Pilot2", spec); } @@ -77,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "ef31654a2b85b9b2d3bba4f4a75a17b6"; + private final static String COMPRESSED_OUTPUT_MD5 = "2732b169cdccb21eb3ea00429619de79"; @Test public void testCompressedOutput() { @@ -87,15 +70,6 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test compressed output", spec); } - // todo -- fixme -// @Test -// public void testCompressedOutputParallel() { -// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( -// baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 
1:10,000,000-10,100,000 -nt 4", 1, -// Arrays.asList("gz"), Arrays.asList(COMPRESSED_OUTPUT_MD5)); -// executeTest("testCompressedOutput-nt4", spec); -// } - // -------------------------------------------------------------------------------------------------------------- // // testing parallelization @@ -107,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "46868a9c4134651c54535fb46b408aee"; + String md5 = "cbac3960bbcb9d6192c57549208c182c"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -138,9 +112,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "5043c9a101e691602eb7a3f9704bdf20" ); - e.put( "--min_mapping_quality_score 26", "71a833eb8fd93ee62ae0d5a430f27940" ); - e.put( "--p_nonref_model GRID_SEARCH", "ddf443e9dcadef367476b26b4d52c134" ); + e.put( "--min_base_quality_score 26", "531966aee1cd5dced61c96c4fedb59a9" ); + e.put( "--min_mapping_quality_score 26", "c71ca370947739cb7d87b59452be7a07" ); + e.put( "--computeSLOD", "1a5648f26c18ced27df4be031b44e72d" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -153,9 +127,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "eaad6ceb71ab94290650a70bea5ab951" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "05bf7db8a3d19ef4a3d14772c90b732f" ); - e.put( "--output_mode EMIT_ALL_SITES", "e4b86740468d7369f0156550855586c7" ); + e.put( "-sites_only", "d40114aa201aa33ff5f174f15b6b73af" ); + e.put( "--output_mode 
EMIT_ALL_CONFIDENT_SITES", "3c681b053fd2280f3c42041d24243752" ); + e.put( "--output_mode EMIT_ALL_SITES", "eafa6d71c5ecd64dfee5d7a3f60e392e" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -169,12 +143,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("71a833eb8fd93ee62ae0d5a430f27940")); + Arrays.asList("c71ca370947739cb7d87b59452be7a07")); executeTest("test confidence 1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("79968844dc3ddecb97748c1acf2984c7")); + Arrays.asList("1c0a599d475cc7d5e745df6e9b6c0d29")); executeTest("test confidence 2", spec2); } @@ -186,8 +160,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "4e878664f61d2d800146d3762303fde1" ); - e.put( 1.0 / 1850, "9204caec095ff5e63ca21a10b6fab453" ); + e.put( 0.01, "aed69402ddffe7f2ed5ca98563bfba02" ); + e.put( 1.0 / 1850, "fa94a059f08c1821b721335d93ed2ea5" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +185,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); + Arrays.asList("1c080e6596d4c830bb5d147b04e2a82c")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,25 +204,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + 
" -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("62d0f6d9de344ce68ce121c13b1e78b1")); + Arrays.asList("9129ad748ca3be2d3b321d2d7e83ae5b")); executeTest(String.format("test calling with BAQ"), spec); } - @Test - public void testCallingWithBAQOff() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000" + - " -baq OFF", - 1, - Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); - - executeTest(String.format("test calling with BAQ OFF"), spec); - } - // -------------------------------------------------------------------------------------------------------------- // // testing indel caller @@ -263,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("631ae1f1eb6bc4c1a4136b8495250536")); + Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +238,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("fd556585c79e2b892a5976668f45aa43")); + Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -291,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("9cd56feedd2787919e571383889fde70")); + Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -299,18 +259,32 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testWithIndelAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " 
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("315e1b78d7a403d7fcbcf0caa8c496b8")); + Arrays.asList("408d3aba4d094c067fc00a43992c2292")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("cf89e0c54f14482a23c105b73a333d8a")); + Arrays.asList("94977d6e42e764280e9deaf4e3ac8c80")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); + + WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, + Arrays.asList("e66b7321e2ac91742ad3ef91040daafd")); + executeTest("test MultiSample Pilot2 indels with complicated records", spec3); + + WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, + 
Arrays.asList("4be308fd9e8167ebee677f62a7a753b7")); + executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java index 866c27f8d..3ff453dab 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java @@ -15,7 +15,7 @@ public class UnifiedGenotyperPerformanceTest extends WalkerTest { " -glm BOTH" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-50,000,000" + - " -D " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " --dbsnp:VCF " + hg18dbSNP132 + " -o /dev/null", 0, new ArrayList(0)); @@ -30,7 +30,7 @@ public class UnifiedGenotyperPerformanceTest extends WalkerTest { " -glm BOTH" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + - " -D " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " --dbsnp:vcf " + hg18dbSNP132 + " -o /dev/null", 0, new ArrayList(0)); @@ -46,7 +46,7 @@ public class UnifiedGenotyperPerformanceTest extends WalkerTest { " -glm BOTH" + " -L chr1:1-50,000,000" + " -nt 10" + - " -D " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " --dbsnp:vcf " + hg18dbSNP132 + " -o /dev/null", 0, new ArrayList(0)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index 2676f7067..0ff6fc244 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.indels; import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -28,47 +27,28 @@ public class IndelRealignerIntegrationTest extends WalkerTest { executeTest("test realigner defaults", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "-B:indels,vcf " + knownIndels, + baseCommand + "-known " + knownIndels, 1, Arrays.asList(base_md5_with_SW_or_VCF)); executeTest("test realigner defaults with VCF", spec2); - - WalkerTestSpec spec3 = new WalkerTestSpec( - baseCommand + "-D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList(base_md5)); - executeTest("realigner defaults with dbsnp", spec3); - } @Test public void testKnownsOnly() { WalkerTestSpec spec1 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -B:indels,vcf " + knownIndels, + baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -known " + knownIndels, 1, Arrays.asList("3dd5d2c9931b375455af0bff1a2c4888")); executeTest("realigner known indels only from VCF", spec1); - - WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList("05a114623c126b0398fbc1703437461e")); - executeTest("realigner known indels only from dbsnp", spec2); } @Test public void testUseSW() { WalkerTestSpec spec1 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel USE_SW -B:indels,vcf " + knownIndels, + baseCommand + "--consensusDeterminationModel USE_SW -known " + knownIndels, 1, Arrays.asList(base_md5_with_SW_or_VCF)); executeTest("realigner use SW from VCF", spec1); - - WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel USE_SW -D " + 
GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList(base_md5_with_SW_or_VCF)); - executeTest("realigner use SW from dbsnp", spec2); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java index fd5ad0b22..77675b0f4 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java @@ -30,7 +30,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -LOD 5" + " -maxConsensuses 100" + " -greedy 100" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -known " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-5,650,000" + @@ -45,7 +45,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -LOD 5" + " -maxConsensuses 100" + " -greedy 100" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -known " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L chr1:1-150,000,000" + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index 4b225aaea..1873ccbe2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -11,19 +11,19 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { public void testIntervals() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - "-T 
RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + "-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam --mismatchFraction 0.15 -L 1:10,000,000-10,050,000 -o %s", 1, Arrays.asList("e7accfa58415d6da80383953b1a3a986")); executeTest("test standard", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - "-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + "-T RealignerTargetCreator --known " + b36dbSNP129 + " -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("f23ba17ee0f9573dd307708175d90cd2")); + Arrays.asList("0367d39a122c8ac0899fb868a82ef728")); executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - "-T RealignerTargetCreator -R " + b36KGReference + " -B:indels,VCF " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -BTI indels -o %s", + "-T RealignerTargetCreator -R " + b36KGReference + " --known " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -BTI known -o %s", 1, Arrays.asList("5206cee6c01b299417bf2feeb8b3dc96")); executeTest("test rods only", spec3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java index 0b6694fd9..cc37cc191 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java 
@@ -12,7 +12,7 @@ public class RealignerTargetCreatorPerformanceTest extends WalkerTest { WalkerTestSpec spec1 = new WalkerTestSpec( "-R " + hg18Reference + " -T RealignerTargetCreator" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " --known " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-50,000,000" + " -o /dev/null", @@ -23,7 +23,7 @@ public class RealignerTargetCreatorPerformanceTest extends WalkerTest { WalkerTestSpec spec2 = new WalkerTestSpec( "-R " + hg18Reference + " -T RealignerTargetCreator" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " --known " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + " -o /dev/null", diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java index 21435dd7d..cf6b4e581 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypesIntegrationTest.java @@ -16,8 +16,8 @@ public class MergeAndMatchHaplotypesIntegrationTest extends WalkerTest { buildCommandLine( "-T MergeAndMatchHaplotypes", "-R " + b37KGReference, - "-B:pbt,VCF " + fundamentalTestPBTVCF, - "-B:rbp,VCF " + fundamentalTestRBPVCF, + "--pbt " + fundamentalTestPBTVCF, + "--rbp " + fundamentalTestRBPVCF, "-o %s" ), 1, diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java index c88eac149..2e4556af0 100644 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeMNPsIntegrationTest.java @@ -10,7 +10,7 @@ public class MergeMNPsIntegrationTest extends WalkerTest { public static String baseTestString(String reference, String VCF, int maxDistMNP) { return "-T MergeMNPs" + " -R " + reference + - " -B:variant,VCF " + validationDataLocation + VCF + + " --variant:vcf " + validationDataLocation + VCF + " --maxGenomicDistanceForMNP " + maxDistMNP + " -o %s" + " -NO_HEADER"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java index f855c1dd3..db1e4a82f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesIntegrationTest.java @@ -10,7 +10,7 @@ public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest public static String baseTestString(String reference, String VCF, int maxDist) { return "-T MergeSegregatingAlternateAlleles" + " -R " + reference + - " -B:variant,VCF " + validationDataLocation + VCF + + " --variant:vcf " + validationDataLocation + VCF + " --maxGenomicDistance " + maxDist + " -o %s" + " -NO_HEADER"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index 9f59adeb6..c663c1dd7 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ 
-10,35 +10,19 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; @Test - public void testBasicFunctionalityWithoutFilters() { + public void testBasicFunctionality() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", + "-NO_HEADER", "-R " + b37KGReference, - "-B:variant,VCF " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", - "-nofilters", - "-o %s" - ), - 1, - Arrays.asList("416a483e87358cdcb0b09a496e3254c0") - ); - executeTest("testBasicFunctionalityWithoutFilters", spec); - } - - @Test - public void testBasicFunctionalityWithFilters() { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T PhaseByTransmission", - "-R " + b37KGReference, - "-B:variant,VCF " + fundamentalTestVCF, + "--variant " + fundamentalTestVCF, "-f NA12892+NA12891=NA12878", "-o %s" ), 1, - Arrays.asList("8c5db343567e90e97993912c7e541d0d") + Arrays.asList("") ); - executeTest("testBasicFunctionalityWithFilters", spec); + executeTest("testBasicFunctionality", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index 1bf3e579f..e1d22f107 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -11,7 +11,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { return "-T ReadBackedPhasing" + " -R " + reference + " -I " + validationDataLocation + reads + - " -B:variant,VCF " + validationDataLocation + VCF + + " --variant " + validationDataLocation + VCF + " --cacheWindowSize " + cacheWindowSize + " --maxPhaseSites " + maxPhaseSites + " 
--phaseQualityThresh " + phaseQualityThresh + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java index fc4e5ac66..4be848164 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/DictionaryConsistencyIntegrationTest.java @@ -56,7 +56,7 @@ public class DictionaryConsistencyIntegrationTest extends WalkerTest { } private WalkerTest.WalkerTestSpec testVCF(String ref, String vcf, Class c) { - return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf " + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + vcf + " -F POS,CHROM -R " + ref + " -o %s", 1, c); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java index c5cdf9f02..ad190fae6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java @@ -18,7 +18,7 @@ public class ValidatingPileupIntegrationTest extends WalkerTest { "-T ValidatingPileup" + " -I " + validationDataLocation + "MV1994.selected.bam" + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + - " -B:pileup,SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + " -S SILENT -nt 8",0, Collections.emptyList()); executeTest("testEcoliThreaded",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index 129161da3..b2833b935 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.HashMap; @@ -12,78 +13,115 @@ import java.io.File; public class RecalibrationWalkersIntegrationTest extends WalkerTest { static HashMap paramsFiles = new HashMap(); - static HashMap paramsFilesNoReadGroupTest = new HashMap(); static HashMap paramsFilesSolidIndels = new HashMap(); - @Test - public void testCountCovariates1() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7b5832d4b2a23b8ef2bb639eb59bfa88" ); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "9c006f8e9fb5752b1c139f5a8cc7ea88"); - e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "e6f7b4ab9aa291022e0ba8b7dbe4c77e" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e6b98af01c5a08e4954b79ec42db6fc3" ); + private static final class CCTest extends TestDataProvider { + String file, md5; - for ( String parallelism : Arrays.asList("", " -nt 4")) { - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); + private CCTest(final String file, final String md5) { + super(CCTest.class); + this.file = file; + this.md5 = md5; + } - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" 
+ - " -T CountCovariates" + - " -I " + bam + - ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) - ? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) + - " -cov ReadGroupCovariate" + - " -cov QualityScoreCovariate" + - " -cov CycleCovariate" + - " -cov DinucCovariate" + - " --solid_recal_mode SET_Q_ZERO" + - " -recalFile %s" + parallelism, - 1, // just one output file - Arrays.asList(md5)); - List result = executeTest("testCountCovariates1" + parallelism, spec).getFirst(); - paramsFiles.put(bam, result.get(0).getAbsolutePath()); - } + public String toString() { + return "CCTest: " + file; } } - - @Test - public void testTableRecalibrator1() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0278cce4cfdab869dc0c11d6852a984b" ); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "6797d7ffa4ef6c48413719ba32696ccf"); - e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "2bb3374dde131791d7638031ae3b3e10" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "1f9d8944b73169b367cb83b0d22e5432" ); - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String md5 = entry.getValue(); - String paramsFile = paramsFiles.get(bam); - System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile); - if ( paramsFile != null ) { - WalkerTestSpec spec = new WalkerTestSpec( - "-R " + b36KGReference + - " -T TableRecalibration" + - " -I " + bam + - ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) + @DataProvider(name = "cctestdata") + public Object[][] createCCTestData() { + new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "5a52b00d9794d27af723bcf93366681e" ); + new CCTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941"); + new CCTest( validationDataLocation + 
"NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" ); + new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "64e9f17a1cf6fc04c1f2717c2d2eca67" ); + return CCTest.getTests(CCTest.class); + } + + @Test(dataProvider = "cctestdata") + public void testCountCovariates1(CCTest test) { + testCC(test, ""); + } + + @Test(dataProvider = "cctestdata") + public void testCountCovariates4(CCTest test) { + testCC(test, " -nt 4"); + } + + private final void testCC(CCTest test, String parallelism) { + String bam = test.file; + String md5 = test.md5; + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " -knownSites " + b36dbSNP129 + + " -T CountCovariates" + + " -I " + bam + + ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) + ? " -L 1:10,800,000-10,810,000" : " -L 1:10,000,000-10,200,000" ) + + " -cov ReadGroupCovariate" + + " -cov QualityScoreCovariate" + + " -cov CycleCovariate" + + " -cov DinucCovariate" + + " --solid_recal_mode SET_Q_ZERO" + + " -recalFile %s" + parallelism, + 1, // just one output file + Arrays.asList(md5)); + List result = executeTest("testCountCovariates1" + parallelism, spec).getFirst(); + paramsFiles.put(bam, result.get(0).getAbsolutePath()); + } + + + private static final class TRTest extends TestDataProvider { + String file, md5; + + private TRTest(final String file, final String md5) { + super(TRTest.class); + this.file = file; + this.md5 = md5; + } + + public String toString() { + return "TRTest: " + file; + } + } + + @DataProvider(name = "trtestdata") + public Object[][] createTRTestData() { + new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + new TRTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "c164dd635721ba6df3f06dac1877c32d"); + new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", 
"74314e5562c1a65547bb0edaacffe602" ); + new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "2a37c6001826bfabf87063b1dfcf594f" ); + return TRTest.getTests(TRTest.class); + } + + @Test(dataProvider = "trtestdata", dependsOnMethods = "testCountCovariates1") + public void testTableRecalibrator1(TRTest test) { + String bam = test.file; + String md5 = test.md5; + String paramsFile = paramsFiles.get(bam); + System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile); + if ( paramsFile != null ) { + WalkerTestSpec spec = new WalkerTestSpec( + "-R " + b36KGReference + + " -T TableRecalibration" + + " -I " + bam + + ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) ? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) + - " -o %s" + - " --no_pg_tag" + - " --solid_recal_mode SET_Q_ZERO" + - " -recalFile " + paramsFile, - 1, // just one output file - Arrays.asList(md5)); - executeTest("testTableRecalibrator1", spec); - } + " -o %s" + + " --no_pg_tag" + + " --solid_recal_mode SET_Q_ZERO" + + " -recalFile " + paramsFile, + 1, // just one output file + Arrays.asList(md5)); + executeTest("testTableRecalibrator1", spec); } } @Test public void testCountCovariatesUseOriginalQuals() { HashMap e = new HashMap(); - e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "3404965ec4fa99873fe6a44521944fd5"); + e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "278846c55d97bd9812b758468a83f559"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -97,7 +135,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -standard" + " -OQ" + " -recalFile %s" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod", + " -knownSites " + b36dbSNP129, 1, // just one output file Arrays.asList(md5)); executeTest("testCountCovariatesUseOriginalQuals", spec); @@ -107,7 +145,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test 
public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0278cce4cfdab869dc0c11d6852a984b" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -136,7 +174,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "c9ea5f995e1e2b7a5688533e678dcedc" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "8379f24cf5312587a1f92c162ecc220f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -144,7 +182,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -knownSites " + b36dbSNP129 + " -T CountCovariates" + " -I " + bam + " -standard" + @@ -162,7 +200,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "993fae4270e7e1e15986f270acf247af" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "7d5edb75b176e4151de225f699719ee4" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -187,30 +225,6 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } - @Test - public void testCountCovariatesVCF() { - HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "170f0c3cc4b8d72c539136effeec9a16"); - - for ( Map.Entry entry : e.entrySet() ) { - String bam = entry.getKey(); - String 
md5 = entry.getValue(); - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + b36KGReference + - " -B:dbsnp,VCF3 " + validationDataLocation + "vcfexample3.vcf" + - " -T CountCovariates" + - " -I " + bam + - " -L 1:10,000,000-10,200,000" + - " -standard" + - " --solid_recal_mode SET_Q_ZERO" + - " -recalFile %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testCountCovariatesVCF", spec); - } - } - @Test public void testCountCovariatesBED() { HashMap e = new HashMap(); @@ -222,7 +236,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " -B:bed,bed " + validationDataLocation + "recalibrationTest.bed" + + " -knownSites:bed " + validationDataLocation + "recalibrationTest.bed" + " -T CountCovariates" + " -I " + bam + " -L 1:10,000,000-10,200,000" + @@ -238,7 +252,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCFPlusDBsnp() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d892bd60d8f679affda3c1e3af96c1"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9131d96f39badbf9753653f55b148012"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -246,10 +260,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " -B:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" + + " -knownSites:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" + " -T CountCovariates" + " -I " + bam + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -knownSites " + b36dbSNP129 + " -L 1:10,000,000-10,200,000" + " -cov ReadGroupCovariate" + " -cov QualityScoreCovariate" + @@ -266,7 +280,7 @@ public class 
RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "284ccac1f8fe485e52c86333cac7c2d4" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "8993d32df5cb66c7149f59eccbd57f4c" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -274,7 +288,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -knownSites " + b36dbSNP129 + " -T CountCovariates" + " -I " + bam + " -cov ReadGroupCovariate" + @@ -292,7 +306,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "c167799c2d9cab815d7c9b23337f162e" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "5f913c98ca99754902e9d34f99df468f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java index ade34c964..bccb95795 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java @@ -16,7 +16,7 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L chr1:1-50,000,000" + " -standard" + " -OQ" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " -knownSites 
" + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); @@ -31,7 +31,7 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + " -standard" + " -OQ" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " -knownSites " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java index 95f4ac0ae..0a0d8c5b2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java @@ -19,8 +19,8 @@ public class ValidationAmpliconsIntegrationTest extends WalkerTest { String siteVCF = validationDataLocation + "sites_to_validate.vcf"; String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; - String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s"; - testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF; + String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons --ValidateAlleles:VCF "+siteVCF+" -o %s"; + testArgs += " --ProbeIntervals:table "+intervalTable+" -BTI ProbeIntervals --MaskAlleles:VCF "+maskVCF; testArgs += " --virtualPrimerSize 30"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, Arrays.asList("27f9450afa132888a8994167f0035fd7")); @@ -32,8 +32,8 @@ public class ValidationAmpliconsIntegrationTest extends WalkerTest { String siteVCF = validationDataLocation + 
"sites_to_validate.vcf"; String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; - String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s"; - testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF; + String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons --ValidateAlleles:VCF "+siteVCF+" -o %s"; + testArgs += " --ProbeIntervals:table "+intervalTable+" -BTI ProbeIntervals --MaskAlleles:VCF "+maskVCF; testArgs += " --virtualPrimerSize 30 --doNotUseBWA"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, Arrays.asList("f2611ff1d9cd5bedaad003251fed8bc1")); @@ -45,8 +45,8 @@ public class ValidationAmpliconsIntegrationTest extends WalkerTest { String siteVCF = validationDataLocation + "sites_to_validate.vcf"; String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; - String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s"; - testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF; + String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons --ValidateAlleles:VCF "+siteVCF+" -o %s"; + testArgs += " --ProbeIntervals:table "+intervalTable+" -BTI ProbeIntervals --MaskAlleles:VCF "+maskVCF; testArgs += " --virtualPrimerSize 30 --filterMonomorphic"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, Arrays.asList("77b3f30e38fedad812125bdf6cf3255f")); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 23c606ad0..b90e6d0ff 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -4,11 +4,9 @@ import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval"; + private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf"; @@ -16,19 +14,46 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; - private static String root = cmdRoot + - " -D " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:eval,VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + - " -B:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; + @Test + public void testFunctionClassWithSnpeff() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-noST", + "-ST FunctionalClass", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12") + ); + executeTest("testFunctionClassWithSnpeff", spec); + } - private 
static String rootGZ = cmdRoot + - " -D " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:eval,VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf.gz" + - " -B:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf.gz"; - - // TODO -- I can't seem to reindex this VCF using Tabix without it causing failures. Looking into it. [EB] - // private static String[] testsEnumerations = {root, rootGZ}; - private static String[] testsEnumerations = {root}; + @Test + public void testStratifySamplesAndExcludeMonomorphicSites() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-ST Sample", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("6a71b17c19f5914c277a99f45f5d9c39") + ); + executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); + } @Test public void testFundamentalsCountVariantsSNPsAndIndels() { @@ -36,8 +61,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -45,7 +70,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("48b8417c1f8bd74ff7b9808580abd2a2") + Arrays.asList("1fefd6cf9c2554d5f886c3998defd4d0") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -56,8 +81,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -66,7 +91,7 @@ public class 
VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("86d45ecefdf5849c55b3ca8f82a3d525") + Arrays.asList("d470e00a368b5a0468012818994c6a89") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -77,8 +102,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -88,7 +113,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("3d18901ec1766aa2e748eac913f5ddcd") + Arrays.asList("12856e52c2682328f91594089328596c") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -99,8 +124,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -109,7 +134,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("677fe398643e62a10d6739d36a720a12") + Arrays.asList("91610b9240f64e0eb03cfd2602cf57af") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -120,8 +145,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -130,7 +155,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("5fb44fd7cb00941c986a9941e43e44cd") + Arrays.asList("e40b77e7ed6581328e373a24b93cd170") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", 
spec); } @@ -141,8 +166,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -151,7 +176,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("daaca7ef3b7313e5af217cbc6f37c9e2") + Arrays.asList("15beaf3823c131cabc5fb0445239f978") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -162,8 +187,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -172,7 +197,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("97c466f8ffd0fcf2c30ef08669d213d9") + Arrays.asList("7ddd4ee74938d229ce5cb7b9b9104abe") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -183,8 +208,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -195,7 +220,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("df8cdfcf3d0c2fc795812c6eae6a76f8") + Arrays.asList("a90f33906a732ef5eb346e559c96ccc1") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -206,8 +231,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestVCF, + "--dbsnp " + 
b37dbSNP132, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -220,7 +245,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c7aed12265e2b2311d17a0cc8a29f6aa") + Arrays.asList("2567f90d3d7354850c5a59730ecc6e4f") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -231,7 +256,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-B:eval,VCF " + fundamentalTestVCF, + "--eval " + fundamentalTestVCF, "-noEV", "-EV CountVariants", "-noST", @@ -239,7 +264,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("d44c8f44384189a09eea85a8e89d7299") + Arrays.asList("fa091aa8967893389c51102fd9f0bebb") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -247,27 +272,29 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testSelect1() { String extraArgs = "-L 1:1-10,000,000"; - for (String tests : testsEnumerations) { - WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("cdbe47ea01b9dd79ff1c5ce6f5fa8bec")); - executeTestParallel("testSelect1", spec); - } + String tests = cmdRoot + + " --dbsnp " + b36dbSNP129 + + " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; + WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", + 1, Arrays.asList("f70997b6a3e7fdc89d11e1d61a2463d4")); + executeTestParallel("testSelect1", spec); } @Test public void testVEGenotypeConcordance() { String vcfFile = "GenotypeConcordanceEval.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -ST CpG -B:eval,VCF3 " + validationDataLocation + 
vcfFile + " -B:comp,VCF3 " + validationDataLocation + "GenotypeConcordanceComp.vcf -noEV -EV GenotypeConcordance -o %s", + WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -ST CpG --eval:VCF3 " + validationDataLocation + vcfFile + " --comp:VCF3 " + validationDataLocation + "GenotypeConcordanceComp.vcf -noEV -EV GenotypeConcordance -o %s", 1, - Arrays.asList("e4c981f7f5d78680c71310fc9be9a1c1")); + Arrays.asList("96f27163f16bb945f19c6623cd6db34e")); executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec); } @Test public void testCompVsEvalAC() { - String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance -B:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf -B:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("162daa5039e1965eb2423a8589339a69")); + String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("407682de41dcf139ea635e9cda21b912")); executeTestParallel("testCompVsEvalAC",spec); } @@ -275,45 +302,41 @@ public class VariantEvalIntegrationTest extends WalkerTest { return String.format("%s -select '%s' -selectName %s", cmd, select, name); } - @Test + @Test(enabled = false) // no longer supported in the GATK public void testTranches() { - String extraArgs = "-T VariantEval -R "+ hg18Reference +" -B:eval,vcf " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -EV TiTvVariantEvaluator -L chr1 -noEV -ST CpG -tf " + testDir + "tranches.6.txt"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("90cd98044e754b80034a9f4e6d2c55b9")); + String 
extraArgs = "-T VariantEval -R "+ hg18Reference +" --eval " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -EV TiTvVariantEvaluator -L chr1 -noEV -ST CpG -tf " + testDir + "tranches.6.txt"; + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("6af2b9959aa1778a5b712536de453952")); executeTestParallel("testTranches",spec); } @Test public void testCompOverlap() { - String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals -B:comphapmap,vcf " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf -B:eval,vcf " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("70aa420929de7f888a6f48c2d01bbcda")); + String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("009ecc8376a20dce81ff5299ef6bfecb")); executeTestParallel("testCompOverlap",spec); } @Test public void testEvalTrackWithoutGenotypes() { - String dbsnp = GATKDataLocation + "dbsnp_129_b37.rod"; - String extraArgs = "-T VariantEval -R " + b37KGReference + " -L 20" + - " -D " + dbsnp + - " -B:evalBI,VCF " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + + " --dbsnp " + b37dbSNP132 + + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5b1fc9a4066aca61f1b5f7b933ad37d9")); + WalkerTestSpec spec = 
new WalkerTestSpec(extraArgs,1,Arrays.asList("424c9d438b1faa59b2c29413ba32f37b")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @Test public void testMultipleEvalTracksWithoutGenotypes() { - String dbsnp = GATKDataLocation + "dbsnp_129_b37.rod"; - String extraArgs = "-T VariantEval -R " + b37KGReference + " -L 20" + - " -D " + dbsnp + - " -B:evalBI,VCF " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + - " -B:evalBC,VCF " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + + " --dbsnp " + b37dbSNP132 + + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("6d902d9d4d8fef5219a43e416a51cee6")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("18fa0b89ebfff51141975d7e4ce7a159")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -323,27 +346,27 @@ public class VariantEvalIntegrationTest extends WalkerTest { String extraArgs = "-T VariantEval" + " -R " + b37KGReference + - " -B:comp,VCF " + validationDataLocation + "/VariantEval/ALL.phase1.chr20.broad.snps.genotypes.subset.vcf" + - " -B:eval,VCF " + validationDataLocation + "/VariantEval/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.subset.vcf" + - " -B:dbsnp,VCF " + dbsnp + + " --comp " + validationDataLocation + "/VariantEval/ALL.phase1.chr20.broad.snps.genotypes.subset.vcf" + + " --eval " + validationDataLocation + "/VariantEval/NA12878.hg19.HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.subset.vcf" + + " --dbsnp " + dbsnp + " -L 20:10000000-10100000" + " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("55a1c53bced20701c56accfc3eb782a7")); + WalkerTestSpec spec = new 
WalkerTestSpec(extraArgs,1,Arrays.asList("0b81d97f843ec4a1a4222d1f9949bfca")); executeTestParallel("testMultipleCompTracks",spec); } @Test public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "454a1750fd36525f24172b21af5f49de"; + String md5 = "b0565ac61b2860248e4abd478a177b5e"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestSNPsVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsVCF, "-noEV", "-EV CompOverlap", "-sn HG00625", @@ -360,8 +383,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestSNPsOneSampleVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsOneSampleVCF, "-noEV", "-EV CompOverlap", "-noST", @@ -381,8 +404,8 @@ public class VariantEvalIntegrationTest extends WalkerTest { buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-D " + b37dbSNP129, - "-B:eval,VCF " + fundamentalTestSNPsVCF, + "--dbsnp " + b37dbSNP132, + "--eval " + fundamentalTestSNPsVCF, "-noEV", "-EV CountVariants", "-noST", @@ -391,7 +414,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("bf324e4c87fe0d21170fcd2a67a20371") + Arrays.asList("da65fc8f0d0eeaf0a0b06a07f444bb8e") ); executeTest("testAlleleCountStrat", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 3ac7e3785..c81891ac6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -41,11 +41,11 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b37KGReference + - " -B:dbsnp,VCF,known=true,training=false,truth=false,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + - " -B:hapmap,VCF,known=false,training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + - " -B:omni,VCF,known=false,training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:truth=true,training=true,prior=15.0 " + comparisonDataLocation + "Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + + " -resource:training=true,truth=true,prior=12.0 " + comparisonDataLocation + "Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" + " -T VariantRecalibrator" + - " -B:input,VCF " + params.inVCF + + " -input " + params.inVCF + " -L 20:1,000,000-40,000,000" + " -an QD -an HaplotypeScore -an HRun" + " -percentBad 0.07" + @@ -64,12 +64,59 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -T ApplyRecalibration" + " -L 20:12,000,000-30,000,000" + " -NO_HEADER" + - " -B:input,VCF " + params.inVCF + + " -input " + params.inVCF + " -o %s" + " -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) + " -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); executeTest("testApplyRecalibration-"+params.inVCF, spec); } + + VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf", + "6d7ee4cb651c8b666e4a4523363caaff", // tranches + "4759b111a5aa53975d46e0f22c7983bf", // recal 
file + "5d7e07d8813db96ba3f3dfe4737f83d1"); // cut VCF + + @DataProvider(name = "VRIndelTest") + public Object[][] createData2() { + return new Object[][]{ {indel} }; + } + + @Test(dataProvider = "VRIndelTest") + public void testVariantRecalibratorIndel(VRTest params) { + //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -resource:known=true,prior=10.0 " + GATKDataLocation + "dbsnp_132_b37.leftAligned.vcf" + + " -resource:training=true,truth=true,prior=15.0 " + comparisonDataLocation + "Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf" + + " -T VariantRecalibrator" + + " -input " + params.inVCF + + " -L 20:1,000,000-40,000,000" + + " -an QD -an ReadPosRankSum -an HaplotypeScore" + + " -percentBad 0.08" + + " -mode INDEL -mG 3" + + " --minNumBadVariants 0" + + " --trustAllPolymorphic" + // for speed + " -recalFile %s" + + " -tranchesFile %s", + Arrays.asList(params.recalMD5, params.tranchesMD5)); + executeTest("testVariantRecalibratorIndel-"+params.inVCF, spec).getFirst(); + } + + @Test(dataProvider = "VRIndelTest",dependsOnMethods="testVariantRecalibratorIndel") + public void testApplyRecalibrationIndel(VRTest params) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:12,000,000-30,000,000" + + " -mode INDEL" + + " -NO_HEADER" + + " -input " + params.inVCF + + " -o %s" + + " -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) + + " -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null), + Arrays.asList(params.cutVCFMD5)); + executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 904a5b29b..3267173a7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -38,21 +38,21 @@ public class CombineVariantsIntegrationTest extends WalkerTest { return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; } - public void test1InOut(String file, String md5, boolean vcf3) { - test1InOut(file, md5, "", vcf3); + public void test1InOut(String file, String md5) { + test1InOut(file, md5, ""); } - public void test1InOut(String file, String md5, String args, boolean vcf3) { + public void test1InOut(String file, String md5, String args) { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -priority v1 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file + args), + baseTestString(" -priority v1 -V:v1 " + validationDataLocation + file + args), 1, Arrays.asList(md5)); executeTest("testInOut1--" + file, spec); } - public void combine2(String file1, String file2, String args, String md5, boolean vcf3) { + public void combine2(String file1, String file2, String args, String md5) { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -priority v1,v2 -B:v1,VCF" + (vcf3 ? "3 " : " ") + validationDataLocation + file1 + " -B:v2,VCF" + (vcf3 ? 
"3 " : " ") + validationDataLocation + file2 + args), + baseTestString(" -priority v1,v2 -V:v1 " + validationDataLocation + file1 + " -V:v2 "+ validationDataLocation + file2 + args), 1, Arrays.asList(md5)); executeTest("combine2 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); @@ -63,38 +63,48 @@ public class CombineVariantsIntegrationTest extends WalkerTest { String file2 = "hapmap_3.3.b37.sites.vcf"; WalkerTestSpec spec = new WalkerTestSpec( "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference - + " -L 1:1-10,000,000 -B:omni,VCF " + validationDataLocation + file1 - + " -B:hm3,VCF " + validationDataLocation + file2 + args, + + " -L 1:1-10,000,000 -V:omni " + validationDataLocation + file1 + + " -V:hm3 " + validationDataLocation + file2 + args, 1, Arrays.asList(md5)); executeTest("combineSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); } - @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0", true); } - @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo", true); } - @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null", true); } - @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "c9c901ff9ef2a982624b203a8086dff0", false); } // official project VCF files in tabix format + public void combinePLs(String file1, String file2, String md5) { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants -NO_HEADER -o %s -R " + b36KGReference + " -priority v1,v2 -V:v1 " + validationDataLocation + file1 + " -V:v2 " + validationDataLocation + file2, + 1, + Arrays.asList(md5)); + executeTest("combine PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); + } - @Test public void test1Indel1() { 
test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b", false); } - @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e", false); } + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0"); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo"); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null"); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "c9c901ff9ef2a982624b203a8086dff0"); } // official project VCF files in tabix format - @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f", false); } // official project VCF files in tabix format - @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9", false); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a", false); } + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b"); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e"); } - @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083", false); } + @Test public void 
combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "0f873fed02aa99db5b140bcd6282c10a"); } - @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c", true); } + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f"); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9"); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a"); } + + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083"); } + + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c"); } @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "4836086891f6cbdd40eebef3076d215a"); } @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "6a34b5d743efda8b2f3b639f3a2f5de8"); } @Test public void threeWayWithRefs() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -B:NA19240_BGI,VCF "+validationDataLocation+"NA19240.BGI.RG.vcf" + - " -B:NA19240_ILLUMINA,VCF "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + - " -B:NA19240_WUGSC,VCF "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + - " -B:denovoInfo,VCF 
"+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + + baseTestString(" -V:NA19240_BGI "+validationDataLocation+"NA19240.BGI.RG.vcf" + + " -V:NA19240_ILLUMINA "+validationDataLocation+"NA19240.ILLUMINA.RG.vcf" + + " -V:NA19240_WUGSC "+validationDataLocation+"NA19240.WUGSC.RG.vcf" + + " -V:denovoInfo "+validationDataLocation+"yri_merged_validation_data_240610.annotated.b36.vcf" + " -setKey centerSet" + " -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED" + " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + @@ -104,15 +114,14 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("threeWayWithRefs", spec); } - // complex examples with filtering, indels, and multiple alleles public void combineComplexSites(String args, String md5) { String file1 = "combine.1.vcf"; String file2 = "combine.2.vcf"; WalkerTestSpec spec = new WalkerTestSpec( "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference - + " -B:one,VCF " + validationDataLocation + file1 - + " -B:two,VCF " + validationDataLocation + file2 + args, + + " -V:one " + validationDataLocation + file1 + + " -V:two " + validationDataLocation + file2 + args, 1, Arrays.asList(md5)); executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); @@ -120,6 +129,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestFull() { combineComplexSites("", "b5a53ee92bdaacd2bb3327e9004ae058"); } @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "df96cb3beb2dbb5e02f80abec7d3571e"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f72a178137e25dbe0b931934cdc0079d"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f704caeaaaed6711943014b847fe381a"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", 
"f704caeaaaed6711943014b847fe381a"); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java new file mode 100644 index 000000000..2139a53e7 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests LeftAlignVariants + */ +public class LeftAlignVariantsIntegrationTest extends WalkerTest { + + @Test + public void testLeftAlignment() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignVariants -o %s -R " + b37KGReference + " --variant:vcf " + validationDataLocation + "forLeftAlignVariantsTest.vcf -NO_HEADER", + 1, + Arrays.asList("158b1d71b28c52e2789f164500b53732")); + executeTest("test left alignment", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index 82c894c6f..d10bb4452 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -38,7 +38,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { @Test public void testb36Tohg19() { WalkerTestSpec spec = new WalkerTestSpec( - "-T LiftoverVariants -o %s -R " + b36KGReference + " -B:variant,vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", + "-T LiftoverVariants -o %s -R " + b36KGReference + " --variant:vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, Arrays.asList("70aeaca5b74cc7ba8e2da7b71ff0fbfd")); executeTest("test b36 to 
hg19", spec); @@ -47,7 +47,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { @Test public void testb36Tohg19UnsortedSamples() { WalkerTestSpec spec = new WalkerTestSpec( - "-T LiftoverVariants -o %s -R " + b36KGReference + " -B:variant,vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", + "-T LiftoverVariants -o %s -R " + b36KGReference + " --variant:vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, Arrays.asList("3fd7ec2dc4064ef410786276b0dc9d08")); executeTest("test b36 to hg19, unsorted samples", spec); @@ -56,7 +56,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { @Test public void testhg18Tohg19Unsorted() { WalkerTestSpec spec = new WalkerTestSpec( - "-T LiftoverVariants -o %s -R " + hg18Reference + " -B:variant,vcf " + validationDataLocation + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", + "-T LiftoverVariants -o %s -R " + hg18Reference + " --variant:vcf " + validationDataLocation + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, Arrays.asList("ab2c6254225d7e2ecf52eee604d5673b")); executeTest("test hg18 to hg19, unsorted", spec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java 
index b5f41542e..20409d4ca 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -7,7 +7,7 @@ import java.util.Arrays; public class SelectVariantsIntegrationTest extends WalkerTest { public static String baseTestString(String args) { - return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s" + args; + return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s -NO_HEADER" + args; } @Test @@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile), 1, Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") ); @@ -24,12 +24,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testSampleExclusion() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile, + 1, + Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd") + ); + + executeTest("testSampleExclusion--" + testfile, spec); + } + @Test public void testRepeatedLineSelection() { String testfile = validationDataLocation + "test.dup.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C -B:variant,VCF3 " + testfile + " -NO_HEADER"), + 
baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile), 1, Arrays.asList("b74038779fe6485dbb8734ae48178356") ); @@ -42,7 +56,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -disc myvar -L 20:1012700-1020000 -B:variant,VCF " + b37hapmapGenotypes + " -B:myvar,VCF " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant:VCF " + b37hapmapGenotypes + " -disc:VCF " + testFile + " -o %s -NO_HEADER", 1, Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e") ); @@ -55,7 +69,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -conc hapmap -L 20:1012700-1020000 -B:hapmap,VCF " + b37hapmapGenotypes + " -B:variant,VCF " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc:VCF " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER", 1, Arrays.asList("d2ba3ea30a810f6f0fbfb1b643292b6a") ); @@ -63,4 +77,29 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testConcordance--" + testFile, spec); } + @Test + public void testVariantTypeSelection() { + String testFile = validationDataLocation + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf") + ); + + executeTest("testVariantTypeSelection--" + testFile, spec); + } + + @Test(enabled=false) + public void testRemovePLs() { + String testFile = validationDataLocation + 
"combine.3.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("") + ); + + executeTest("testWithPLs--" + testFile, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index d7efe4212..00044f859 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -56,7 +56,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants" + " -R " + b36KGReference + - " -B:variant,vcf3,storage=STREAM " + tmpFifo.getAbsolutePath() + + " --variant:vcf3,storage=STREAM " + tmpFifo.getAbsolutePath() + " --NO_HEADER" + " -o %s", 1, @@ -80,7 +80,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { WalkerTestSpec selectTestSpec = new WalkerTestSpec( "-T SelectVariants" + " -R " + b36KGReference + - " -B:variant,vcf3,storage=STREAM " + testFile + + " --variant:vcf3,storage=STREAM " + testFile + " --NO_HEADER" + " -select 'QD > 2.0'" + " -o " + tmpFifo.getAbsolutePath(), @@ -93,12 +93,12 @@ public class VCFStreamingIntegrationTest extends WalkerTest { selectTestSpec = new WalkerTestSpec( "-T VariantEval" + " -R " + b36KGReference + - " -B:eval,vcf3 " + testFile + - " -B:comp,vcf,storage=STREAM " + tmpFifo.getAbsolutePath() + + " --eval:vcf3 " + testFile + + " --comp:vcf,storage=STREAM " + tmpFifo.getAbsolutePath() + " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e" + Arrays.asList("d46a735ffa898f4aa6b3758c5b03f06d") ); executeTest("testVCFStreamingChain", 
selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java index 67c4297b1..5f71f82fd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -34,7 +34,7 @@ import java.util.Arrays; public class ValidateVariantsIntegrationTest extends WalkerTest { public static String baseTestString(String file, String type) { - return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 -B:variant,VCF " + validationDataLocation + file + " --validationType " + type; + return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + validationDataLocation + file + " --validationType " + type; } @Test @@ -95,7 +95,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test public void testBadID() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString("validationExampleBad.vcf", "IDS") + " -D " + GATKDataLocation + "dbsnp_129_b36.rod", + baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129, 0, UserException.MalformedFile.class ); @@ -113,4 +113,16 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { executeTest("test bad alt allele", spec); } + + @Test + public void testBadAllele2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad3.vcf", "REF"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad ref allele in deletion", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index 
1db712353..19021c1c2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -35,7 +35,7 @@ import java.io.File; public class VariantsToTableIntegrationTest extends WalkerTest { private String variantsToTableCmd(String moreArgs) { return "-R " + hg18Reference + - " -B:eval,vcf " + validationDataLocation + "/soap_gatk_annotated.vcf" + + " --variant:vcf " + validationDataLocation + "/soap_gatk_annotated.vcf" + " -T VariantsToTable" + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F TRANSITION -F DP -F SB -F set -F RankSumP -F refseq.functionalClass*" + " -L chr1 -KMA -o %s" + moreArgs; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java index 8c96c1e11..95fafac8d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCFIntegrationTest.java @@ -16,6 +16,22 @@ import java.util.ArrayList; */ public class VariantsToVCFIntegrationTest extends WalkerTest { + @Test + public void testVariantsToVCFUsingDbsnpInput() { + List md5 = new ArrayList(); + md5.add("d64942fed2a5b7b407f9537dd2b4832e"); + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant:OldDbsnp " + GATKDataLocation + "Comparisons/Validated/dbSNP/dbsnp_129_b36.rod" + + " -T VariantsToVCF" + + " -L 1:1-30,000,000" + + " -o %s" + + " -NO_HEADER", + 1, // just one output file + md5); + executeTest("testVariantsToVCFUsingDbsnpInput", spec).getFirst(); + } @Test public void testVariantsToVCFUsingGeliInput() { @@ -24,7 +40,7 @@ public class VariantsToVCFIntegrationTest extends 
WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " -B:variant,GeliText " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" + + " --variant:GeliText " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" + " -T VariantsToVCF" + " -L 1:10,000,000-11,000,000" + " -sample NA123AB" + @@ -32,25 +48,25 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { " -NO_HEADER", 1, // just one output file md5); - executeTest("testVariantsToVCFUsingGeliInput #1", spec).getFirst(); + executeTest("testVariantsToVCFUsingGeliInput - calls", spec).getFirst(); } @Test public void testGenotypesToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("71e8c98d7c3a73b6287ecc339086fe03"); + md5.add("2413f036ec4100b8d5db179946159a82"); WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " -B:variant,GeliText " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" + + " --variant:GeliText " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" + " -T VariantsToVCF" + - " -L 1:10,000,000-11,000,000" + + " -L 1:10,100,000-10,200,000" + " -sample NA123AB" + " -o %s" + " -NO_HEADER", 1, // just one output file md5); - executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst(); + executeTest("testVariantsToVCFUsingGeliInput - genotypes", spec).getFirst(); } @Test @@ -60,7 +76,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " -B:variant,HapMap " + validationDataLocation + "rawHapMap.yri.chr1.txt" + + " --variant:RawHapMap " + validationDataLocation + "rawHapMap.yri.chr1.txt" + " -T VariantsToVCF" + " -L 1:1-1,000,000" + " -o %s" + @@ -77,7 +93,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest { 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " -B:variant,VCF " + validationDataLocation + "complexExample.vcf4" + + " --variant:VCF " + validationDataLocation + "complexExample.vcf4" + " -T VariantsToVCF" + " -o %s" + " -NO_HEADER", diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java new file mode 100644 index 000000000..3dfd0550d --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import org.ggf.drmaa.*; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +public class JnaSessionIntegrationTest extends BaseTest { + private static final SessionFactory factory = new JnaSessionFactory(); + + @Test + public void testDrmaa() throws Exception { + Session session = factory.getSession(); + Version version = session.getVersion(); + System.out.println(String.format("DRMAA version: %d.%d", version.getMajor(), version.getMinor())); + System.out.println(String.format("DRMAA contact(s): %s", session.getContact())); + System.out.println(String.format("DRM system(s): %s", session.getDrmSystem())); + System.out.println(String.format("DRMAA implementation(s): %s", session.getDrmaaImplementation())); + } + + @Test + public void testSubmitEcho() throws Exception { + File outFile = createNetworkTempFile("JnaSessionIntegrationTest-", ".out"); + Session session = factory.getSession(); + session.init(null); + try { + JobTemplate template = session.createJobTemplate(); + template.setRemoteCommand("sh"); + template.setOutputPath(":" + outFile.getAbsolutePath()); + template.setJoinFiles(true); + template.setArgs(Arrays.asList("-c", "echo \"Hello world.\"")); + + String jobId = session.runJob(template); + System.out.println(String.format("Job id %s", jobId)); + session.deleteJobTemplate(template); + + System.out.println("Waiting for job to run: " + jobId); + int remotePs = Session.QUEUED_ACTIVE; + + List runningStatuses = Arrays.asList(Session.QUEUED_ACTIVE, Session.RUNNING); + + while (runningStatuses.contains(remotePs)) { + Thread.sleep(30 * 1000L); + remotePs = session.getJobProgramStatus(jobId); + } + + Assert.assertEquals(remotePs, Session.DONE, "Job status is not DONE."); + + JobInfo jobInfo = session.wait(jobId, Session.TIMEOUT_NO_WAIT); + + 
Assert.assertTrue(jobInfo.hasExited(), String.format("Job did not exit cleanly: %s", jobId)); + Assert.assertEquals(jobInfo.getExitStatus(), 0, String.format("Exit status for jobId %s is non-zero", jobId)); + if (jobInfo.hasSignaled()) + Assert.fail(String.format("JobId %s exited with signal %s and core dump flag %s", jobId, jobInfo.getTerminatingSignal(), jobInfo.hasCoreDump())); + Assert.assertFalse(jobInfo.wasAborted(), String.format("Job was aborted: %s", jobId)); + } finally { + session.exit(); + } + + Assert.assertTrue(FileUtils.waitFor(outFile, 120), "File not found: " + outFile.getAbsolutePath()); + System.out.println("--- output ---"); + System.out.println(FileUtils.readFileToString(outFile)); + System.out.println("--- output ---"); + Assert.assertTrue(outFile.delete(), "Unable to delete " + outFile.getAbsolutePath()); + System.out.println("Validating that we reached the end of the test without exit."); + } + + @Test + public void testCollectionConversions() { + Collection list = Arrays.asList("a=1", "foo=bar", "empty="); + Map map = new LinkedHashMap(); + map.put("a", "1"); + map.put("foo", "bar"); + map.put("empty", ""); + + Assert.assertEquals(JnaSession.collectionToMap(list), map); + Assert.assertEquals(JnaSession.mapToCollection(map), list); + } + + @Test + public void testLimitConversions() { + Assert.assertEquals(JnaSession.formatLimit(0), "0:00:00"); + Assert.assertEquals(JnaSession.formatLimit(59), "0:00:59"); + Assert.assertEquals(JnaSession.formatLimit(60), "0:01:00"); + Assert.assertEquals(JnaSession.formatLimit(3540), "0:59:00"); + Assert.assertEquals(JnaSession.formatLimit(3599), "0:59:59"); + Assert.assertEquals(JnaSession.formatLimit(7200), "2:00:00"); + Assert.assertEquals(JnaSession.formatLimit(7260), "2:01:00"); + Assert.assertEquals(JnaSession.formatLimit(7261), "2:01:01"); + + Assert.assertEquals(JnaSession.parseLimit("0"), 0); + Assert.assertEquals(JnaSession.parseLimit("00"), 0); + Assert.assertEquals(JnaSession.parseLimit("0:00"), 
0); + Assert.assertEquals(JnaSession.parseLimit("00:00"), 0); + Assert.assertEquals(JnaSession.parseLimit("0:00:00"), 0); + + Assert.assertEquals(JnaSession.parseLimit("1"), 1); + Assert.assertEquals(JnaSession.parseLimit("01"), 1); + Assert.assertEquals(JnaSession.parseLimit("0:01"), 1); + Assert.assertEquals(JnaSession.parseLimit("00:01"), 1); + Assert.assertEquals(JnaSession.parseLimit("0:00:01"), 1); + + Assert.assertEquals(JnaSession.parseLimit("10"), 10); + Assert.assertEquals(JnaSession.parseLimit("0:10"), 10); + Assert.assertEquals(JnaSession.parseLimit("00:10"), 10); + Assert.assertEquals(JnaSession.parseLimit("0:00:10"), 10); + + Assert.assertEquals(JnaSession.parseLimit("1:0"), 60); + Assert.assertEquals(JnaSession.parseLimit("1:00"), 60); + Assert.assertEquals(JnaSession.parseLimit("01:00"), 60); + Assert.assertEquals(JnaSession.parseLimit("0:01:00"), 60); + + Assert.assertEquals(JnaSession.parseLimit("1:00:00"), 3600); + + Assert.assertEquals(JnaSession.parseLimit("1:02:03"), 3723); + } +} diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java new file mode 100644 index 000000000..d98281ad3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.jna.drmaa.v1_0; + +import com.sun.jna.Memory; +import com.sun.jna.NativeLong; +import com.sun.jna.Pointer; +import com.sun.jna.StringArray; +import com.sun.jna.ptr.IntByReference; +import com.sun.jna.ptr.PointerByReference; +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class LibDrmaaIntegrationTest extends BaseTest { + private String implementation = null; + + @Test + public void testDrmaa() throws Exception { + Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); + int errnum; + + IntByReference major = new IntByReference(); + IntByReference minor = new IntByReference(); + Memory contact = new Memory(LibDrmaa.DRMAA_CONTACT_BUFFER); + Memory drmSystem = new Memory(LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER); + Memory drmaaImplementation = new Memory(LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER); + + errnum = LibDrmaa.drmaa_version(major, minor, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get version from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRMAA version: %d.%d", major.getValue(), minor.getValue())); + + errnum = LibDrmaa.drmaa_get_contact(contact, 
LibDrmaa.DRMAA_CONTACT_BUFFER_LEN, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get contacts from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRMAA contact(s): %s", contact.getString(0))); + + errnum = LibDrmaa.drmaa_get_DRM_system(drmSystem, LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER_LEN, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get DRM system from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRM system(s): %s", drmSystem.getString(0))); + + errnum = LibDrmaa.drmaa_get_DRMAA_implementation(drmaaImplementation, LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get DRMAA implementation from the DRMAA library: %s", error.getString(0))); + + System.out.println(String.format("DRMAA implementation(s): %s", drmaaImplementation.getString(0))); + + this.implementation = drmaaImplementation.getString(0); + } + + @Test(dependsOnMethods = { "testDrmaa" }) + public void testSubmitEcho() throws Exception { + if (implementation.indexOf("LSF") >= 0) { + System.err.println(" *********************************************************"); + System.err.println(" ***********************************************************"); + System.err.println(" **** ****"); + System.err.println(" **** Skipping LibDrmaaIntegrationTest.testSubmitEcho() ****"); + System.err.println(" **** Are you using the dotkit .combined_LSF_SGE? 
****"); + System.err.println(" **** ****"); + System.err.println(" ***********************************************************"); + System.err.println(" *********************************************************"); + return; + } + + Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); + int errnum; + + File outFile = createNetworkTempFile("LibDrmaaIntegrationTest-", ".out"); + + errnum = LibDrmaa.drmaa_init(null, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not initialize the DRMAA library: %s", error.getString(0))); + + try { + PointerByReference jtRef = new PointerByReference(); + Pointer jt; + Memory jobIdMem = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER); + String jobId; + IntByReference remotePs = new IntByReference(); + IntByReference stat = new IntByReference(); + PointerByReference rusage = new PointerByReference(); + + errnum = LibDrmaa.drmaa_allocate_job_template(jtRef, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not create job template: %s", error.getString(0))); + + jt = jtRef.getValue(); + + errnum = LibDrmaa.drmaa_set_attribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND, "sh", error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0))); + + errnum = LibDrmaa.drmaa_set_attribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH, ":" + outFile.getAbsolutePath(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_OUTPUT_PATH, error.getString(0))); + + errnum = LibDrmaa.drmaa_set_attribute(jt, LibDrmaa.DRMAA_JOIN_FILES, "y", error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if 
(errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_JOIN_FILES, error.getString(0))); + + StringArray args = new StringArray(new String[] { "-c", "echo \"Hello world.\"" }); + + errnum = LibDrmaa.drmaa_set_vector_attribute(jt, LibDrmaa.DRMAA_V_ARGV, args, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_V_ARGV, error.getString(0))); + + errnum = LibDrmaa.drmaa_run_job(jobIdMem, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not submit job: %s", error.getString(0))); + + jobId = jobIdMem.getString(0); + + System.out.println(String.format("Job id %s", jobId)); + + errnum = LibDrmaa.drmaa_delete_job_template(jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not delete job template: %s", error.getString(0))); + + System.out.println("Waiting for job to run: " + jobId); + remotePs.setValue(LibDrmaa.DRMAA_PS.DRMAA_PS_QUEUED_ACTIVE); + + List runningStatuses = Arrays.asList( + LibDrmaa.DRMAA_PS.DRMAA_PS_QUEUED_ACTIVE, LibDrmaa.DRMAA_PS.DRMAA_PS_RUNNING); + + while (runningStatuses.contains(remotePs.getValue())) { + Thread.sleep(30 * 1000L); + + errnum = LibDrmaa.drmaa_job_ps(jobId, remotePs, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not get status for jobId %s: %s", jobId, error.getString(0))); + } + + Assert.assertEquals(remotePs.getValue(), LibDrmaa.DRMAA_PS.DRMAA_PS_DONE, "Job status is not DONE."); + + errnum = LibDrmaa.drmaa_wait(jobId, Pointer.NULL, new NativeLong(0), stat, LibDrmaa.DRMAA_TIMEOUT_NO_WAIT, + rusage, error, 
LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Wait failed for jobId %s: %s", jobId, error.getString(0))); + + IntByReference exited = new IntByReference(); + IntByReference exitStatus = new IntByReference(); + IntByReference signaled = new IntByReference(); + Memory signal = new Memory(LibDrmaa.DRMAA_SIGNAL_BUFFER); + IntByReference coreDumped = new IntByReference(); + IntByReference aborted = new IntByReference(); + + errnum = LibDrmaa.drmaa_wifexited(exited, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Exit check failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.assertTrue(exited.getValue() != 0, String.format("Job did not exit cleanly: %s", jobId)); + + errnum = LibDrmaa.drmaa_wexitstatus(exitStatus, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Exit status failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.assertEquals(exitStatus.getValue(), 0, String.format("Exit status for jobId %s is non-zero", jobId)); + + errnum = LibDrmaa.drmaa_wifsignaled(signaled, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Signaled check failed for jobId %s: %s", jobId, error.getString(0))); + + if (signaled.getValue() != 0) { + errnum = LibDrmaa.drmaa_wtermsig(signal, LibDrmaa.DRMAA_SIGNAL_BUFFER_LEN, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Signal lookup failed for jobId %s: %s", jobId, error.getString(0))); + + errnum = LibDrmaa.drmaa_wcoredump(coreDumped, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != 
LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Core dump check failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.fail(String.format("JobId %s exited with signal %s and core dump flag %d", jobId, signal.getString(0), coreDumped.getValue())); + } + + errnum = LibDrmaa.drmaa_wifaborted(aborted, stat.getValue(), error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Aborted check failed for jobId %s: %s", jobId, error.getString(0))); + + Assert.assertTrue(aborted.getValue() == 0, String.format("Job was aborted: %s", jobId)); + + } finally { + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) { + LibDrmaa.drmaa_exit(error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + } else { + errnum = LibDrmaa.drmaa_exit(error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); + + if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS) + Assert.fail(String.format("Could not shut down the DRMAA library: %s", error.getString(0))); + } + } + + Assert.assertTrue(FileUtils.waitFor(outFile, 120), "File not found: " + outFile.getAbsolutePath()); + System.out.println("--- output ---"); + System.out.println(FileUtils.readFileToString(outFile)); + System.out.println("--- output ---"); + Assert.assertTrue(outFile.delete(), "Unable to delete " + outFile.getAbsolutePath()); + System.out.println("Validating that we reached the end of the test without exit."); + } +} diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java index 77db34cbc..b4fb5cfa3 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java @@ -91,7 +91,7 @@ public class LibBatIntegrationTest extends BaseTest { } @Test - public void testSubmitEcho() throws 
InterruptedException { + public void testSubmitEcho() throws Exception { String queue = "hour"; File outFile = createNetworkTempFile("LibBatIntegrationTest-", ".out"); @@ -114,6 +114,10 @@ public class LibBatIntegrationTest extends BaseTest { req.command = "echo \"Hello world.\""; + String[] argv = {"", "-a", "tv"}; + int setOptionResult = LibBat.setOption_(argv.length, new StringArray(argv), "a:", req, ~0, ~0, ~0, null); + Assert.assertTrue(setOptionResult != -1, "setOption_ returned -1"); + submitReply reply = new submitReply(); long jobId = LibBat.lsb_submit(req, reply); @@ -142,6 +146,9 @@ public class LibBatIntegrationTest extends BaseTest { Assert.assertTrue(Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_DONE), String.format("Unexpected job status: 0x%02x", jobStatus)); Assert.assertTrue(FileUtils.waitFor(outFile, 120), "File not found: " + outFile.getAbsolutePath()); + System.out.println("--- output ---"); + System.out.println(FileUtils.readFileToString(outFile)); + System.out.println("--- output ---"); Assert.assertTrue(outFile.delete(), "Unable to delete " + outFile.getAbsolutePath()); Assert.assertEquals(reply.queue, req.queue, "LSF reply queue does not match requested queue."); System.out.println("Validating that we reached the end of the test without exit."); diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java new file mode 100644 index 000000000..1bbf74db9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or 
sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.R; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.diffengine.DiffElement; +import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; +import org.broadinstitute.sting.gatk.walkers.diffengine.DiffNode; +import org.broadinstitute.sting.gatk.walkers.diffengine.Difference; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Basic unit test for RScriptExecutor in reduced reads + */ +public class RScriptExecutorUnitTest extends BaseTest { + final static String testrscript = "print(\"hello, world\")\n"; + final static String publicRScript = "plot_Tranches.R"; + final static String privateRScript = "variantCallQC.R"; + + // -------------------------------------------------------------------------------- + // + // Difference testing routines 
+ // + // -------------------------------------------------------------------------------- + + private void testOne(String script, String pathToRscript, String anotherSearchPath, boolean exceptOnError) { + RScriptExecutor.RScriptArgumentCollection collection = + new RScriptExecutor.RScriptArgumentCollection(); + if ( pathToRscript != null ) + collection.PATH_TO_RSCRIPT = pathToRscript; + if ( anotherSearchPath != null ) { + List x = new ArrayList(collection.PATH_TO_RESOURCES); + x.add(anotherSearchPath); + collection.PATH_TO_RESOURCES = x; + } + RScriptExecutor executor = new RScriptExecutor(collection, exceptOnError); + executor.callRScripts(script); + } + + @Test + public void testPublic() { testOne(publicRScript, null, null, true); } + + @Test + public void testPrivate() { testOne(privateRScript, null, null, true); } + + // make sure we don't break finding something in private by adding another directory + @Test + public void testPrivateWithAdditionalPath1() { testOne(privateRScript, null, "dist", true); } + + // make sure we don't break finding something in private by adding another directory + @Test + public void testPrivateWithAdditionalPath2() { testOne(privateRScript, null, "doesNotExist", true); } + + @Test(expectedExceptions = UserException.class) + public void testNonExistantScriptException() { testOne("does_not_exist.R", null, null, true); } + + @Test() + public void testNonExistantScriptNoException() { testOne("does_not_exist.R", null, null, false); } + + @Test(expectedExceptions = UserException.class) + public void testNonExistantRScriptException() { testOne(publicRScript, "badRScriptValue", null, true); } + + @Test() + public void testNonExistantRScriptNoException() { testOne(publicRScript, "badRScriptValue", null, false); } + + @Test() + public void testScriptInNewPath() throws IOException { + File t = createTempFile("myTestScript", ".R"); + FileUtils.writeStringToFile(t, testrscript); + testOne(t.getName(), null, t.getParent(), true); + } +} \ No 
newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index f4c3163cf..2e4dac6da 100644 --- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -11,7 +11,6 @@ import org.testng.annotations.Test; import org.testng.annotations.DataProvider; import org.testng.annotations.BeforeMethod; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.Utils; diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java index e912f97e9..5fd4c610e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java @@ -49,12 +49,12 @@ public class HapMapUnitTest { */ @Test public void testReadHeader() { - HapMapCodec codec = new HapMapCodec(); + RawHapMapCodec codec = new RawHapMapCodec(); AsciiLineReader reader = getReader(); try { String header = reader.readLine(); reader.close(); - Assert.assertTrue(header.equals((String)codec.readHeader(getReader()))); + Assert.assertTrue(header.equals(codec.readHeader(getReader()))); } catch (IOException e) { Assert.fail("Unable to read from file " + hapMapFile); } @@ -63,8 +63,8 @@ public class HapMapUnitTest { @Test public void testKnownRecordConversion() { - HapMapCodec codec = new HapMapCodec(); - HapMapFeature feature = (HapMapFeature)codec.decode(knownLine); + RawHapMapCodec codec = new RawHapMapCodec(); + RawHapMapFeature feature = (RawHapMapFeature)codec.decode(knownLine); // check that the alleles are right @@ -110,16 +110,16 @@ public class 
HapMapUnitTest { @Test public void testReadCorrectNumberOfRecords() { // setup the record for reading our 500 line file (499 records, 1 header line) - HapMapCodec codec = new HapMapCodec(); + RawHapMapCodec codec = new RawHapMapCodec(); AsciiLineReader reader = getReader(); - String line = null; + String line; int count = 0; try { codec.readHeader(reader); line = reader.readLine(); while (line != null) { - HapMapFeature feature = (HapMapFeature) codec.decode(line); + codec.decode(line); line = reader.readLine(); ++count; } @@ -133,14 +133,14 @@ public class HapMapUnitTest { @Test public void testGetSampleNames() { // setup the record for reading our 500 line file (499 records, 1 header line) - HapMapCodec codec = new HapMapCodec(); + RawHapMapCodec codec = new RawHapMapCodec(); AsciiLineReader reader = getReader(); - String line = null; + String line; try { codec.readHeader(reader); line = reader.readLine(); - HapMapFeature feature = (HapMapFeature) codec.decode(line); + RawHapMapFeature feature = (RawHapMapFeature) codec.decode(line); Assert.assertEquals(feature.getSampleIDs().length,87); } catch (IOException e) { diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 68a2ecf8d..d08cda949 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -70,7 +70,7 @@ public class IndexFactoryUnitTest { CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { VariantContext vc = it.next(); - writer.add(vc, vc.getReferenceBaseForIndel()); + writer.add(vc); } writer.close(); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 32ff25c7b..2ef116708 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -17,11 +17,11 @@ public class VCFIntegrationTest extends WalkerTest { String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s "; - String test1 = baseCommand + "-T VariantAnnotator -BTI variant -B:variant,vcf " + testVCF; + String test1 = baseCommand + "-T VariantAnnotator --variant " + testVCF + " -BTI variant"; WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(md5ofInputVCF)); List result = executeTest("Test Variant Annotator with no changes", spec1).getFirst(); - String test2 = baseCommand + "-T VariantsToVCF -B:variant,vcf " + result.get(0).getAbsolutePath(); + String test2 = baseCommand + "-T VariantsToVCF --variant " + result.get(0).getAbsolutePath(); WalkerTestSpec spec2 = new WalkerTestSpec(test2, 1, Arrays.asList(md5ofInputVCF)); executeTest("Test Variants To VCF from new output", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index 74cd8d8fe..061dc99fa 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -31,8 +31,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private static final int STEP_SIZE = 1; //private static final List QUERY_SIZES = Arrays.asList(1); - private static final List QUERY_SIZES = Arrays.asList(1, 10, 100, 1000); - private static final List CACHE_SIZES = Arrays.asList(-1, 10, 1000); + private static final List QUERY_SIZES = Arrays.asList(1, 10, 100); + 
private static final List CACHE_SIZES = Arrays.asList(-1, 1000); @DataProvider(name = "fastas") public Object[][] createData1() { diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 34a2e616a..e3a926fb9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -57,8 +57,8 @@ public class VCFWriterUnitTest extends BaseTest { VCFHeader header = createFakeHeader(metaData,additionalColumns); VCFWriter writer = new StandardVCFWriter(fakeVCFFile); writer.writeHeader(header); - writer.add(createVC(header),"A".getBytes()[0]); - writer.add(createVC(header),"A".getBytes()[0]); + writer.add(createVC(header)); + writer.add(createVC(header)); writer.close(); VCFCodec reader = new VCFCodec(); AsciiLineReader lineReader; @@ -135,7 +135,7 @@ public class VCFWriterUnitTest extends BaseTest { genotypes.put(name,gt); } - return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes); + return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes, (byte)'A'); } diff --git a/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java new file mode 100644 index 000000000..45a618f71 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without 
limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.text; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +public class TextFormattingUtilsUnitTest extends BaseTest { + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitWhiteSpaceNullLine() { + TextFormattingUtils.splitWhiteSpace(null); + } + + @Test + public void testSplitWhiteSpace() { + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace(" foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace(" foo bar baz "), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz "), new String[] { "foo", "bar", "baz" }); 
+ Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("\tfoo\tbar\tbaz\t"), new String[]{"foo", "bar", "baz"}); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testGetWordStartsNullLine() { + TextFormattingUtils.getWordStarts(null); + } + + @Test + public void testGetWordStarts() { + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz"), Arrays.asList(4, 8)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz"), Arrays.asList(5, 10)); + Assert.assertEquals(TextFormattingUtils.getWordStarts(" foo bar baz"), Arrays.asList(1, 5, 9)); + Assert.assertEquals(TextFormattingUtils.getWordStarts(" foo bar baz "), Arrays.asList(1, 5, 9)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz "), Arrays.asList(4, 8)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("\tfoo\tbar\tbaz\t"), Arrays.asList(1, 5, 9)); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitFixedWidthNullLine() { + TextFormattingUtils.splitFixedWidth(null, Collections.emptyList()); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitFixedWidthNullColumnStarts() { + TextFormattingUtils.splitFixedWidth("foo bar baz", null); + } + + @Test + public void testSplitFixedWidth() { + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz", Arrays.asList(4, 8)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz", Arrays.asList(5, 10)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" foo bar baz", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" foo bar baz ", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz ", Arrays.asList(4, 8)), new String[] { "foo", "bar", "baz" }); + 
Assert.assertEquals(TextFormattingUtils.splitFixedWidth("\tfoo\tbar\tbaz\t", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("f o b r b z", Arrays.asList(4, 8)), new String[] { "f o", "b r", "b z" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" f o b r b z", Arrays.asList(4, 8)), new String[] { "f o", "b r", "b z" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" f o b r b z", Arrays.asList(4, 8)), new String[] { "f", "o b", "r b z" }); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java deleted file mode 100644 index 78ab916db..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java +++ /dev/null @@ -1,402 +0,0 @@ -// our package -package org.broadinstitute.sting.utils.threading; - - -// the imports for unit testing. 
- - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.iterators.GenomeLocusIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.*; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.*; - -/** - * Basic unit test for GenomeLoc - */ -public class GenomeLocProcessingTrackerUnitTest extends BaseTest { - IndexedFastaSequenceFile fasta = null; - GenomeLocParser genomeLocParser = null; - String chr1 = null; - private final static String FILE_ROOT = "public/testdata/GLPTFile"; - - @BeforeTest - public void before() { - File referenceFile = new File(hg18Reference); - try { - fasta = new IndexedFastaSequenceFile(referenceFile); - chr1 = fasta.getSequenceDictionary().getSequence(1).getSequenceName(); - genomeLocParser = new GenomeLocParser(fasta); - - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - } - - @BeforeMethod - public void beforeMethod(Object[] data) { - if ( data.length > 0 ) - ((TestTarget)data[0]).init(); - } - - @AfterMethod - public void afterMethod(Object[] data) { - if ( data.length > 0 ) { - ((TestTarget)data[0]).getTracker().close(); - ((TestTarget)data[0]).cleanup(); - } - } - - abstract private class TestTarget { - String name; - int nShards; - int shardSize; - File file; - - public void init() { cleanup(); } - - public void cleanup() { - if ( file != null && file.exists() ) - file.delete(); - } - - public boolean isThreadSafe() { return true; } - - protected TestTarget(String name, int nShards, int shardSize, File file) { - this.name = name; - this.nShards = nShards; - 
this.shardSize = shardSize; - this.file = file; - } - - public abstract GenomeLocProcessingTracker getTracker(); - - public List getShards() { - List shards = new ArrayList(); - for ( int i = 0; i < nShards; i++ ) { - int start = shardSize * i; - int stop = start + shardSize; - shards.add(genomeLocParser.createGenomeLoc(chr1, start, stop)); - } - return shards; - } - - public String toString() { - return String.format("TestTarget %s: nShards=%d shardSize=%d", name, nShards, shardSize); - } - } - - @DataProvider(name = "threadData") - public Object[][] createThreadData() { - // gotta keep the tests small... - return createData(Arrays.asList(10, 100), Arrays.asList(10)); - //return createData(Arrays.asList(10, 100, 1000, 10000), Arrays.asList(10)); - } - - public Object[][] createData(List nShards, List shardSizes) { - List params = new ArrayList(); - - int counter = 0; - String name = null; - for ( int nShard : nShards ) { - for ( int shardSize : shardSizes ) { - // shared mem -- canonical implementation - params.add(new TestTarget("ThreadSafeSharedMemory", nShard, shardSize, null) { - GenomeLocProcessingTracker tracker = new SharedMemoryGenomeLocProcessingTracker(new ClosableReentrantLock()); - public GenomeLocProcessingTracker getTracker() { return tracker; } - }); - - final File file1 = new File(String.format("%s_ThreadSafeFileBacked_%d_%d", FILE_ROOT, counter++, nShard, shardSize)); - params.add(new TestTarget("ThreadSafeFileBacked", nShard, shardSize, file1) { - GenomeLocProcessingTracker tracker = new FileBackedGenomeLocProcessingTracker(file1, genomeLocParser, new ClosableReentrantLock(), null); - public GenomeLocProcessingTracker getTracker() { return tracker; } - }); - - name = "FileBackedSharedFileThreadSafe"; - final File file2 = new File(String.format("%s_%s_%d_%d", FILE_ROOT, name, counter++, nShard, shardSize)); - params.add(new TestTarget(name, nShard, shardSize, file2) { - GenomeLocProcessingTracker tracker = new 
FileBackedGenomeLocProcessingTracker(file2, genomeLocParser, new SharedFileThreadSafeLock(file2, -1), null); - public GenomeLocProcessingTracker getTracker() { return tracker; } - }); - - name = "FileBackedSharedFile"; - final File file3 = new File(String.format("%s_%s_%d_%d", FILE_ROOT, name, counter++, nShard, shardSize)); - params.add(new TestTarget(name, nShard, shardSize, file3) { - GenomeLocProcessingTracker tracker = new FileBackedGenomeLocProcessingTracker(file3, genomeLocParser, new SharedFileLock(file3, -1), null); - public GenomeLocProcessingTracker getTracker() { return tracker; } - public boolean isThreadSafe() { return false; } - }); - } - } - - List params2 = new ArrayList(); - for ( TestTarget x : params ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @DataProvider(name = "simpleData") - public Object[][] createSimpleData() { - return createData(Arrays.asList(1000), Arrays.asList(100)); - } - - private static final String NAME_ONE = "name1"; - private static final String NAME_TWO = "name2"; - - @Test(enabled = true) - public void testNoop() { - GenomeLocProcessingTracker tracker = new NoOpGenomeLocProcessingTracker(); - for ( int start = 1; start < 100; start++ ) { - for ( int n = 0; n < 2; n++ ) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(chr1, start, start +1); - ProcessingLoc ploc = tracker.claimOwnership(loc, NAME_ONE); - Assert.assertTrue(ploc.isOwnedBy(NAME_ONE)); - Assert.assertEquals(tracker.updateAndGetProcessingLocs(NAME_ONE).size(), 0); - } - } - } - - @Test(dataProvider = "simpleData", enabled = true) - public void testSingleProcessTracker(TestTarget test) { - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - logger.warn("testSingleProcessTracker " + test); - - int counter = 0; - for ( GenomeLoc shard : shards ) { - counter++; - - Assert.assertNull(tracker.findOwner(shard, NAME_ONE)); - Assert.assertFalse(tracker.locIsOwned(shard, NAME_ONE)); - - 
ProcessingLoc proc = tracker.claimOwnership(shard,NAME_ONE); - Assert.assertNotNull(proc); - Assert.assertNotNull(proc.getLocation()); - Assert.assertNotNull(proc.getOwner()); - Assert.assertEquals(proc.getLocation(), shard); - Assert.assertEquals(proc.getOwner(), NAME_ONE); - Assert.assertEquals(tracker.findOwner(shard, NAME_ONE), proc); - Assert.assertTrue(tracker.locIsOwned(shard, NAME_ONE)); - Assert.assertNotNull(tracker.updateAndGetProcessingLocs(NAME_ONE)); - Assert.assertEquals(tracker.updateAndGetProcessingLocs(NAME_ONE).size(), counter); - - ProcessingLoc badClaimAttempt = tracker.claimOwnership(shard,NAME_TWO); - Assert.assertFalse(badClaimAttempt.getOwner().equals(NAME_TWO)); - Assert.assertEquals(badClaimAttempt.getOwner(), NAME_ONE); - } - } - - @Test(dataProvider = "simpleData", enabled = true) - public void testIterator(TestTarget test) { - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - logger.warn("testIterator " + test); - - List markedShards = new ArrayList(); - List toFind = new ArrayList(); - - for ( int i = 0; i < shards.size(); i++ ) { - if ( ! (i % 10 == 0) ) { - markedShards.add(shards.get(i)); - tracker.claimOwnership(shards.get(i), NAME_TWO); - } else { - toFind.add(shards.get(i)); - } - } - - int nFound = 0; - Iterator it = shards.iterator(); - while ( it.hasNext() ) { - GenomeLoc shard = tracker.claimOwnershipOfNextAvailable(it, NAME_ONE); - - if ( shard == null ) { // everything to get is done - Assert.assertEquals(nFound, toFind.size(), "Didn't find all of the available shards"); - } else { - nFound++; - ProcessingLoc proc = tracker.findOwner(shard, NAME_ONE); - - Assert.assertTrue(proc.isOwnedBy(NAME_ONE)); - Assert.assertTrue(! 
markedShards.contains(shard), "Ran process was already marked!"); - Assert.assertTrue(toFind.contains(shard), "Claimed shard wasn't one of the unmarked!"); - } - } - } - - @Test(dataProvider = "simpleData", enabled = true) - public void testMarkedProcesses(TestTarget test) { - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - logger.warn("testMarkedProcesses " + test); - - List markedShards = new ArrayList(); - - for ( int i = 0; i < shards.size(); i++ ) { - if ( i % 2 == 0 ) { - markedShards.add(shards.get(i)); - tracker.claimOwnership(shards.get(i), NAME_TWO); - } - } - - for ( GenomeLoc shard : shards ) { - ProcessingLoc proc = tracker.claimOwnership(shard,NAME_ONE); - - Assert.assertTrue(proc.isOwnedBy(NAME_ONE) || proc.isOwnedBy(NAME_TWO)); - - if ( proc.isOwnedBy(NAME_ONE) ) - Assert.assertTrue(! markedShards.contains(shard), "Ran process was already marked!"); - else - Assert.assertTrue(markedShards.contains(shard), "Unran process wasn't marked"); - - if ( ! 
markedShards.contains(shard) ) { - Assert.assertEquals(tracker.findOwner(shard, NAME_ONE), proc); - } - } - } - - public class TestThread implements Callable { - public TestTarget test; - public String name; - public List ran, toRun; - boolean useIterator; - - public TestThread(TestTarget test, int count, List toRun, boolean useIterator) { - this.test = test; - this.toRun = toRun; - this.name = "thread" + count; - this.ran = new ArrayList(); - this.useIterator = useIterator; - } - - public Integer call() { - //logger.warn(String.format("Call() Thread %s", name)); - if ( useIterator ) { - for ( GenomeLoc shard : test.getTracker().onlyOwned(toRun.iterator(), name) ) { - if ( shard != null ) { // ignore the unclaimable end of the stream - ran.add(shard); - // do some work here - for ( int sum =0, i = 0; i < 100000; i++) sum += i; - } - } - - } else { - for ( GenomeLoc shard : toRun ) { - //System.out.printf("Claiming ownership in %s on %s%n", name, shard); - ProcessingLoc proc = test.getTracker().claimOwnership(shard,name); - //System.out.printf(" => ownership of %s is %s (I own? 
%b)%n", shard, proc.getOwner(), proc.isOwnedBy(name)); - if ( proc.isOwnedBy(name) ) { - ran.add(proc.getLocation()); - // do some work here - for ( int sum =0, i = 0; i < 100000; i++) sum += i; - } - //logger.warn(String.format("Thread %s on %s -> owned by %s", name, shard, proc.getOwner())); - } - } - - return 1; - } - } - - private static TestThread findOwner(String name, List threads) { - for ( TestThread thread : threads ) { - if ( thread.name.equals(name) ) - return thread; - } - return null; - } - - private static final void assertAllThreadsFinished(List> futures) { - try { - for ( Future f : futures ) { - Assert.assertTrue(f.isDone(), "Thread never finished running"); - Assert.assertTrue(f.get() != null, "Finished successfully"); - } - } catch (InterruptedException e) { - Assert.fail("Thread failed to run to completion", e); - } catch (ExecutionException e) { - Assert.fail("Thread generated an exception", e); - } - } - - private static final List subList(List l, int i) { - List r = new ArrayList(); - for ( int j = 0; j < l.size(); j++ ) { - if ( j % i == 0 ) - r.add(l.get(j)); - } - - return r; - } - - @Test(dataProvider = "threadData", enabled = true) - public void testThreadedProcessesLowLevelFunctions(TestTarget test) { - testThreading(test, false); - } - - @Test(dataProvider = "threadData", enabled = true) - public void testThreadedProcessesIterator(TestTarget test) { - testThreading(test, true); - } - - private void testThreading(TestTarget test, boolean useIterator) { - if ( ! 
test.isThreadSafe() ) - // skip tests that aren't thread safe - return; - - // start up 3 threads - logger.warn("ThreadedTesting " + test + " using iterator " + useIterator); - List threads = new ArrayList(); - for ( int i = 0; i < 4; i++) { - List toRun = subList(test.getShards(), i+1); - TestThread thread = new TestThread(test, i, toRun, useIterator); - threads.add(thread); - } - ExecutorService exec = java.util.concurrent.Executors.newFixedThreadPool(threads.size()); - - try { - List> results = exec.invokeAll(threads, 300, TimeUnit.SECONDS); - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - - for ( TestThread thread : threads ) - logger.warn(String.format("TestThread %s ran %d jobs of %d to run", thread.name, thread.ran.size(), thread.toRun.size())); - - assertAllThreadsFinished(results); - - // we ran everything - Assert.assertEquals(tracker.updateAndGetProcessingLocs(NAME_ONE).size(), shards.size(), "Not all shards were run"); - - for ( GenomeLoc shard : shards ) { - Assert.assertTrue(tracker.locIsOwned(shard, NAME_ONE), "Unowned shard"); - - ProcessingLoc proc = tracker.findOwner(shard, NAME_ONE); - Assert.assertNotNull(proc, "Proc was null"); - - Assert.assertNotNull(proc.getOwner(), "Owner was null"); - Assert.assertEquals(proc.getLocation(), shard, "Shard loc doesn't make ProcessingLoc"); - - TestThread owner = findOwner(proc.getOwner(), threads); - Assert.assertNotNull(owner, "Couldn't find owner"); - - Assert.assertTrue(owner.ran.contains(shard), "Owner doesn't contain ran shard"); - - for ( TestThread thread : threads ) - if ( ! 
proc.isOwnedBy(thread.name) && thread.ran.contains(shard) ) - Assert.fail("Shard appears in another run list: proc=" + proc + " shard=" + shard + " also in jobs of " + thread.name + " obj=" + thread.ran.get(thread.ran.indexOf(shard))); - - } - } catch (InterruptedException e) { - Assert.fail("Thread failure", e); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java index 6ed00f0ea..67fe7d012 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.HashMap; @@ -14,51 +15,51 @@ public class VariantContextIntegrationTest extends WalkerTest { " -R " + b36KGReference; private static String root = cmdRoot + - " -D " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:vcf,VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf"; + " -L 1:1-1,000,000 -V " + b36dbSNP129; - static HashMap expectations = new HashMap(); - static { - expectations.put("-L 1:1-10000 --printPerLocus", "e9d96677a57bc3a10fb6d9ba942c19f0"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "8a1174d2b18b98e624abbe93e6af8fdd"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "3933f1fae5453c54c3f791a23de07599"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "c9cf2f01bf045a58dcc7649fd6ea2396"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", 
"2097e32988d603d3b353b50218c86d3b"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "a103d856e8bc558c949c6e3f184e8913"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "5f2265ac6c6d80d64dc6e69a05c1250b"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "06a3ae4c0afa23b429a9491ab7707f3c"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc"); + private static final class VCITTest extends TestDataProvider { + String args, md5; + + private VCITTest(final String args, final String md5) { + super(VCITTest.class); + this.args = args; + this.md5 = md5; + } } - @Test - public void testConversionSelection() { - for ( Map.Entry entry : expectations.entrySet() ) { - String extraArgs = entry.getKey(); - String md5 = entry.getValue(); + @DataProvider(name = "VCITTestData") + public Object[][] createVCITTestData() { + new VCITTest("--printPerLocus", "e9d0f1fe80659bb55b40aa6c3a2e921e"); + new VCITTest("--printPerLocus --onlyContextsOfType SNP", "0e620db3e45771df42c54a9c0ae4a29f"); + new VCITTest("--printPerLocus --onlyContextsOfType INDEL", "b725c204fefe3814644d50e7c20f9dfe"); + new VCITTest("--printPerLocus --onlyContextsOfType MIXED", "3ccc33f496a1718df55722d11cc14334"); + new VCITTest("--printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc"); + new VCITTest("--printPerLocus --takeFirstOnly", "3a45561da042b2b44b6a679744f16103"); + new VCITTest("--printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "4746f269ecc377103f83eb61cc162c39"); + new VCITTest("--printPerLocus --onlyContextsStartinAtCurrentPosition", "2749e3fae458650a85a2317e346dc44c"); + new VCITTest("--printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "9bd48c2a40813023e29ffaa23d59d382"); - WalkerTestSpec spec = new WalkerTestSpec( root + " " + 
extraArgs + " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testDbSNPAndVCFConversions", spec); - } + return VCITTest.getTests(VCITTest.class); + } + + @Test(dataProvider = "VCITTestData") + public void testConversionSelection(VCITTest test) { + String extraArgs = test.args; + String md5 = test.md5; + + WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testSelectors", spec); } @Test public void testToVCF() { // this really just tests that we are seeing the same number of objects over all of chr1 - WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -NO_HEADER -B:vcf,VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s", + WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -NO_HEADER -V:VCF3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s", 2, // just one output file Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "ff91731213fd0bbdc200ab6fd1c93e63")); executeTest("testToVCF", spec); } - - @Test - public void testLargeScaleConversion() { - // this really just tests that we are seeing the same number of objects over all of chr1 - WalkerTestSpec spec = new WalkerTestSpec( root + " -L 1" + " -o %s", - 1, // just one output file - Arrays.asList("045a5b02c86aeb9301dc0b724da0c8f7")); - executeTest("testLargeScaleConversion", spec); - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index e82817714..f8e6da20a 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -92,45 +92,45 @@ 
public class VariantContextUnitTest { // test INDELs alleles = Arrays.asList(Aref, ATC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(Tref, TA, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); // test MIXED alleles = Arrays.asList(TAref, T, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); 
Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(TAref, T, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(ACref, ATC, AT); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(Aref, T, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); // test SYMBOLIC alleles = Arrays.asList(Tref, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); } @@ -146,8 +146,8 @@ public class VariantContextUnitTest { Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); Assert.assertTrue(vc.isSNP()); Assert.assertFalse(vc.isIndel()); - Assert.assertFalse(vc.isInsertion()); - Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isSimpleInsertion()); + Assert.assertFalse(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -173,8 +173,8 @@ public class VariantContextUnitTest { Assert.assertEquals(VariantContext.Type.NO_VARIATION, vc.getType()); 
Assert.assertFalse(vc.isSNP()); Assert.assertFalse(vc.isIndel()); - Assert.assertFalse(vc.isInsertion()); - Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isSimpleInsertion()); + Assert.assertFalse(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertFalse(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 1); @@ -191,7 +191,7 @@ public class VariantContextUnitTest { @Test public void testCreatingDeletionVariantContext() { List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles); + VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getChr(), delLoc); Assert.assertEquals(vc.getStart(), delLocStart); @@ -199,8 +199,8 @@ public class VariantContextUnitTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); Assert.assertFalse(vc.isSNP()); Assert.assertTrue(vc.isIndel()); - Assert.assertFalse(vc.isInsertion()); - Assert.assertTrue(vc.isDeletion()); + Assert.assertFalse(vc.isSimpleInsertion()); + Assert.assertTrue(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -218,7 +218,7 @@ public class VariantContextUnitTest { @Test public void testCreatingInsertionVariantContext() { List alleles = Arrays.asList(delRef, ATC); - VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles); + VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getChr(), insLoc); Assert.assertEquals(vc.getStart(), insLocStart); @@ -226,8 +226,8 @@ public class VariantContextUnitTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); Assert.assertFalse(vc.isSNP()); 
Assert.assertTrue(vc.isIndel()); - Assert.assertTrue(vc.isInsertion()); - Assert.assertFalse(vc.isDeletion()); + Assert.assertTrue(vc.isSimpleInsertion()); + Assert.assertFalse(vc.isSimpleDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -251,7 +251,7 @@ public class VariantContextUnitTest { new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = IllegalStateException.class) public void testBadConstructorArgs3() { new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)); } @@ -433,7 +433,7 @@ public class VariantContextUnitTest { Assert.assertFalse(vc14.isBiallelic()); Assert.assertTrue(vc5.isIndel()); - Assert.assertTrue(vc5.isDeletion()); + Assert.assertTrue(vc5.isSimpleDeletion()); Assert.assertTrue(vc5.isVariant()); Assert.assertTrue(vc5.isBiallelic()); diff --git a/public/packages/AnalyzeCovariates.xml b/public/packages/AnalyzeCovariates.xml index 7e31934df..a6675a63d 100644 --- a/public/packages/AnalyzeCovariates.xml +++ b/public/packages/AnalyzeCovariates.xml @@ -6,10 +6,7 @@ - - - - + diff --git a/public/packages/FindContaminatingReadGroups.xml b/public/packages/FindContaminatingReadGroups.xml deleted file mode 100644 index 880f64a81..000000000 --- a/public/packages/FindContaminatingReadGroups.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - diff --git a/public/packages/GATKResources.xml b/public/packages/GATKResources.xml deleted file mode 100755 index 87e6e0e50..000000000 --- a/public/packages/GATKResources.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/IndelGenotyper.xml b/public/packages/IndelGenotyper.xml deleted file mode 100644 index c9e3ae0f6..000000000 --- a/public/packages/IndelGenotyper.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - diff --git 
a/public/packages/LocalRealignmentAroundIndels.xml b/public/packages/LocalRealignmentAroundIndels.xml deleted file mode 100644 index 46960e69f..000000000 --- a/public/packages/LocalRealignmentAroundIndels.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/public/packages/QualityScoresRecalibration.xml b/public/packages/QualityScoresRecalibration.xml deleted file mode 100644 index 95e8b7c63..000000000 --- a/public/packages/QualityScoresRecalibration.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/public/packages/RMDIndexer.xml b/public/packages/RMDIndexer.xml deleted file mode 100644 index 5d40876de..000000000 --- a/public/packages/RMDIndexer.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/public/packages/UnifiedGenotyper.xml b/public/packages/UnifiedGenotyper.xml deleted file mode 100644 index 67a17640c..000000000 --- a/public/packages/UnifiedGenotyper.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - diff --git a/public/packages/VariantAnnotator.xml b/public/packages/VariantAnnotator.xml deleted file mode 100644 index 88c0701f0..000000000 --- a/public/packages/VariantAnnotator.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/public/packages/VariantEval.xml b/public/packages/VariantEval.xml deleted file mode 100644 index 791066fb7..000000000 --- a/public/packages/VariantEval.xml +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - - diff --git a/public/packages/VariantFiltration.xml b/public/packages/VariantFiltration.xml deleted file mode 100644 index 48fa0ff37..000000000 --- a/public/packages/VariantFiltration.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/public/packages/VariantRecalibration.xml b/public/packages/VariantRecalibration.xml deleted file mode 100644 index 6fe6b1eff..000000000 --- a/public/packages/VariantRecalibration.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - 
- - diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl index 21cb8bb6b..ba4198292 100755 --- a/public/perl/liftOverVCF.pl +++ b/public/perl/liftOverVCF.pl @@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; if ($recordOriginalLocation) { $cmd .= " -recordOriginalLocation"; } @@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out"; system($cmd) == 0 or quit("The filtering step failed. 
Please correct the necessary errors before retrying."); # clean up diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index b64c715d4..f97ce4884 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -11,7 +11,7 @@ import net.sf.samtools.SAMFileReader import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.util.QScriptUtils -import org.broadinstitute.sting.queue.function.{CommandLineFunction, ListWriterFunction} +import org.broadinstitute.sting.queue.function.ListWriterFunction class DataProcessingPipeline extends QScript { qscript => @@ -31,19 +31,14 @@ class DataProcessingPipeline extends QScript { var reference: File = _ @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true) - var dbSNP: File = _ + var dbSNP: List[File] = List() /**************************************************************************** * Optional Parameters ****************************************************************************/ - -// @Input(doc="path to Picard's SortSam.jar (if re-aligning a previously processed BAM file)", fullName="path_to_sort_jar", shortName="sort", required=false) -// var sortSamJar: File = _ -// - @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) - var indels: File = _ + var indels: List[File] = List() @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ @@ -84,12 +79,6 @@ class DataProcessingPipeline extends QScript { var nContigs: Int = 0 // Use the 
number of contigs for scatter gathering jobs var cleanModelEnum: ConsensusDeterminationModel = ConsensusDeterminationModel.USE_READS - if (cleaningModel == "KNOWNS_ONLY") { - cleanModelEnum = ConsensusDeterminationModel.KNOWNS_ONLY - } - else if (cleaningModel == "USE_SW") { - cleanModelEnum = ConsensusDeterminationModel.USE_SW - } @@ -112,7 +101,7 @@ class DataProcessingPipeline extends QScript { // Because the realignment only happens after these scripts are executed, in case you are using // bwa realignment, this function will operate over the original bam files and output over the // (to be realigned) bam files. - def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = { + def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = { // Creating a table with SAMPLE information from each input BAM file val sampleTable = scala.collection.mutable.Map.empty[String, List[File]] @@ -137,24 +126,7 @@ class DataProcessingPipeline extends QScript { sampleTable(sample) :+= rBam } } - - println("\n\n*** INPUT FILES ***\n") - // Creating one file for each sample in the dataset - val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] - for ((sample, flist) <- sampleTable) { - - println(sample + ":") - for (f <- flist) - println (f) - println() - - val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".bam") - sampleBamFiles(sample) = sampleFileName - add(joinBams(flist, sampleFileName)) - } - println("*** INPUT FILES ***\n\n") - - return sampleBamFiles.toMap + return sampleTable.toMap } // Rebuilds the Read Group string to give BWA @@ -176,18 +148,20 @@ class DataProcessingPipeline extends QScript { var realignedBams: List[File] = List() var index = 1 for (bam <- bams) { - val readSortedBam = swapExt(bam, ".bam", "." 
+ index + ".sorted.bam" ) + // first revert the BAM file to the original qualities + val revertedBAM = revertBAM(bam) + val readSortedBam = swapExt(revertedBAM, ".bam", "." + index + ".sorted.bam" ) val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") val realignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.bam") val rgRealignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.rg.bam") if (useBWAse) { - add(bwa_aln_se(bam, saiFile1), - bwa_sam_se(bam, saiFile1, realignedSamFile)) + add(bwa_aln_se(revertedBAM, saiFile1), + bwa_sam_se(revertedBAM, saiFile1, realignedSamFile)) } else { - add(sortSam(bam, readSortedBam, SortOrder.queryname), + add(sortSam(revertedBAM, readSortedBam, SortOrder.queryname), bwa_aln_pe(readSortedBam, saiFile1, 1), bwa_aln_pe(readSortedBam, saiFile2, 2), bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile)) @@ -200,6 +174,27 @@ class DataProcessingPipeline extends QScript { return realignedBams } + def getIndelCleaningModel(): ConsensusDeterminationModel = { + if (cleaningModel == "KNOWNS_ONLY") + ConsensusDeterminationModel.KNOWNS_ONLY + else if (cleaningModel == "USE_SW") + ConsensusDeterminationModel.USE_SW + else + ConsensusDeterminationModel.USE_READS + } + + def revertBams(bams: List[File]): List[File] = { + var revertedBAMList: List[File] = List() + for (bam <- bams) + revertedBAMList :+= revertBAM(bam) + return revertedBAMList + } + + def revertBAM(bam: File): File = { + val revertedBAM = swapExt(bam, ".bam", ".reverted.bam") + add(revert(bam, revertedBAM)) + return revertedBAM + } /**************************************************************************** * Main script @@ -207,34 +202,31 @@ class DataProcessingPipeline extends QScript { def script = { + // final output list of processed bam files + var cohortList: List[File] = List() + + // sets the model for 
the Indel Realigner + cleanModelEnum = getIndelCleaningModel() // keep a record of the number of contigs in the first bam file in the list val bams = QScriptUtils.createListFromFile(input) nContigs = QScriptUtils.getNumberOfContigs(bams(0)) - val realignedBams = if (useBWApe || useBWAse) {performAlignment(bams)} else {bams} + val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)} - // Generate a BAM file per sample joining all per lane files if necessary - val sampleBamFiles: Map[String, File] = createSampleFiles(bams, realignedBams) + // generate a BAM file per sample joining all per lane files if necessary + val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs) - // Final output list of processed bam files - var cohortList: List[File] = List() - - // Simple progress report - println("\nFound the following samples: ") - for ((sample, file) <- sampleBamFiles) - println("\t" + sample + " -> " + file) - println("\n") - - // If this is a 'knowns only' indel realignment run, do it only once for all samples. + // if this is a 'knowns only' indel realignment run, do it only once for all samples. val globalIntervals = new File(outputDir + projectName + ".intervals") if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY) add(target(null, globalIntervals)) - // Put each sample through the pipeline - for ((sample, bam) <- sampleBamFiles) { + // put each sample through the pipeline + for ((sample, bamList) <- sampleBAMFiles) { // BAM files generated by the pipeline + val bam = new File(qscript.projectName + "." 
+ sample + ".bam") val cleanedBam = swapExt(bam, ".bam", ".clean.bam") val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam") val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam") @@ -249,17 +241,19 @@ class DataProcessingPipeline extends QScript { val preValidateLog = swapExt(bam, ".bam", ".pre.validation") val postValidateLog = swapExt(bam, ".bam", ".post.validation") + // Validation is an optional step for the BAM file generated after // alignment and the final bam file of the pipeline. if (!noValidation) { - add(validate(bam, preValidateLog), + for (sampleFile <- bamList) + add(validate(sampleFile, preValidateLog), validate(recalBam, postValidateLog)) } if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(bam, targetIntervals)) + add(target(bamList, targetIntervals)) - add(clean(bam, targetIntervals, cleanedBam), + add(clean(bamList, targetIntervals, cleanedBam), dedup(cleanedBam, dedupedBam, metricsFile), cov(dedupedBam, preRecalFile), recal(dedupedBam, preRecalFile, recalBam), @@ -299,27 +293,27 @@ class DataProcessingPipeline extends QScript { this.maxRecordsInRam = 100000 } - case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { - if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - this.input_file :+= inBams + case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { + if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) + this.input_file = inBams this.out = outIntervals this.mismatchFraction = 0.0 - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.known ++= qscript.dbSNP if (indels != null) - this.rodBind :+= RodBind("indels", "VCF", indels) + this.known ++= qscript.indels this.scatterCount = nContigs this.analysisName = queueLogDir + outIntervals + ".target" this.jobName = queueLogDir + outIntervals + ".target" } - case class clean (inBams: File, tIntervals: File, outBam: File) 
extends IndelRealigner with CommandLineGATKArgs { - this.input_file :+= inBams + case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { + this.input_file = inBams this.targetIntervals = tIntervals this.out = outBam - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.known ++= qscript.dbSNP if (qscript.indels != null) - this.rodBind :+= RodBind("indels", "VCF", qscript.indels) - this.consensusDeterminationModel = consensusDeterminationModel + this.known ++= qscript.indels + this.consensusDeterminationModel = cleanModelEnum this.compress = 0 this.scatterCount = nContigs this.analysisName = queueLogDir + outBam + ".clean" @@ -327,7 +321,7 @@ class DataProcessingPipeline extends QScript { } case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.knownSites ++= qscript.dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.input_file :+= inBam this.recal_file = outRecalFile @@ -367,16 +361,15 @@ class DataProcessingPipeline extends QScript { } case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inBam) + this.input :+= inBam this.output = outBam this.metrics = metricsFile + this.memoryLimit = 16 this.analysisName = queueLogDir + outBam + ".dedup" this.jobName = queueLogDir + outBam + ".dedup" } case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = inBams this.output = outBam this.analysisName = queueLogDir + outBam + ".joinBams" @@ -384,8 +377,7 @@ class DataProcessingPipeline extends QScript { } case class sortSam (inSam: File, outBam: File, sortOrderP: 
SortOrder) extends SortSam with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inSam) + this.input :+= inSam this.output = outBam this.sortOrder = sortOrderP this.analysisName = queueLogDir + outBam + ".sortSam" @@ -393,7 +385,7 @@ class DataProcessingPipeline extends QScript { } case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs { - this.input = List(inBam) + this.input :+= inBam this.output = outLog this.REFERENCE_SEQUENCE = qscript.reference this.isIntermediate = false @@ -403,8 +395,7 @@ class DataProcessingPipeline extends QScript { case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inBam) + this.input :+= inBam this.output = outBam this.RGID = readGroup.id this.RGCN = readGroup.cn @@ -417,6 +408,14 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".rg" } + case class revert (inBam: File, outBam: File) extends RevertSam with ExternalCommonArgs { + this.output = outBam + this.input :+= inBam + this.analysisName = queueLogDir + outBam + "revert" + this.jobName = queueLogDir + outBam + ".revert" + + } + case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file") var sai = outSai diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index a961beca1..109139d20 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ 
b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -1,21 +1,9 @@ package org.broadinstitute.sting.queue.qscripts -import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.gatk.phonehome.GATKRunReport - - // ToDos: - // reduce the scope of the datasets so the script is more nimble - // create gold standard BAQ'd bam files, no reason to always do it on the fly - - // Analysis to add at the end of the script: - // auto generation of the cluster plots - // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls - // produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF - - class MethodsDevelopmentCallingPipeline extends QScript { qscript => @@ -28,17 +16,14 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="dataset", doc="selects the datasets to run. 
If not provided, all datasets will be used", required=false) var datasets: List[String] = Nil - @Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false) - var skipGoldStandard: Boolean = false + @Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false) + var runGoldStandard: Boolean = false @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) var noBAQ: Boolean = false - @Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false) - var eval: Boolean = false - - @Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false) - var callIndels: Boolean = false + @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false) + var noIndels: Boolean = false @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false) var LOCAL_ET: Boolean = false @@ -52,8 +37,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false) var samples: List[String] = Nil - - class Target( val baseName: String, val reference: File, @@ -65,7 +48,9 @@ class MethodsDevelopmentCallingPipeline extends QScript { val intervals: String, val titvTarget: Double, val trancheTarget: Double, - val isLowpass: Boolean) { + val isLowpass: Boolean, + val isExome: Boolean, + val nSamples: Int) { val name = qscript.outputDir + baseName val clusterFile = new File(name + ".clusters") val rawVCF = new File(name + ".raw.vcf") @@ -84,14 +69,14 @@ class MethodsDevelopmentCallingPipeline extends QScript { val goldStandardClusterFile = new File(goldStandardName + ".clusters") } - val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + val b37_decoy = new 
File("/humgen/1kg/reference/human_g1k_v37_decoy.fasta") + val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") val dbSNP_hg18_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_hg18.rod" // Special case for NA12878 collections that can't use 132 because they are part of it. - val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b36.rod" - val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" - val dbSNP_b37_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it. + val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132.b36.excluding_sites_after_129.vcf" + val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it. 
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf" val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf" val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" @@ -100,55 +85,69 @@ class MethodsDevelopmentCallingPipeline extends QScript { val omni_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed" val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed" + val training_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf" + val badSites_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.terrible.vcf" + val projectConsensus_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/ALL.wgs.projectConsensus_v2b.20101123.snps.sites.vcf" val lowPass: Boolean = true + val exome: Boolean = true val indels: Boolean = true val queueLogDir = ".qlog/" + // BUGBUG: We no longer support b36/hg18 because several of the necessary files aren't available aligned to those references + val targetDataSets: Map[String, Target] = Map( "HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.07, 99.0, !lowPass), - "HiSeq19" -> new 
Target("NA12878.HiSeq19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), - "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), + "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, 97.0, !lowPass), - "WExTrio" -> new Target("CEUTrio.WEx", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + 
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 3.3, 98.0, !lowPass, exome, 1), + "WExTrio" -> new Target("CEUTrio.WEx", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 97.0, !lowPass), - "WGSTrio" -> new Target("CEUTrio.WGS", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37, + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3), + "WGSTrio" -> new Target("CEUTrio.WGS", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), + "WExTrioDecoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"), + new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + 
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3), + "WGSTrioDecoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"), + new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3), "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass, !exome, 79), "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 99.0, !lowPass), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, 
99.0, !lowPass, exome, 96), "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass), // chunked interval list to use with Queue's scatter/gather functionality + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass) + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 99.0, lowPass, !exome, 363) ) @@ -166,13 +165,13 @@ class MethodsDevelopmentCallingPipeline extends QScript { val goldStandard = true for (target <- targets) { if( !skipCalling ) { - if (callIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target)) + if (!noIndels) add(new indelCall(target), new indelFilter(target), new indelEvaluation(target)) add(new snpCall(target)) add(new VQSR(target, !goldStandard)) add(new applyVQSR(target, !goldStandard)) - if (eval) add(new snpEvaluation(target)) + add(new snpEvaluation(target)) } - if ( !skipGoldStandard ) { + if ( runGoldStandard ) { add(new VQSR(target, goldStandard)) add(new applyVQSR(target, 
goldStandard)) } @@ -187,22 +186,19 @@ class MethodsDevelopmentCallingPipeline extends QScript { } def bai(bam: File) = new File(bam + ".bai") - val FiltersToIgnore = List("DPFilter", "ABFilter", "ESPStandard", "QualByDepth", "StrandBias", "HomopolymerRun") // 1.) Unified Genotyper Base class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { this.memoryLimit = 3 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.scatterCount = 63 // the smallest interval list has 63 intervals, one for each Mb on chr20 + this.scatterCount = 140 + this.nt = 2 this.dcov = if ( t.isLowpass ) { 50 } else { 250 } this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } this.input_file :+= t.bamList - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) + this.D = new File(t.dbsnpFile) } // 1a.) Call SNPs with UG @@ -216,7 +212,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.baq = if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY} this.analysisName = t.name + "_UGs" this.jobName = queueLogDir + t.name + ".snpcall" - this.A ++= List("FisherStrand") } // 1b.) 
Call Indels with UG @@ -234,15 +229,14 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.scatterCount = 10 - this.filterName ++= List("HARD_TO_VALIDATE") - this.filterExpression ++= List("\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"") - this.variantVCF = t.rawIndelVCF + this.V = t.rawIndelVCF this.out = t.filteredIndelVCF - this.filterName ++= List("LowQual", "StrandBias", "QualByDepth", "HomopolymerRun") - if (t.isLowpass) - this.filterExpression ++= List("\"QUAL<30.0\"", "\"SB>=-1.0\"", "\"QD<1.0\"", "\"HRun>=15\"") - else - this.filterExpression ++= List("\"QUAL<50.0\"", "\"SB>=-1.0\"", "\"QD<5.0\"", "\"HRun>=15\"") + this.filterName ++= List("IndelQD", "IndelReadPosRankSum", "IndelFS") + this.filterExpression ++= List("\"QD < 2.0\"", "\"ReadPosRankSum < -20.0\"", "\"FS > 200.0\"") + if (t.nSamples >= 10) { + this.filterName ++= List("IndelInbreedingCoeff") + this.filterExpression ++= List("\"InbreedingCoeff < -0.8\"") + } this.analysisName = t.name + "_VF" this.jobName = queueLogDir + t.name + ".indelfilter" } @@ -250,70 +244,72 @@ class MethodsDevelopmentCallingPipeline extends QScript { // 3.) 
Variant Quality Score Recalibration - Generate Recalibration table class VQSR(t: Target, goldStandard: Boolean) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS { this.memoryLimit = 4 + this.nt = 2 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) - this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile, "known=false,training=true,truth=true,prior=15.0") - if( t.hapmapFile.contains("b37") ) - this.rodBind :+= RodBind("omni", "VCF", omni_b37, "known=false,training=true,truth=true,prior=12.0") - else if( t.hapmapFile.contains("b36") ) - this.rodBind :+= RodBind("omni", "VCF", omni_b36, "known=false,training=true,truth=true,prior=12.0") - if (t.dbsnpFile.endsWith(".rod")) - this.rodBind :+= RodBind("dbsnp", "DBSNP", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0") - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile, "known=true,training=false,truth=false,prior=10.0") - this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "HRun", "FS") + this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) + this.resource :+= new TaggedFile( t.hapmapFile, "training=true,truth=true,prior=15.0" ) + this.resource :+= new TaggedFile( omni_b37, "training=true,truth=true,prior=12.0" ) + this.resource :+= new TaggedFile( training_1000G, "training=true,prior=10.0" ) + this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) + this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) + this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") + if(t.nSamples >= 10) { + this.use_annotation ++= List("InbreedingCoeff") + } + if(!t.isExome) { + this.use_annotation ++= List("DP") + } else { + this.mG = 6 + } this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } 
else { t.tranchesFile } this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } this.allPoly = true this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") this.rscript_file = t.vqsrRscript this.analysisName = t.name + "_VQSR" - this.jobName = queueLogDir + t.name + ".VQSR" + this.jobName = queueLogDir + t.name + ".VQSR" } // 4.) Apply the recalibration table to the appropriate tranches class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 4 + this.memoryLimit = 6 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) + this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesFile} this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalFile } this.ts_filter_level = t.trancheTarget this.out = t.recalibratedVCF this.analysisName = t.name + "_AVQSR" - this.jobName = queueLogDir + t.name + ".applyVQSR" + this.jobName = queueLogDir + t.name + ".applyVQSR" } // 5.) Variant Evaluation Base(OPTIONAL) class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS { this.memoryLimit = 3 this.reference_sequence = t.reference - this.rodBind :+= RodBind("comphapmap", "VCF", t.hapmapFile) + this.comp :+= new TaggedFile(t.hapmapFile, "hapmap" ) this.intervalsString ++= List(t.intervals) - if (t.dbsnpFile.endsWith(".rod")) - this.DBSNP = new File(t.dbsnpFile) - else if (t.dbsnpFile.endsWith(".vcf")) - this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) + this.D = new File(t.dbsnpFile) this.sample = samples } // 5a.) 
SNP Evaluation (OPTIONAL) based on the cut vcf class snpEvaluation(t: Target) extends EvalBase(t) { - if (t.reference == b37 || t.reference == hg19) this.rodBind :+= RodBind("compomni", "VCF", omni_b37) - this.rodBind :+= RodBind("eval", "VCF", t.recalibratedVCF ) + if (t.reference == b37 || t.reference == hg19) this.comp :+= new TaggedFile( omni_b37, "omni" ) + this.eval :+= t.recalibratedVCF this.out = t.evalFile this.analysisName = t.name + "_VEs" - this.jobName = queueLogDir + t.name + ".snp.eval" + this.jobName = queueLogDir + t.name + ".snp.eval" } // 5b.) Indel Evaluation (OPTIONAL) class indelEvaluation(t: Target) extends EvalBase(t) { - this.rodBind :+= RodBind("eval", "VCF", t.filteredIndelVCF) + this.eval :+= t.filteredIndelVCF this.evalModule :+= "IndelStatistics" this.out = t.evalIndelFile this.analysisName = t.name + "_VEi" - this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" + this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala new file mode 100755 index 000000000..6947d4398 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -0,0 +1,189 @@ +package org.broadinstitute.sting.queue.qscripts + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.util.QScriptUtils +import net.sf.samtools.SAMFileHeader.SortOrder +import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.commandline.Hidden +import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups} + +/** + * Created by IntelliJ IDEA. 
+ * User: carneiro + * Date: 4/20/11 + * Time: 16:29 PM + */ + + +class PacbioProcessingPipeline extends QScript { + + @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true) + var input: File = _ + + @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) + var R: String = _ + + @Input(doc="Reference fasta file", shortName="R", required=true) + var reference: File = _ + + @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) + var dbSNP: File = _ + + @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + var threads: Int = 0 + + @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + var sample: String = "NA" + + @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) + var bwaPath: File = _ + + @Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) + var BLASR_BAM: Boolean = false + + @Hidden + @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." 
, shortName = "dbq", required=false) + var dbq: Int = 20 + + @Hidden + @Input(shortName="bwastring", required=false) + var bwastring: String = "" + + val queueLogDir: String = ".qlog/" + + def script = { + + val fileList: List[File] = QScriptUtils.createListFromFile(input) + + for (file: File <- fileList) { + + var USE_BWA: Boolean = false + + if (file.endsWith(".fasta") || file.endsWith(".fq")) { + if (bwaPath == null) { + throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); + } + USE_BWA = true + } + + // FASTA -> BAM steps + val alignedSam: File = file.getName + ".aligned.sam" + val sortedBam: File = swapExt(alignedSam, ".sam", ".bam") + val rgBam: File = swapExt(file, ".bam", ".rg.bam") + + val bamBase = if (USE_BWA) {rgBam} else {file} + + // BAM Steps + val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam") + val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv") + val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv") + val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam") + val path1: String = recalBam + ".before" + val path2: String = recalBam + ".after" + + if (USE_BWA) { + add(align(file, alignedSam), + sortSam(alignedSam, sortedBam), + addReadGroup(sortedBam, rgBam, sample)) + } + + else if (BLASR_BAM) { + val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam") + add(reorder(bamBase, reorderedBAM), + changeMQ(reorderedBAM, mqBAM)) + } + + val bam = if (BLASR_BAM) {mqBAM} else {bamBase} + + add(cov(bam, recalFile1), + recal(bam, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } + } + + + // General arguments to non-GATK tools + trait ExternalCommonArgs extends CommandLineFunction { + this.memoryLimit = 4 + this.isIntermediate = true + } + + trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { + this.reference_sequence = reference + } + + + case class align(@Input inFastq: File, 
@Output outSam: File) extends ExternalCommonArgs { + def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam + this.memoryLimit = 8 + this.analysisName = queueLogDir + outSam + ".bwa_sam_se" + this.jobName = queueLogDir + outSam + ".bwa_sam_se" + } + + case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs { + this.input = List(inSam) + this.output = outBam + this.sortOrder = SortOrder.coordinate + this.analysisName = queueLogDir + outBam + ".sortSam" + this.jobName = queueLogDir + outBam + ".sortSam" + } + + case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs { + this.input = List(inSam) + this.output = outSam + this.sortReference = reference + } + + case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs { + this.input_file :+= inBam + this.out = outBam + this.read_filter :+= "ReassignMappingQuality" + } + + case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") + this.input = List(inBam) + this.output = outBam + this.RGID = "1" + this.RGCN = "BI" + this.RGPL = "PacBio_RS" + this.RGSM = sample + this.RGLB = "default_library" + this.RGPU = "default_pu" + this.analysisName = queueLogDir + outBam + ".rg" + this.jobName = queueLogDir + outBam + ".rg" + } + + case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { + this.DBQ = dbq + this.knownSites :+= dbSNP + this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + this.input_file :+= inBam + this.recal_file = outRecalFile + this.analysisName = queueLogDir + outRecalFile + ".covariates" + this.jobName = queueLogDir + outRecalFile + ".covariates" + this.scatterCount = threads + } + + case class recal (inBam: File, inRecalFile: 
File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { + this.DBQ = dbq + this.input_file :+= inBam + this.recal_file = inRecalFile + this.out = outBam + this.isIntermediate = false + this.analysisName = queueLogDir + outBam + ".recalibration" + this.jobName = queueLogDir + outBam + ".recalibration" + this.scatterCount = threads + } + + case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { + this.resources = R + this.recal_file = inRecalFile + this.output_dir = outPath + this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" + this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala deleted file mode 100755 index cbe53db8d..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ /dev/null @@ -1,90 +0,0 @@ -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.util.QScriptUtils - -/** - * Created by IntelliJ IDEA. 
- * User: carneiro - * Date: 4/20/11 - * Time: 16:29 PM - */ - - -class RecalibrateBaseQualities extends QScript { - - @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) - var GATKjar: File = _ - - @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) - var input: File = _ - - @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) - var R: String = _ - - @Input(doc="Reference fasta file", shortName="R", required=true) - var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true) - var dbSNP: File = _ // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") - - val queueLogDir: String = ".qlog/" - var nContigs: Int = 0 - - def script = { - - val bamList = QScriptUtils.createListFromFile(input) - nContigs = QScriptUtils.getNumberOfContigs(bamList(0)) - - for (bam <- bamList) { - - val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") - val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") - val recalBam: File = swapExt(bam, ".bam", ".recal.bam") - val path1: String = recalBam + ".before" - val path2: String = recalBam + ".after" - - add(cov(bam, recalFile1), - recal(bam, recalFile1, recalBam), - cov(recalBam, recalFile2), - analyzeCovariates(recalFile1, path1), - analyzeCovariates(recalFile2, path2)) - } - } - - trait CommandLineGATKArgs extends CommandLineGATK { - this.jarFile = GATKjar - this.reference_sequence = reference - this.memoryLimit = 4 - this.isIntermediate = true - } - - case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { - this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") - this.input_file :+= inBam - this.recal_file = outRecalFile - 
this.analysisName = queueLogDir + outRecalFile + ".covariates" - this.jobName = queueLogDir + outRecalFile + ".covariates" - this.scatterCount = nContigs - } - - case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { - this.input_file :+= inBam - this.recal_file = inRecalFile - this.out = outBam - this.isIntermediate = false - this.analysisName = queueLogDir + outBam + ".recalibration" - this.jobName = queueLogDir + outBam + ".recalibration" - this.scatterCount = nContigs - } - - case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { - this.resources = R - this.recal_file = inRecalFile - this.output_dir = outPath - this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" - this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" - } -} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index f19d60930..297da8cc9 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -95,7 +95,8 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { qGraph.settings = settings - for (script <- pluginManager.createAllTypes()) { + val allQScripts = pluginManager.createAllTypes(); + for (script <- allQScripts) { logger.info("Scripting " + pluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) try { @@ -108,14 +109,26 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Added " + script.functions.size + " functions") } + // Execute the job graph qGraph.run() + // walk over each script, calling onExecutionDone + for (script <- allQScripts) { + script.onExecutionDone(qGraph.getFunctionsAndStatus(script.functions), qGraph.success) + if ( ! 
settings.disableJobReport ) { + val jobStringName = (QScriptUtils.?(settings.jobReportFile)).getOrElse(settings.qSettings.jobNamePrefix + ".jobreport.txt") + val jobReportFile = new File(jobStringName) + logger.info("Writing JobLogging GATKReport to file " + jobReportFile) + QJobReport.printReport(qGraph.getFunctionsAndStatus(script.functions), jobReportFile) + QJobReport.plotReport(settings.rScriptArgs, jobReportFile) + } + } + if (!qGraph.success) { logger.info("Done with errors") qGraph.logFailed() 1 } else { - logger.info("Done") 0 } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 5cb8d1d29..fce65c997 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -24,6 +24,7 @@ package org.broadinstitute.sting.queue +import engine.JobRunInfo import org.broadinstitute.sting.queue.function.QFunction import annotation.target.field import io.Source @@ -57,6 +58,16 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon */ def script() + /** + * A default handler for the onExecutionDone() function. By default this doesn't do anything + * except print out a fine status message. + */ + def onExecutionDone(jobs: Map[QFunction, JobRunInfo], success: Boolean) { + logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", jobs.size)) + // this is too much output + // for ( (f, info) <- jobs ) logger.info(" %s %s".format(f.jobName, info)) + } + /** * The command line functions that will be executed for this QScript. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 05c1a1775..648f9ffef 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -41,12 +41,27 @@ class QSettings { @Argument(fullName="job_queue", shortName="jobQueue", doc="Default queue for compute farm jobs.", required=false) var jobQueue: String = _ - @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs.", required=false) + @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false) var jobPriority: Option[Int] = None - @Argument(fullName="default_memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) + @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) + var jobNativeArgs: List[String] = Nil + + @Argument(fullName="job_resource_request", shortName="jobResReq", doc="Resource requests to pass to the job runner.", required=false) + var jobResourceRequests: List[String] = Nil + + @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) + var jobEnvironmentNames: List[String] = Nil + + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) var memoryLimit: Option[Double] = None + @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) + var residentLimit: Option[Double] = None + + @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) + var residentRequest: Option[Double] = None + 
@Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false) var runDirectory = new File(".") diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala index 2e3108136..2c960d8f6 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala @@ -51,10 +51,21 @@ trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging { /** The last time the status was updated */ protected var lastStatusUpdate: Long = _ - final override def status = this.lastStatus + /** The runner specific priority for a minimum priority job */ + protected val minRunnerPriority = 0 - def residentRequestMB: Option[Double] = function.memoryLimit.map(_ * 1024) - def residentLimitMB: Option[Double] = residentRequestMB.map( _ * 1.2 ) + /** The runner specific priority for a maximum priority job */ + protected val maxRunnerPriority = 0 + + /** The priority of the function in the range defined by the runner */ + protected def functionPriority = { + function.jobPriority.map { priority => + (((priority / 100D) * (maxRunnerPriority - minRunnerPriority)) + minRunnerPriority). 
+ round.intValue() min maxRunnerPriority max minRunnerPriority + } + } + + final override def status = this.lastStatus override def init() { super.init() diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala index 162ed1b3c..4cb925d9f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala @@ -23,6 +23,8 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod */ var depth = -1 + val myRunInfo: JobRunInfo = JobRunInfo.default // purely for dryRun testing + /** * Initializes with the current status of the function. */ @@ -179,4 +181,8 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod printWriter.close IOUtils.writeContents(functionErrorFile, stackTrace.toString) } + + def getRunInfo = { + if ( runner == null ) myRunInfo else runner.getRunInfo + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala index d583a55ef..a580be473 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/InProcessRunner.scala @@ -1,7 +1,9 @@ package org.broadinstitute.sting.queue.engine import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.queue.util.IOUtils +import java.util.Date +import org.broadinstitute.sting.queue.util.{Logging, IOUtils} +import org.broadinstitute.sting.utils.Utils /** * Runs a function that executes in process and does not fork out an external process. 
@@ -10,8 +12,13 @@ class InProcessRunner(val function: InProcessFunction) extends JobRunner[InProce private var runStatus: RunnerStatus.Value = _ def start() = { + getRunInfo.startTime = new Date() + getRunInfo.exechosts = Utils.resolveHostname() runStatus = RunnerStatus.RUNNING + function.run() + + getRunInfo.doneTime = new Date() val content = "%s%nDone.".format(function.description) IOUtils.writeContents(function.jobOutputFile, content) runStatus = RunnerStatus.DONE diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala index 30187f7e2..9aeb3a8ee 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala @@ -30,6 +30,9 @@ import org.broadinstitute.sting.queue.function.QFunction * Creates and stops JobRunners */ trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] { + def init() {} + def exit() {} + /** The class type of the runner. Available at runtime even after erasure. */ def functionType: Class[TFunction] @@ -52,6 +55,5 @@ trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] { * Stops a list of functions. * @param runners Runners to stop. 
*/ - def tryStop(runners: Set[TRunner]) { - } + def tryStop(runners: Set[TRunner]) {} } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala new file mode 100644 index 000000000..2caa4d2aa --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunInfo.scala @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.engine + +import java.util.Date +import java.text.SimpleDateFormat + +/** + * Class containing tracked information about a job run. 
+ */ + // todo -- it might be nice to have the hostname +class JobRunInfo { + /** constant date format */ + val formatter = new SimpleDateFormat("yy-MM-dd H:mm:ss:SSS"); + + /** The start time with millisecond resolution of this job */ + var startTime: Date = _ + /** The done time with millisecond resolution of this job */ + var doneTime: Date = _ + var exechosts: String = "localhost" + + def getStartTime = startTime + def getDoneTime = doneTime + def getFormattedStartTime = formatTime(getStartTime) + def getFormattedDoneTime = formatTime(getDoneTime) + + /** Helper function that pretty prints the date */ + private def formatTime(d: Date) = if ( d != null ) formatter.format(d) else "null" + + def getExecHosts = exechosts + + /** + * Was any information set for this jobInfo? JobInfo can be unset because + * the job never ran or because it already completed. + */ + def isFilledIn = startTime != null + + /** + * How long did the job run (in wall time)? Returns -1 if this jobInfo isn't filled in + */ + def getRuntimeInMs: Long = { + if ( isFilledIn ) + getDoneTime.getTime - getStartTime.getTime + else + -1 + } + + override def toString: String = + "started %s ended %s runtime %s".format(getFormattedStartTime, getFormattedDoneTime, getRuntimeInMs) +} + +object JobRunInfo { + def default: JobRunInfo = new JobRunInfo() +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala index de5fbde05..6dca5d89f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala @@ -69,6 +69,12 @@ trait JobRunner[TFunction <: QFunction] { def cleanup() { } + /** + * Must be overloaded + */ + val runInfo = JobRunInfo.default + def getRunInfo = runInfo + /** * Calls back to a hook that an expert user can setup to modify a job. * @param value Value to modify. 
diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index a52e9c561..4469874e2 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -38,6 +38,8 @@ import org.apache.commons.lang.StringUtils import org.broadinstitute.sting.queue.util._ import collection.immutable.{TreeSet, TreeMap} import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction} +import java.util.Date +import org.broadinstitute.sting.utils.Utils /** * The internal dependency tracker between sets of function input and output files. @@ -319,7 +321,10 @@ class QGraph extends Logging { logger.debug("+++++++") foreachFunction(readyJobs.toList, edge => { if (running) { + edge.myRunInfo.startTime = new Date() + edge.getRunInfo.exechosts = Utils.resolveHostname() logEdge(edge) + edge.myRunInfo.doneTime = new Date() edge.markAsDone } }) @@ -361,6 +366,13 @@ class QGraph extends Logging { settings.jobRunner = "Shell" commandLineManager = commandLinePluginManager.createByName(settings.jobRunner) + for (mgr <- managers) { + if (mgr != null) { + val manager = mgr.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]] + manager.init() + } + } + if (settings.startFromScratch) logger.info("Removing outputs from previous runs.") @@ -932,6 +944,14 @@ class QGraph extends Logging { edges.sorted(functionOrdering).foreach(edge => if (running) f(edge)) } + /** + * Utility function that collects every FunctionEdge in the job graph. + * @return List of all edges in jobGraph that are FunctionEdge instances.
+ */ + private def getFunctionEdges: List[FunctionEdge] = { + jobGraph.edgeSet.toList.filter(_.isInstanceOf[FunctionEdge]).asInstanceOf[List[FunctionEdge]] + } + /** * Utility function for running a method over all functions, but traversing the nodes in order of dependency. * @param edgeFunction Function to run for each FunctionEdge. @@ -1021,6 +1041,10 @@ class QGraph extends Logging { */ def isShutdown = !running + def getFunctionsAndStatus(functions: List[QFunction]): Map[QFunction, JobRunInfo] = { + getFunctionEdges.map(edge => (edge.function, edge.getRunInfo)).toMap + } + /** * Kills any forked jobs still running. */ @@ -1034,18 +1058,26 @@ class QGraph extends Logging { for (mgr <- managers) { if (mgr != null) { val manager = mgr.asInstanceOf[JobManager[QFunction,JobRunner[QFunction]]] - val managerRunners = runners - .filter(runner => manager.runnerType.isAssignableFrom(runner.getClass)) - .asInstanceOf[Set[JobRunner[QFunction]]] - if (managerRunners.size > 0) - try { - manager.tryStop(managerRunners) - } catch { - case e => /* ignore */ + try { + val managerRunners = runners + .filter(runner => manager.runnerType.isAssignableFrom(runner.getClass)) + .asInstanceOf[Set[JobRunner[QFunction]]] + if (managerRunners.size > 0) + try { + manager.tryStop(managerRunners) + } catch { + case e => /* ignore */ + } + for (runner <- managerRunners) { + try { + runner.cleanup() + } catch { + case e => /* ignore */ + } } - for (runner <- managerRunners) { + } finally { try { - runner.cleanup() + manager.exit() } catch { case e => /* ignore */ } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala index 6ece600dd..ee498c8a0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraphSettings.scala @@ -26,8 +26,9 @@ package org.broadinstitute.sting.queue.engine import 
java.io.File import org.broadinstitute.sting.queue.QSettings -import org.broadinstitute.sting.commandline.{ArgumentCollection, Argument} import org.broadinstitute.sting.queue.util.SystemUtils +import org.broadinstitute.sting.commandline.{Advanced, ArgumentCollection, Argument} +import org.broadinstitute.sting.utils.R.RScriptExecutor /** * Command line options for a QGraph. @@ -69,6 +70,16 @@ class QGraphSettings { @Argument(fullName="expanded_dot_graph", shortName="expandedDot", doc="Outputs the queue graph of scatter gather to a .dot file. Otherwise overwrites the dot_graph", required=false) var expandedDotFile: File = _ + @Argument(fullName="jobReport", shortName="jobReport", doc="File where we will write the Queue job report", required=false) + var jobReportFile: String = _ + + @Advanced + @Argument(fullName="disableJobReport", shortName="disabpleJobReport", doc="If provided, we will not create a job report", required=false) + var disableJobReport: Boolean = false + + @ArgumentCollection + var rScriptArgs = new RScriptExecutor.RScriptArgumentCollection + @ArgumentCollection val qSettings = new QSettings } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala new file mode 100644 index 000000000..4c9cc1890 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobManager.scala @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * 
The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.engine.drmaa + +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.engine.CommandLineJobManager +import org.broadinstitute.sting.jna.drmaa.v1_0.JnaSessionFactory +import org.ggf.drmaa.Session + +/** + * Runs jobs using DRMAA + */ +class DrmaaJobManager extends CommandLineJobManager[DrmaaJobRunner] { + protected var session: Session = _ + + protected def newSession() = new JnaSessionFactory().getSession + protected def contact = null + + override def init() { + session = newSession() + session.init(contact) + } + + override def exit() { + session.exit() + } + + def runnerType = classOf[DrmaaJobRunner] + def create(function: CommandLineFunction) = new DrmaaJobRunner(session, function) + + override def updateStatus(runners: Set[DrmaaJobRunner]) = { + var updatedRunners = Set.empty[DrmaaJobRunner] + runners.foreach(runner => if (runner.updateJobStatus()) {updatedRunners += runner}) + updatedRunners + } + override def tryStop(runners: Set[DrmaaJobRunner]) { + runners.filterNot(_.jobId == null).foreach(_.tryStop()) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala new file mode 100644 index 000000000..b48dcd2a9 --- /dev/null 
+++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.engine.drmaa + +import org.broadinstitute.sting.queue.QException +import org.broadinstitute.sting.queue.util.{Logging,Retry} +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} +import java.util.Collections +import org.ggf.drmaa._ + +/** + * Runs jobs using DRMAA. + */ +class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) extends CommandLineJobRunner with Logging { + /** Job Id of the currently executing job. 
*/ + var jobId: String = _ + override def jobIdString = jobId + + // Set the display name to < 512 characters of the description + // NOTE: Not sure if this is configuration specific? + protected val jobNameLength = 500 + protected val jobNameFilter = """[^A-Za-z0-9_]""" + protected def functionNativeSpec = function.jobNativeArgs.mkString(" ") + + def start() { + session.synchronized { + val drmaaJob: JobTemplate = session.createJobTemplate + + drmaaJob.setJobName(function.description.take(jobNameLength).replaceAll(jobNameFilter, "_")) + + // Set the current working directory + drmaaJob.setWorkingDirectory(function.commandDirectory.getPath) + + // Set the output file for stdout + drmaaJob.setOutputPath(":" + function.jobOutputFile.getPath) + + // If the error file is set specify the separate output for stderr + // Otherwise join with stdout + if (function.jobErrorFile != null) { + drmaaJob.setErrorPath(":" + function.jobErrorFile.getPath) + } else { + drmaaJob.setJoinFiles(true) + } + + drmaaJob.setNativeSpecification(functionNativeSpec) + + // Instead of running the function.commandLine, run "sh " + drmaaJob.setRemoteCommand("sh") + drmaaJob.setArgs(Collections.singletonList(jobScript.toString)) + + // Allow advanced users to update the request via QFunction.updateJobRun() + updateJobRun(drmaaJob) + + updateStatus(RunnerStatus.RUNNING) + + // Start the job and store the id so it can be killed in tryStop + try { + Retry.attempt(() => { + try { + jobId = session.runJob(drmaaJob) + } catch { + case de: DrmaaException => throw new QException("Unable to submit job: " + de.getLocalizedMessage) + } + }, 1, 5, 10) + } finally { + // Prevent memory leaks + session.deleteJobTemplate(drmaaJob) + } + logger.info("Submitted job id: " + jobId) + } + } + + def updateJobStatus() = { + session.synchronized { + var returnStatus: RunnerStatus.Value = null + + try { + val jobStatus = session.getJobProgramStatus(jobId); + jobStatus match { + case Session.QUEUED_ACTIVE => returnStatus 
= RunnerStatus.RUNNING + case Session.DONE => + val jobInfo: JobInfo = session.wait(jobId, Session.TIMEOUT_NO_WAIT) + if ((jobInfo.hasExited && jobInfo.getExitStatus != 0) + || jobInfo.hasSignaled + || jobInfo.wasAborted) + returnStatus = RunnerStatus.FAILED + else + returnStatus = RunnerStatus.DONE + case Session.FAILED => returnStatus = RunnerStatus.FAILED + case Session.UNDETERMINED => logger.warn("Unable to determine status of job id " + jobId) + case _ => returnStatus = RunnerStatus.RUNNING + } + } catch { + // getJobProgramStatus will throw an exception once wait has run, as the + // job will be reaped. If the status is currently DONE or FAILED, return + // the status. + case de: DrmaaException => + if (lastStatus == RunnerStatus.DONE || lastStatus == RunnerStatus.FAILED) + returnStatus = lastStatus + else + logger.warn("Unable to determine status of job id " + jobId, de) + } + + if (returnStatus != null) { + updateStatus(returnStatus) + true + } else { + false + } + } + } + + def tryStop() { + session.synchronized { + try { + // Stop runners. SIGTERM(15) is preferred to SIGKILL(9). 
+ // Only way to send SIGTERM is for the Sys Admin to set the terminate_method + // resource of the designated queue to SIGTERM + session.control(jobId, Session.TERMINATE) + } catch { + case e => + logger.error("Unable to kill job " + jobId, e) + } + } + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala index 78bd2cc78..7299036ed 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobManager.scala @@ -24,13 +24,9 @@ package org.broadinstitute.sting.queue.engine.gridengine -import org.broadinstitute.sting.queue.engine.CommandLineJobManager import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.engine.drmaa.DrmaaJobManager -class GridEngineJobManager extends CommandLineJobManager[GridEngineJobRunner] { - def runnerType = classOf[GridEngineJobRunner] - def create(function: CommandLineFunction) = new GridEngineJobRunner(function) - - override def updateStatus(runners: Set[GridEngineJobRunner]) = { GridEngineJobRunner.updateStatus(runners) } - override def tryStop(runners: Set[GridEngineJobRunner]) { GridEngineJobRunner.tryStop(runners) } +class GridEngineJobManager extends DrmaaJobManager { + override def create(function: CommandLineFunction) = new GridEngineJobRunner(session, function) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 8c639b5bb..96e3ffd95 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -24,203 +24,52 @@
package org.broadinstitute.sting.queue.engine.gridengine -import org.broadinstitute.sting.queue.QException -import org.broadinstitute.sting.queue.util.{Logging,Retry} +import org.broadinstitute.sting.queue.util.Logging import org.broadinstitute.sting.queue.function.CommandLineFunction -import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} -import org.ggf.drmaa.{DrmaaException,JobInfo,JobTemplate,Session,SessionFactory} -import java.util.Collections +import org.broadinstitute.sting.queue.engine.drmaa.DrmaaJobRunner +import org.ggf.drmaa.Session /** * Runs jobs on a Grid Engine compute cluster. */ -class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLineJobRunner with Logging { - // Run the static initializer for GridEngineJobRunner - GridEngineJobRunner - - /** Job Id of the currently executing job. */ - private var jobId: String = _ - override def jobIdString = jobId - - def start() { - GridEngineJobRunner.gridEngineSession.synchronized { - val gridEngineJob: JobTemplate = GridEngineJobRunner.gridEngineSession.createJobTemplate - - // Force the remote environment to inherit local environment settings - var nativeSpecString: String = "-V" - - // Set the display name to < 512 characters of the description - // NOTE: Not sure if this is configuration specific? 
- gridEngineJob.setJobName(GridEngineJobRunner.toJobName(function.description.take(500))) - - // Set the output file for stdout - gridEngineJob.setOutputPath(":" + function.jobOutputFile.getPath) - - // Set the current working directory - gridEngineJob.setWorkingDirectory(function.commandDirectory.getPath) - - // If the error file is set specify the separate output for stderr - // Otherwise join with stdout - if (Option(function.jobErrorFile) != None) { - gridEngineJob.setErrorPath(":" + function.jobErrorFile.getPath) - } else { - gridEngineJob.setJoinFiles(true) - } - - // If a project name is set specify the project name - if (Option(function.jobProject) != None) { - nativeSpecString += " -P " + function.jobProject - } - - // If the job queue is set specify the job queue - if (Option(function.jobQueue) != None) { - nativeSpecString += " -q " + function.jobQueue - } - - // If the resident set size is requested pass on the memory request - if (residentRequestMB.isDefined) { - nativeSpecString += " -l mem_free=%dM".format(residentRequestMB.get.ceil.toInt) - } - - // If the resident set size limit is defined specify the memory limit - if (residentLimitMB.isDefined) { - nativeSpecString += " -l h_rss=%dM".format(residentLimitMB.get.ceil.toInt) - } - - // If the priority is set (user specified Int) specify the priority - if (function.jobPriority.isDefined) { - nativeSpecString += " -p " + function.jobPriority.get - } - - gridEngineJob.setNativeSpecification(nativeSpecString) - - // Instead of running the function.commandLine, run "sh " - gridEngineJob.setRemoteCommand("sh") - gridEngineJob.setArgs(Collections.singletonList(jobScript.toString)) - - // Allow advanced users to update the request via QFunction.updateJobRun() - updateJobRun(gridEngineJob) - - updateStatus(RunnerStatus.RUNNING) - - // Start the job and store the id so it can be killed in tryStop - try { - Retry.attempt(() => { - try { - jobId = GridEngineJobRunner.gridEngineSession.runJob(gridEngineJob) - } 
catch { - case de: DrmaaException => throw new QException("Unable to submit job: " + de.getLocalizedMessage) - } - }, 1, 5, 10) - } finally { - // Prevent memory leaks - GridEngineJobRunner.gridEngineSession.deleteJobTemplate(gridEngineJob) - } - logger.info("Submitted Grid Engine job id: " + jobId) - } - } -} - -object GridEngineJobRunner extends Logging { - private val gridEngineSession = SessionFactory.getFactory.getSession - - initGridEngine() - - /** - * Initialize the Grid Engine library. - */ - private def initGridEngine() { - gridEngineSession.synchronized { - try { - gridEngineSession.init("") - } catch { - case de: DrmaaException => - logger.error("Issue initializing Grid Engine", de) - throw new QException("init() failed", de) - } - } - } - - /** - * Updates the status of a list of jobs. - * @param runners Runners to update. - * @return runners which were updated. - */ - def updateStatus(runners: Set[GridEngineJobRunner]) = { - var updatedRunners = Set.empty[GridEngineJobRunner] - gridEngineSession.synchronized { - runners.foreach(runner => if (updateRunnerStatus(runner)) {updatedRunners += runner}) - } - updatedRunners - } - - /** - * Tries to stop any running jobs. - * @param runners Runners to stop. - */ - def tryStop(runners: Set[GridEngineJobRunner]) { - // Stop runners. SIGTERM(15) is preferred to SIGKILL(9). 
- // Only way to send SIGTERM is for the Sys Admin set the terminate_method - // resource of the designated queue to SIGTERM - gridEngineSession.synchronized { - for (runner <- runners.filterNot(runner => Option(runner.jobId) == None)) { - try { - gridEngineSession.control(runner.jobId, Session.TERMINATE) - } catch { - case e => - logger.error("Unable to kill job " + runner.jobId, e) - } - } - gridEngineSession.exit() - } - } - - private def updateRunnerStatus(runner: GridEngineJobRunner): Boolean = { - var returnStatus: RunnerStatus.Value = null - - try { - val jobStatus = gridEngineSession.getJobProgramStatus(runner.jobId); - jobStatus match { - case Session.QUEUED_ACTIVE => returnStatus = RunnerStatus.RUNNING - case Session.DONE => - val jobInfo: JobInfo = gridEngineSession.wait(runner.jobId, Session.TIMEOUT_NO_WAIT) - if ((jobInfo.hasExited && jobInfo.getExitStatus > 0) - || jobInfo.hasSignaled - || jobInfo.wasAborted) - returnStatus = RunnerStatus.FAILED - else - returnStatus = RunnerStatus.DONE - case Session.FAILED => returnStatus = RunnerStatus.FAILED - case Session.UNDETERMINED => logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId) - case _ => returnStatus = RunnerStatus.RUNNING - } - } catch { - // getJobProgramStatus will throw an exception once wait has run, as the - // job will be reaped. If the status is currently DONE or FAILED, return - // the status. 
- case de: DrmaaException => - if (runner.lastStatus == RunnerStatus.DONE || runner.lastStatus == RunnerStatus.FAILED) - returnStatus = runner.lastStatus - else - logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId, de) - } - - if (returnStatus != null) { - runner.updateStatus(returnStatus) - true - } else { - false - } - } - - // Reap what we've sown - override def finalize() { - gridEngineSession.exit() - } - +class GridEngineJobRunner(session: Session, function: CommandLineFunction) extends DrmaaJobRunner(session, function) with Logging { // Grid Engine disallows certain characters from being in job names. // This replaces all illegal characters with underscores - private def toJobName(name: String): String = { - name.replaceAll("""[\n\t\r/:@\\*?]""", "_") + protected override val jobNameFilter = """[\n\t\r/:@\\*?]""" + protected override val minRunnerPriority = -1023 + protected override val maxRunnerPriority = 0 + + override protected def functionNativeSpec = { + // Force the remote environment to inherit local environment settings + var nativeSpec: String = "-V" + + // If a project name is set specify the project name + if (function.jobProject != null) + nativeSpec += " -P " + function.jobProject + + // If the job queue is set specify the job queue + if (function.jobQueue != null) + nativeSpec += " -q " + function.jobQueue + + // If the resident set size is requested pass on the memory request + if (function.residentRequest.isDefined) + nativeSpec += " -l mem_free=%dM".format(function.residentRequest.map(_ * 1024).get.ceil.toInt) + + // If the resident set size limit is defined specify the memory limit + if (function.residentLimit.isDefined) + nativeSpec += " -l h_rss=%dM".format(function.residentLimit.map(_ * 1024).get.ceil.toInt) + + // Pass on any job resource requests + nativeSpec += function.jobResourceRequests.map(" -l " + _).mkString + + // Pass on any job environment names + nativeSpec += function.jobEnvironmentNames.map(" 
-pe " + _).mkString + + // If the priority is set specify the priority + val priority = functionPriority + if (priority.isDefined) + nativeSpec += " -p " + priority.get + + (nativeSpec + " " + super.functionNativeSpec).trim() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 46dd08332..323cc63ff 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -31,9 +31,12 @@ import org.broadinstitute.sting.jna.lsf.v7_0_6.{LibLsf, LibBat} import org.broadinstitute.sting.utils.Utils import org.broadinstitute.sting.jna.clibrary.LibC import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit} -import com.sun.jna.ptr.IntByReference import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} -import com.sun.jna.{Structure, StringArray, NativeLong} +import java.util.regex.Pattern +import java.lang.StringBuffer +import java.util.Date +import com.sun.jna.{Pointer, Structure, StringArray, NativeLong} +import com.sun.jna.ptr.{PointerByReference, IntByReference} /** * Runs jobs on an LSF compute cluster. @@ -47,12 +50,22 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR private var jobId = -1L override def jobIdString = jobId.toString + protected override val minRunnerPriority = 1 + protected override val maxRunnerPriority = Lsf706JobRunner.maxUserPriority + + private val selectString = new StringBuffer() + private val usageString = new StringBuffer() + private val requestString = new StringBuffer() + /** * Dispatches the function on the LSF cluster. * @param function Command to run. 
*/ def start() { Lsf706JobRunner.lsfLibLock.synchronized { + + parseResourceRequest() + val request = new submit for (i <- 0 until LibLsf.LSF_RLIM_NLIMITS) request.rLimits(i) = LibLsf.DEFAULT_RLIMIT; @@ -81,28 +94,45 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR } // If the resident set size is requested pass on the memory request - if (residentRequestMB.isDefined) { - val memInUnits = Lsf706JobRunner.convertUnits(residentRequestMB.get) - request.resReq = "select[mem>%1$d] rusage[mem=%1$d]".format(memInUnits) + if (function.residentRequest.isDefined) { + val memInUnits = Lsf706JobRunner.convertUnits(function.residentRequest.get) + appendRequest("select", selectString, "&&", "mem>%d".format(memInUnits)) + appendRequest("rusage", usageString, ",", "mem=%d".format(memInUnits)) + } + + val resReq = getResourceRequest + if (resReq.length > 0) { + request.resReq = resReq request.options |= LibBat.SUB_RES_REQ } // If the resident set size limit is defined specify the memory limit - if (residentLimitMB.isDefined) { - val memInUnits = Lsf706JobRunner.convertUnits(residentLimitMB.get) + if (function.residentLimit.isDefined) { + val memInUnits = Lsf706JobRunner.convertUnits(function.residentLimit.get) request.rLimits(LibLsf.LSF_RLIMIT_RSS) = memInUnits } // If the priority is set (user specified Int) specify the priority - if (function.jobPriority.isDefined) { - request.userPriority = function.jobPriority.get + val priority = functionPriority + if (priority.isDefined) { + request.userPriority = priority.get request.options2 |= LibBat.SUB2_JOB_PRIORITY } - // Broad specific requirement, our esub requires there be a project - // else it will spit out a warning to stdout. 
see $LSF_SERVERDIR/esub - request.projectName = if (function.jobProject != null) function.jobProject else "Queue" - request.options |= LibBat.SUB_PROJECT_NAME + // Set the project to either the function or LSF default + val project = if (function.jobProject != null) function.jobProject else Lsf706JobRunner.defaultProject + if (project != null) { + request.projectName = project + request.options |= LibBat.SUB_PROJECT_NAME + } + + // Set the esub names based on the job environment names + if (!function.jobEnvironmentNames.isEmpty) { + val argv = Array("", "-a", function.jobEnvironmentNames.mkString(" ")) + val setOptionResult = LibBat.setOption_(argv.length, new StringArray(argv), "a:", request, ~0, ~0, ~0, null); + if (setOptionResult == -1) + throw new QException("setOption_() returned -1 while setting esub"); + } // LSF specific: get the max runtime for the jobQueue and pass it for this job request.rLimits(LibLsf.LSF_RLIMIT_RUN) = Lsf706JobRunner.getRlimitRun(function.jobQueue) @@ -132,6 +162,41 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? 
/ ???".format(jobId)) super.checkUnknownStatus() } + + private def parseResourceRequest() { + requestString.setLength(0) + selectString.setLength(0) + usageString.setLength(0) + + requestString.append(function.jobResourceRequests.mkString(" ")) + extractSection(requestString, "select", selectString) + extractSection(requestString, "rusage", usageString) + } + + private def extractSection(requestString: StringBuffer, section: String, sectionString: StringBuffer) { + val pattern = Pattern.compile(section + "\\s*\\[[^\\]]+\\]\\s*"); + val matcher = pattern.matcher(requestString.toString) + if (matcher.find()) { + sectionString.setLength(0) + sectionString.append(matcher.group().trim()) + + val sb = new StringBuffer + matcher.appendReplacement(sb, "") + matcher.appendTail(sb) + + requestString.setLength(0) + requestString.append(sb) + } + } + + private def appendRequest(section: String, sectionString: StringBuffer, separator: String, request: String) { + if (sectionString.length() == 0) + sectionString.append(section).append("[").append(request).append("]") + else + sectionString.insert(sectionString.length() - 1, separator + request) + } + + private def getResourceRequest = "%s %s %s".format(selectString, usageString, requestString).trim() } object Lsf706JobRunner extends Logging { @@ -141,15 +206,23 @@ object Lsf706JobRunner extends Logging { /** Number of seconds for a non-normal exit status before we give up on expecting LSF to retry the function. */ private val retryExpiredSeconds = 5 * 60 - initLsf() - /** * Initialize the Lsf library. 
*/ - private def initLsf() { + private val (defaultQueue, defaultProject, maxUserPriority) = { lsfLibLock.synchronized { if (LibBat.lsb_init("Queue") < 0) throw new QException(LibBat.lsb_sperror("lsb_init() failed")) + + val parameterInfo = LibBat.lsb_parameterinfo(null, null, 0); + var defaultQueue: String = parameterInfo.defaultQueues + val defaultProject = parameterInfo.defaultProject + val maxUserPriority = parameterInfo.maxUserPriority + + if (defaultQueue != null && defaultQueue.indexOf(' ') > 0) + defaultQueue = defaultQueue.split(" ")(0) + + (defaultQueue, defaultProject, maxUserPriority) } } @@ -199,12 +272,27 @@ object Lsf706JobRunner extends Logging { logger.debug("Job Id %s status / exitStatus / exitInfo: 0x%02x / 0x%02x / 0x%02x".format(runner.jobId, jobStatus, exitStatus, exitInfo)) + def updateRunInfo() { + // the platform LSF startTimes are in seconds, not milliseconds, so convert to the java convention + runner.getRunInfo.startTime = new Date(jobInfo.startTime.longValue * 1000) + runner.getRunInfo.doneTime = new Date(jobInfo.endTime.longValue * 1000) + val exHostsRaw = jobInfo.exHosts.getStringArray(0) + //logger.warn("exHostsRaw = " + exHostsRaw) + val exHostsList = exHostsRaw.toList + //logger.warn("exHostsList = " + exHostsList) + val exHosts = exHostsList.reduceLeft(_ + "," + _) + //logger.warn("exHosts = " + exHosts) + runner.getRunInfo.exechosts = exHosts + } + runner.updateStatus( if (Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_DONE)) { // Done successfully. + updateRunInfo() RunnerStatus.DONE } else if (Utils.isFlagSet(jobStatus, LibBat.JOB_STAT_EXIT) && !willRetry(exitInfo, endTime)) { // Exited function that (probably) won't be retried. + updateRunInfo() RunnerStatus.FAILED } else { // Note that we still saw the job in the system. @@ -249,17 +337,6 @@ object Lsf706JobRunner extends Logging { } } - /** The name of the default queue. 
*/ - private lazy val defaultQueue: String = { - lsfLibLock.synchronized { - val numQueues = new IntByReference(1) - val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0) - if (queueInfo == null) - throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue")) - queueInfo.queue - } - } - /** The run limits for each queue. */ private var queueRlimitRun = Map.empty[String,Int] @@ -299,15 +376,15 @@ object Lsf706JobRunner extends Logging { Structure.autoRead(unitsParam.asInstanceOf[Array[Structure]]) unitsParam(0).paramValue match { - case "MB" => 1D - case "GB" => 1024D - case "TB" => 1024D * 1024 - case "PB" => 1024D * 1024 * 1024 - case "EB" => 1024D * 1024 * 1024 * 1024 - case null => 1D + case "MB" => 1 / 1024D + case "GB" => 1D + case "TB" => 1024D + case "PB" => 1024D * 1024 + case "EB" => 1024D * 1024 * 1024 + case null => 1 / 1024D } } } - private def convertUnits(mb: Double) = (mb / unitDivisor).ceil.toInt + private def convertUnits(gb: Double) = (gb / unitDivisor).ceil.toInt } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala index 03f9d3315..ae899868a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala @@ -27,6 +27,9 @@ package org.broadinstitute.sting.queue.engine.shell import org.broadinstitute.sting.queue.function.CommandLineFunction import org.broadinstitute.sting.queue.util.ShellJob import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} +import java.util.Date +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport +import org.broadinstitute.sting.utils.Utils /** * Runs jobs one at a time locally @@ -50,8 +53,11 @@ class ShellJobRunner(val function: CommandLineFunction) extends CommandLineJobRu // Allow advanced users 
to update the job. updateJobRun(job) + getRunInfo.startTime = new Date() + getRunInfo.exechosts = Utils.resolveHostname() updateStatus(RunnerStatus.RUNNING) job.run() + getRunInfo.doneTime = new Date() updateStatus(RunnerStatus.DONE) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 27e186585..d70022147 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -43,8 +43,7 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { this.intervals = this.originalGATK.intervals this.intervalsString = this.originalGATK.intervalsString - this.rodBind = this.gatherParts.zipWithIndex map { case (input, index) => new RodBind("input"+index, "VCF", input) } - this.rod_priority_list = (0 until this.gatherParts.size).map("input"+_).mkString(",") + this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.out = this.originalOutput this.assumeIdenticalSamples = true diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala index 2508f5776..5456ed02c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/AddOrReplaceReadGroups.scala @@ -15,13 +15,13 @@ class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.Jav javaMainClass = "net.sf.picard.sam.AddOrReplaceReadGroups" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The output BAM file with the modified/added read groups", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ @Argument(doc="Read group ID", shortName = "id", fullName = "read_group_id", required = true) var RGID: String = _ @@ -44,6 +44,12 @@ class AddOrReplaceReadGroups extends org.broadinstitute.sting.queue.function.Jav @Argument(doc = "Read group description", shortName = "ds", fullName = "read_group_description", required = false) var RGDS: String = "" + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + override def inputBams = input override def outputBam = output diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala index 6f006ffad..d44d5e004 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MarkDuplicates.scala @@ -15,13 +15,13 @@ class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommand javaMainClass = "net.sf.picard.sam.MarkDuplicates" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The output file to write marked records to", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ @Output(doc="File to write duplication metrics to", shortName = "out_metrics", fullName = "output_metrics_file", required = false) var metrics: File = new File(output + ".metrics") @@ -35,6 +35,13 @@ class MarkDuplicates extends org.broadinstitute.sting.queue.function.JavaCommand @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false) var SORTING_COLLECTION_SIZE_RATIO: Double = -1 + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input override def outputBam = output this.sortOrder = null diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala index a7e74e1b5..fd107890e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/MergeSamFiles.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ import java.io.File +import org.broadinstitute.sting.queue.QScript._ /* * 
Created by IntelliJ IDEA. @@ -15,13 +16,13 @@ class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandL javaMainClass = "net.sf.picard.sam.MergeSamFiles" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The output merged BAM file", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ @Argument(doc="Merge the seqeunce dictionaries Default value: false. This option can be set to 'null' to clear the default value.", shortName = "merge_dict", fullName = "merge_sequence_dictionaries", required = false) var MERGE_SEQUENCE_DICTIONARIES: Boolean = false @@ -32,6 +33,13 @@ class MergeSamFiles extends org.broadinstitute.sting.queue.function.JavaCommandL @Argument(doc = "Comments to include in the merged output file's header.", shortName = "com", fullName = "comments", required = false) var COMMENT: String = "" + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input override def outputBam = output this.createIndex = Some(true) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala new file mode 100644 index 000000000..72489dc87 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ReorderSam.scala @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File 
+/* + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "ReorderSam" + javaMainClass = "net.sf.picard.sam.ReorderSam" + + @Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true) + var input: List[File] = Nil + + @Output(doc="Output file (bam or sam) to write extracted reads to.", shortName = "output", fullName = "output_bam_file", required = true) + var output: File = _ + + @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) + var outputIndex: File = _ + + @Argument(doc="Reference sequence to reorder reads to match.", shortName = "ref", fullName = "sort_reference", required = true) + var sortReference: File = _ + + @Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false) + var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = _ + + @Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. 
Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false) + var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = _ + + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(true) + this.sortOrder = null + override def commandLine = super.commandLine + + " REFERENCE=" + sortReference + + optional(" ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE) + optional(" ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE) +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala new file mode 100644 index 000000000..746ce609e --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/RevertSam.scala @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline._ + +import java.io.File + +/* + * Created by IntelliJ IDEA. 
+ * User: carneiro + * Date: 6/22/11 + * Time: 10:35 AM + */ +class RevertSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "RevertSam" + javaMainClass = "net.sf.picard.sam.RevertSam" + + @Input(shortName = "input", fullName = "input_bam_files", required = true, doc = "The input SAM or BAM files to revert.") + var input: List[File] = Nil + + @Output(shortName = "output", fullName = "output_bam_file", required = true, doc = "The reverted BAM or SAM output file.") + var output: File = _ + + @Output(shortName = "out_index", fullName = "output_bam_index_file", required = false, doc = "The output bam index") + var outputIndex: File = _ + + @Argument(shortName = "roq", fullName = "restore_original_qualities", required = false, doc = "True to restore original qualities from the OQ field to the QUAL field if available.") + var restoreOriginalQualities: Boolean = true + + @Argument(shortName = "rdi", fullName = "remove_duplicate_information", required = false, doc = "Remove duplicate read flags from all reads. Note that if this is true and REMOVE_ALIGNMENT_INFORMATION==false, the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") + var removeDuplicateInformation: Boolean = true + + @Argument(shortName = "rai", fullName = "remove_alignment_information", required = false, doc = "Remove all alignment information from the file.") + var removeAlignmentInformation: Boolean = true + + @Argument(shortName = "atc", fullName = "attributes_to_clear", required = false, doc = "When removing alignment information, the set of optional tags to remove.") + var attributesToClear: List[String] = Nil + + @Argument(shortName = "sa", fullName = "sample_alias", required = false, doc = "The sample alias to use in the reverted output file. 
This will override the existing sample alias in the file and is used only if all the read groups in the input file have the same sample alias.") + var sampleAlias: String = null + + @Argument(shortName = "ln", fullName = "library_name", required = false, doc = "The library name to use in the reverted output file. This will override the existing sample alias in the file and is used only if all the read groups in the input file have the same sample alias.") + var libraryName: String = null + + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + + override def inputBams = input + override def outputBam = output + this.createIndex = Some(true) + override def commandLine = super.commandLine + + conditionalParameter(!restoreOriginalQualities, " RESTORE_ORIGINAL_QUALITIES=false") + + conditionalParameter(!removeDuplicateInformation, " REMOVE_DUPLICATE_INFORMATION=false") + + conditionalParameter(!removeAlignmentInformation, " REMOVE_ALIGNMENT_INFORMATION=false") + + conditionalParameter(!attributesToClear.isEmpty, repeat(" ATTRIBUTE_TO_CLEAR=", attributesToClear)) + + conditionalParameter(sampleAlias != null, " SAMPLE_ALIAS=" + sampleAlias) + + conditionalParameter(libraryName != null, " LIBRARY_NAME=" + libraryName) +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala index cc26f7471..a56093be8 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -3,6 +3,7 @@ package org.broadinstitute.sting.queue.extensions.picard import org.broadinstitute.sting.commandline._ import java.io.File +import org.broadinstitute.sting.queue.QScript._ /* * Created by IntelliJ IDEA. 
@@ -15,13 +16,21 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun javaMainClass = "net.sf.picard.sam.SortSam" @Input(doc="The input SAM or BAM files to sort.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="The sorted BAM or SAM output file.", shortName = "output", fullName = "output_bam_file", required = true) var output: File = _ @Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false) - var outputIndex: File = new File(output + ".bai") + var outputIndex: File = _ + + override def freezeFieldValues() { + super.freezeFieldValues() + if (outputIndex == null && output != null) + outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + override def inputBams = input override def outputBam = output diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala index 726682b89..2c8fbc6d9 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/ValidateSamFile.scala @@ -17,7 +17,7 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman javaMainClass = "net.sf.picard.sam.ValidateSamFile" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) - var input: List[File] = _ + var input: List[File] = Nil @Output(doc="Send output to a file instead of stdout", shortName = "output", fullName = "output_file", required = false) var output: File = _ @@ -26,7 +26,7 @@ class ValidateSamFile extends org.broadinstitute.sting.queue.function.JavaComman var MODE: Mode = Mode.VERBOSE @Argument(doc="List of validation error types to ignore.", shortName = "ignore", fullName = "ignore_error_types", required = false) - var IGNORE: List[String] = _ + var IGNORE: List[String] = Nil @Argument(doc = "The maximum number of lines output in verbose mode.", shortName = "max", fullName = "max_output", required = false) var MAX_OUTPUT: Int = 100 diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index c62fdcd7c..ff77503ac 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -11,12 +11,27 @@ trait CommandLineFunction extends QFunction with Logging { /** Upper memory limit */ var memoryLimit: Option[Double] = None + /** Resident memory limit */ + var residentLimit: Option[Double] = None + + /** Resident memory request */ + var residentRequest: Option[Double] = None + /** Job project to run the command */ var jobProject: String = _ /** Job queue to run the command */ var jobQueue: String = _ + /** Native arguments to pass to the job runner */ + var jobNativeArgs: List[String] = Nil + + /** Native arguments to pass to the job runner */ + var jobResourceRequests: List[String] = Nil + + /** Environment names to pass to the job runner */ + var jobEnvironmentNames: List[String] = Nil + override def copySettingsTo(function: QFunction) { super.copySettingsTo(function) function match { @@ 
-24,13 +39,27 @@ trait CommandLineFunction extends QFunction with Logging { if (commandLineFunction.memoryLimit.isEmpty) commandLineFunction.memoryLimit = this.memoryLimit + if (commandLineFunction.residentLimit.isEmpty) + commandLineFunction.residentLimit = this.residentLimit + + if (commandLineFunction.residentRequest.isEmpty) + commandLineFunction.residentRequest = this.residentRequest + if (commandLineFunction.jobProject == null) commandLineFunction.jobProject = this.jobProject if (commandLineFunction.jobQueue == null) commandLineFunction.jobQueue = this.jobQueue - commandLineFunction.jobQueue = this.jobQueue + if (commandLineFunction.jobNativeArgs.isEmpty) + commandLineFunction.jobNativeArgs = this.jobNativeArgs + + if (commandLineFunction.jobResourceRequests.isEmpty) + commandLineFunction.jobResourceRequests = this.jobResourceRequests + + if (commandLineFunction.jobEnvironmentNames.isEmpty) + commandLineFunction.jobEnvironmentNames = this.jobEnvironmentNames + case _ => /* ignore */ } } @@ -53,9 +82,30 @@ trait CommandLineFunction extends QFunction with Logging { if (jobProject == null) jobProject = qSettings.jobProject + if (jobNativeArgs.isEmpty) + jobNativeArgs = qSettings.jobNativeArgs + + if (jobResourceRequests.isEmpty) + jobResourceRequests = qSettings.jobResourceRequests + + if (jobEnvironmentNames.isEmpty) + jobEnvironmentNames = qSettings.jobEnvironmentNames + if (memoryLimit.isEmpty) memoryLimit = qSettings.memoryLimit + if (residentLimit.isEmpty) + residentLimit = qSettings.residentLimit + + if (residentRequest.isEmpty) + residentRequest = qSettings.residentRequest + + if (residentRequest.isEmpty) + residentRequest = memoryLimit + + if (residentLimit.isEmpty) + residentLimit = residentRequest.map( _ * 1.2 ) + super.freezeFieldValues() } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 7048b6413..c905581fa 100644 --- 
a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -30,14 +30,14 @@ import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.{QException, QSettings} import collection.JavaConversions._ import org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction -import org.broadinstitute.sting.queue.util.{Logging, CollectionUtils, IOUtils, ReflectionUtils} +import org.broadinstitute.sting.queue.util._ /** * The base interface for all functions in Queue. * Inputs and outputs are specified as Sets of values. * Inputs are matched to other outputs by using .equals() */ -trait QFunction extends Logging { +trait QFunction extends Logging with QJobReport { /** A short description of this step in the graph */ var analysisName: String = "" @@ -83,11 +83,17 @@ trait QFunction extends Logging { */ var deleteIntermediateOutputs = true + // ------------------------------------------------------- + // + // job run information + // + // ------------------------------------------------------- + /** * Copies settings from this function to another function. * @param function QFunction to copy values to. */ - def copySettingsTo(function: QFunction) { + override def copySettingsTo(function: QFunction) { function.analysisName = this.analysisName function.jobName = this.jobName function.qSettings = this.qSettings @@ -99,6 +105,8 @@ trait QFunction extends Logging { function.updateJobRun = this.updateJobRun function.isIntermediate = this.isIntermediate function.deleteIntermediateOutputs = this.deleteIntermediateOutputs + function.reportGroup = this.reportGroup + function.reportFeatures = this.reportFeatures } /** File to redirect any output. 
Defaults to .out */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala new file mode 100644 index 000000000..85896da66 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.util +import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.queue.engine.JobRunInfo +import java.io.{FileOutputStream, PrintStream, File} +import org.broadinstitute.sting.queue.function.scattergather.{GathererFunction, ScatterFunction} +import org.broadinstitute.sting.utils.R.RScriptExecutor.RScriptArgumentCollection +import org.broadinstitute.sting.utils.R.RScriptExecutor +import org.broadinstitute.sting.queue.QScript + +/** + * A mixin to add Job info to the class + */ +trait QJobReport extends Logging { + self: QFunction => + + protected var reportGroup: String = null + protected var reportFeatures: Map[String, String] = Map() + protected var reportEnabled: Boolean = true + + def includeInReport = reportEnabled + def enableReport() { reportEnabled = true } + def disableReport() { reportEnabled = false } + + def setRunInfo(info: JobRunInfo) { + //logger.info("info " + info) + reportFeatures = Map( + "iteration" -> 1, + "analysisName" -> getReportGroup, + "jobName" -> QJobReport.workAroundSameJobNames(this), + "intermediate" -> self.isIntermediate, + "exechosts" -> info.getExecHosts, + "startTime" -> info.getStartTime.getTime, + "doneTime" -> info.getDoneTime.getTime, + "formattedStartTime" -> info.getFormattedStartTime, + "formattedDoneTime" -> info.getFormattedDoneTime, + "runtime" -> info.getRuntimeInMs).mapValues((x:Any) => if (x != null) x.toString else "null") ++ reportFeatures + // note -- by adding reportFeatures second we override iteration + // (or any other binding) with the user provided value + } + + /** The report Group is the analysis name transform to only contain valid GATKReportTable characters */ + def getReportGroup = self.analysisName.replaceAll(GATKReportTable.INVALID_TABLE_NAME_REGEX, "_") + def getReportFeatures = 
reportFeatures + + def getReportFeatureNames: List[String] = getReportFeatures.keys.toList + def getReportFeature(key: String): String = { + getReportFeatures.get(key) match { + case Some(x) => x + case None => throw new RuntimeException("Get called with key %s but no value was found".format(key)) + } + } + + def getReportName: String = getReportFeature("jobName") + + def configureJobReport(features: Map[String, Any]) { + this.reportFeatures = features.mapValues(_.toString) + } + + // copy the QJobReport information -- todo : what's the best way to do this? + override def copySettingsTo(function: QFunction) { + self.copySettingsTo(function) + function.reportFeatures = this.reportFeatures + } +} + +object QJobReport { + val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" + + // todo -- fixme to have a unique name for Scatter/gather jobs as well + var seenCounter = 1 + var seenNames = Set[String]() + + def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { + val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) + jobs foreach {case (qf, info) => qf.setRunInfo(info)} + val stream = new PrintStream(new FileOutputStream(dest)) + printJobLogging(jobs.keys.toList, stream) + stream.close() + } + + def plotReport(args: RScriptArgumentCollection, jobReportFile: File) { + val executor = new RScriptExecutor(args, false) // don't except on error + val pdf = jobReportFile.getAbsolutePath + ".pdf" + executor.callRScripts(JOB_REPORT_QUEUE_SCRIPT, jobReportFile.getAbsolutePath, pdf) + } + + def workAroundSameJobNames(func: QFunction):String = { + if ( seenNames.apply(func.jobName) ) { + seenCounter += 1 + "%s_%d".format(func.jobName, seenCounter) + } else { + seenNames += func.jobName + func.jobName + } + } + + /** + * Prints the JobLogging logs to a GATKReport. 
First splits up the + * logs by group, and for each group generates a GATKReportTable + */ + private def printJobLogging(logs: List[QFunction], stream: PrintStream) { + // create the report + val report: GATKReport = new GATKReport + + // create a table for each group of logs + for ( (group, groupLogs) <- groupLogs(logs) ) { + report.addTable(group, "Job logs for " + group) + val table: GATKReportTable = report.getTable(group) + table.addPrimaryKey("jobName", false) + val keys = logKeys(groupLogs) + + // add the columns + keys.foreach(table.addColumn(_, 0)) + for (log <- groupLogs) { + for ( key <- keys ) + table.set(log.getReportName, key, log.getReportFeature(key)) + } + } + + report.print(stream) + } + + private def groupLogs(logs: List[QFunction]): Map[String, List[QFunction]] = { + logs.groupBy(_.getReportGroup) + } + + private def logKeys(logs: List[QFunction]): Set[String] = { + // the keys should be the same for each log, but we will check that + val keys = Set[String](logs(0).getReportFeatureNames : _*) + + for ( log <- logs ) + if ( keys.sameElements(Set(log.getReportFeatureNames)) ) + throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + + "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) + + keys + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 99aaa9474..3b1b2ece1 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -22,15 +22,15 @@ object QScriptUtils { * to have empty lines and comment lines (lines starting with #). */ def createListFromFile(in: File):List[File] = { - // If the file provided ends with .bam, it is not a bam list, we treat it as a single file. 
+ // If the file provided ends with .bam, .fasta or .fq, it is not a bam list, we treat it as a single file. // and return a list with only this file. - if (in.toString.endsWith(".bam")) + if (in.toString.endsWith(".bam") || in.toString.endsWith(".fasta") || in.toString.endsWith(".fq")) return List(in) var list: List[File] = List() - for (bam <- fromFile(in).getLines) - if (!bam.startsWith("#") && !bam.isEmpty ) - list :+= new File(bam.trim()) + for (file <- fromFile(in).getLines) + if (!file.startsWith("#") && !file.isEmpty ) + list :+= new File(file.trim()) list.sortWith(_.compareTo(_) < 0) } @@ -57,4 +57,6 @@ object QScriptUtils { } + def ?[A <: AnyRef](ref: A): Option[A] = + if (ref eq null) None else Some(ref) } \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 40a296022..58341a0a5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -4,6 +4,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.queue.QException import java.lang.Class import org.broadinstitute.sting.commandline.{ArgumentMatches, ArgumentSource, ArgumentTypeDescriptor, ParsingEngine} +import java.lang.reflect.Type /** * An ArgumentTypeDescriptor that can parse the scala collections. @@ -42,6 +43,10 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { * @param argumentMatches The argument match strings that were found for this argument source. * @return The parsed object. 
*/ + def parse(parsingEngine: ParsingEngine, source: ArgumentSource, typeType: Type, argumentMatches: ArgumentMatches) = { + parse(parsingEngine,source, makeRawTypeIfNecessary(typeType), argumentMatches) + } + def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { val componentType = ReflectionUtils.getCollectionType(source.field) val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index c2c956118..5de474340 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -34,8 +34,8 @@ import org.broadinstitute.sting.BaseTest import org.broadinstitute.sting.MD5DB import org.broadinstitute.sting.queue.QCommandLine import org.broadinstitute.sting.queue.util.{Logging, ProcessController} -import java.io.{FileNotFoundException, File} -import org.broadinstitute.sting.gatk.report.GATKReportParser +import java.io.File +import org.broadinstitute.sting.gatk.report.GATKReport import org.apache.commons.io.FileUtils import org.broadinstitute.sting.queue.engine.CommandLinePluginManager @@ -43,13 +43,15 @@ object PipelineTest extends BaseTest with Logging { private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/" - val run = System.getProperty("pipeline.run") == "run" + final val run = System.getProperty("pipeline.run") == "run" - private val jobRunners = { + final val allJobRunners = { val commandLinePluginManager = new CommandLinePluginManager - commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).filterNot(_ == "Shell") + commandLinePluginManager.getPlugins.map(commandLinePluginManager.getName(_)).toList } + final val 
defaultJobRunners = List("Lsf706", "GridEngine") + /** * Returns the top level output path to this test. * @param testName The name of the test passed to PipelineTest.executeTest() @@ -79,9 +81,12 @@ object PipelineTest extends BaseTest with Logging { * @param pipelineTest test to run. */ def executeTest(pipelineTest: PipelineTestSpec) { + var jobRunners = pipelineTest.jobRunners + if (jobRunners == null) + jobRunners = defaultJobRunners; jobRunners.foreach(executeTest(pipelineTest, _)) } - + /** * Runs the pipelineTest. * @param pipelineTest test to run. @@ -118,12 +123,11 @@ object PipelineTest extends BaseTest with Logging { // write the report to the shared validation data location val formatter = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss") val reportLocation = "%s%s/%s/validation.%s.eval".format(validationReportsDataLocation, jobRunner, name, formatter.format(new Date)) - val report = new File(reportLocation) + val reportFile = new File(reportLocation) - FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), report); + FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), reportFile); - val parser = new GATKReportParser - parser.parse(report) + val report = new GATKReport(reportFile); var allInRange = true @@ -131,7 +135,9 @@ object PipelineTest extends BaseTest with Logging { println(name + " validation values:") println(" value (min,target,max) table key metric") for (validation <- evalSpec.validations) { - val value = parser.getValue(validation.table, validation.key, validation.metric) + val table = report.getTable(validation.table) + val key = table.getPrimaryKey(validation.key) + val value = String.valueOf(table.get(key, validation.metric)) val inRange = if (value == null) false else validation.inRange(value) val flag = if (!inRange) "*" else " " println(" %s %s (%s,%s,%s) %s %s %s".format(flag, value, validation.min, validation.target, validation.max, validation.table, validation.key, validation.metric)) diff 
--git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala index f26689383..a7b3f3a47 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTestSpec.scala @@ -1,7 +1,5 @@ package org.broadinstitute.sting.queue.pipeline -import java.io.File - class PipelineTestSpec(var name: String = null) { /** The arguments to pass to the Queue test, ex: "-S scala/qscript/examples/HelloWorld.scala" */ @@ -10,6 +8,9 @@ class PipelineTestSpec(var name: String = null) { /** Job Queue to run the test. Default is null which means use hour. */ var jobQueue: String = _ + /** Job runners to run the test. Default is null which means use the default. */ + var jobRunners: List[String] = _ + /** Expected MD5 results for each file path. */ var fileMD5s = Map.empty[String, String] diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index 7c76823da..f320cb3a6 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -33,6 +33,7 @@ class HelloWorldPipelineTest { val spec = new PipelineTestSpec spec.name = "HelloWorld" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } @@ -40,23 +41,89 @@ class HelloWorldPipelineTest { def testHelloWorldWithPrefix() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPrefix" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix 
HelloWorld" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobPrefix HelloWorld" + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } @Test def testHelloWorldWithMemoryLimit() { val spec = new PipelineTestSpec - spec.name = "HelloWorldWithPrefix" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -memLimit 1.25" + spec.name = "HelloWorldMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25" + spec.jobRunners = PipelineTest.allJobRunners PipelineTest.executeTest(spec) } - @Test(enabled=false) + @Test def testHelloWorldWithPriority() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" - spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobPriority 100" + spec.jobRunners = PipelineTest.allJobRunners + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithLsfResource() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfResource" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" + spec.jobRunners = List("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithLsfResourceAndMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25 -jobResReq rusage[iodine_io=1] -jobResReq select[swp>0] -jobResReq order[swp]" + spec.jobRunners = List("Lsf706") + PipelineTest.executeTest(spec) + } 
+ + @Test + def testHelloWorldWithLsfEnvironment() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithLsfEnvironment" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobEnv tv" + spec.jobRunners = List("Lsf706") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithGridEngineResource() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineResource" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobResReq s_core=1000M" + spec.jobRunners = List("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithGridEngineResourceAndMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -memLimit 1.25 -jobResReq s_core=1000M" + spec.jobRunners = List("GridEngine") + PipelineTest.executeTest(spec) + } + + @Test + def testHelloWorldWithGridEngineEnvironment() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithGridEngineEnvironment" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" + + " -jobEnv \"make 1\"" + spec.jobRunners = List("GridEngine") PipelineTest.executeTest(spec) } } diff --git a/public/testdata/HiSeq.10000.vcf.gz b/public/testdata/HiSeq.10000.vcf.gz new file mode 100644 index 000000000..15e91010c Binary files /dev/null and b/public/testdata/HiSeq.10000.vcf.gz differ diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html new file mode 100644 index 000000000..1554a1d40 --- /dev/null +++ b/settings/helpTemplates/common.html @@ -0,0 +1,15 @@ +<#macro makeHeader title> + + ${title} + + + + +<#macro headerInfo> + + +<#macro footerInfo> +

    See also Main index | GATK wiki | GATK support forum

    +

    GATK version ${version} built at ${timestamp}.

    + + diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html new file mode 100644 index 000000000..6c9e9f4e8 --- /dev/null +++ b/settings/helpTemplates/generic.index.template.html @@ -0,0 +1,36 @@ +<#include "common.html"/> + +<#macro emitGroup group> + +

    ${group.name}

    +

    + ${group.summary} +

    +

    + + + + <#list data as datum> + <#if datum.group == group.name> + + + + + + +
    NameSummary
    ${datum.name}${datum.summary}
    + + + +<@makeHeader title="GATK documentation index"/> + +

    GATK documentation index

    + <@headerInfo /> + <#list groups?sort_by("name") as group> + <@emitGroup group=group/> + + + <@footerInfo /> + + + diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html new file mode 100644 index 000000000..7fc8cd7bd --- /dev/null +++ b/settings/helpTemplates/generic.template.html @@ -0,0 +1,126 @@ +<#include "common.html"/> + +<#macro argumentlist name myargs> + <#if myargs?size != 0> + ${name} + <#list myargs as arg> + + ${arg.name} + ${arg.type} + ${arg.defaultValue!"NA"} + ${arg.summary} + + <#-- + ${arg.required} + --> + + + + +<#macro argumentDetails arg> +

    ${arg.name}<#if arg.synonyms??> / ${arg.synonyms} + (<#if arg.attributes??>${arg.attributes} ${arg.type}<#if arg.defaultValue??> with default value ${arg.defaultValue})

    +

    + ${arg.summary}. ${arg.fulltext} + <#if arg.rodTypes??>${arg.name} binds reference ordered data. This argument supports ROD files of the + following types: ${arg.rodTypes} + <#if arg.options??> +
    + The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values: +

    + <#list arg.options as option> +
    ${option.name}
    +
    ${option.summary}
    + +
    + +

    + + +<#macro relatedByType name type> + <#list relatedDocs as relatedDoc> + <#if relatedDoc.relation == type> +

    ${name}

    +
      + <#list relatedDocs as relatedDoc> + <#if relatedDoc.relation == type> +
    • ${relatedDoc.name} is a ${relatedDoc.relation}
    • + + +
    + <#break> + + + + + +<@makeHeader title="${name} documentation"/> + +

    ${name}

    + <@headerInfo /> +

    ${summary}

    + <#if author??> +

    Author

    + ${author} + +

    Introduction

    + ${description} + + <#-- Create the argument summary --> + <#if arguments.all?size != 0> +
    +

    ${name} specific arguments

    + + + + + + + + + + + <@argumentlist name="Required" myargs=arguments.required/> + <@argumentlist name="Optional" myargs=arguments.optional/> + <@argumentlist name="Advanced" myargs=arguments.advanced/> + <@argumentlist name="Hidden" myargs=arguments.hidden/> + <@argumentlist name="Depreciated" myargs=arguments.depreciated/> + +
    NameTypeDefault valueSummary
    + + + <#-- Create references to additional capabilities if appropriate --> + <#if extradocs?size != 0> +
    +

    Additional capabilities

    + The arguments described in the entries below can be supplied to this tool to modify + its behavior. For example, the -L argument directs the GATK engine to restrict processing + to specific genomic intervals. This capability is available to all GATK walkers. + + + + <#-- This class is related to other documented classes via sub/super relationships --> + <#if relatedDocs?? && relatedDocs?size != 0> +
    +

    Related capabilities

    + <@relatedByType name="Superclasses" type="superclass"/> + <@relatedByType name="Subclasses" type="subclass"/> + + + <#-- List all of the --> + <#if arguments.all?size != 0> +
    + <#-- Create the argument details --> +

    Argument details

    + <#list arguments.all as arg> + <@argumentDetails arg=arg/> + + + + <@footerInfo /> + + diff --git a/settings/helpTemplates/style.css b/settings/helpTemplates/style.css new file mode 100644 index 000000000..297cd49ef --- /dev/null +++ b/settings/helpTemplates/style.css @@ -0,0 +1,194 @@ +body +{ + background-color: #ffffff; + color: #202020; +} + +body, p, ul, ol, dl +{ + font-family: Corbel, Verdana, "Lucida Grande", "Lucida Sans Unicode", Sans-Serif; +} + +p, ul, ol, dl, dt, dd, td +{ + font-size: 12pt; +} + +p +{ + margin-left: 1em; +} + +p.summary +{ + margin-left: 2em; + margin-top: -20pt; + font-style: italic; +} + +p.see-also +{ + font-size: 10pt; + margin-left: 0em; + margin-top: 3em; + text-align: center; +} + +p.version +{ + font-size: 8pt; + margin-left: 0em; + margin-top: -8pt; + text-align: center; +} + +p.args +{ + margin-left: 3em; +} + +h1, h2, h3, h4 +{ + font-family: Corbel, Arial, Helvetica, Sans-Serif; + font-weight: bold; + text-align: left; +} + +h1 +{ + font-size: 32pt; + letter-spacing: -2px; + color: #669; +} + +h2 +{ + font-size: 16pt; + font-weight: bold; + margin-top: 2em; + color: #669; +} + +h3 +{ + font-size: 12pt; + margin-left: 1em; + color: #000; +} + +hr +{ + margin-top: 4em; +} + +/* + * enum DT layout +*/ + +dl { + margin-left: 3em; +} + +dl.enum { + margin-left: 3em; + border: 1px dashed #ccc; +} + +dt, dt.enum { + font-weight: bold; + text-decoration: underline; +} + +/* +dt, dd.enum { + padding: 0 0 0.5em 0; +} +*/ + +pre { + border: thin solid lightgray; + margin-left: 1em; + margin-right: 4em; +/* + background-color: #e0fdff; +*/ +} +/* + * clean table layouts +*/ +#hor-minimalist-b +{ + font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; + font-size: 12px; + background: #fff; + margin: 5px; + width: 100%; + border-collapse: collapse; + text-align: left; +} +#hor-minimalist-b th +{ + font-size: 14px; + font-weight: normal; + color: #039; + padding: 10px 8px; + border-bottom: 2px solid #6678b1; +} 
+#hor-minimalist-b td +{ + border-bottom: 1px solid #ccc; + color: #669; + padding: 6px 8px; +} +#hor-minimalist-b tbody tr:hover td +{ + color: #009; +} + +th#row-divider +{ + font-weight: bolder; + font-size: larger; +} + + +/* + * Table design for input/output description + */ + +#description-table +{ + font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; + font-size: 12px; + background: #fff; + margin: 5px; + border-collapse: collapse; + text-align: left; +} +#description-table th +{ + font-size: 16px; + font-weight: bold; + background-color: lightgray; + color: #039; + text-align: center; + padding: 10px 8px; + border-bottom: 2px solid #6678b1; +} +#description-table td +{ + border-bottom: 1px solid #ccc; + color: #669; + padding: 6px 8px; + text-align: right; +} +#description-table tbody tr:hover td +{ + color: #009; +} + +th#row-divider +{ + font-weight: bolder; + font-size: larger; +} \ No newline at end of file diff --git a/settings/repository/org.broad/tribble-16.xml b/settings/repository/org.broad/tribble-16.xml deleted file mode 100644 index e23eec339..000000000 --- a/settings/repository/org.broad/tribble-16.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - diff --git a/settings/repository/org.broad/tribble-16.jar b/settings/repository/org.broad/tribble-24.jar similarity index 73% rename from settings/repository/org.broad/tribble-16.jar rename to settings/repository/org.broad/tribble-24.jar index 331f28ec3..b1c39e60a 100644 Binary files a/settings/repository/org.broad/tribble-16.jar and b/settings/repository/org.broad/tribble-24.jar differ diff --git a/settings/repository/org.broad/tribble-24.xml b/settings/repository/org.broad/tribble-24.xml new file mode 100644 index 000000000..9b2b967f8 --- /dev/null +++ b/settings/repository/org.broad/tribble-24.xml @@ -0,0 +1,3 @@ + + +