diff --git a/build.xml b/build.xml
index ef53f6aa4..275cb5555 100644
--- a/build.xml
+++ b/build.xml
@@ -48,8 +48,9 @@
-
-
+
+
+
@@ -176,6 +177,10 @@
+
+
+
+
@@ -485,11 +490,16 @@
+
+
+
+
+
+ additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005">
@@ -744,8 +754,7 @@
-
-
+
diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R
new file mode 100644
index 000000000..18d33054e
--- /dev/null
+++ b/public/R/queueJobReport.R
@@ -0,0 +1,154 @@
+library(gsalib)
+require("ggplot2")
+require("gplots")
+
+#
+# Standard command line switch. Can we loaded interactively for development
+# or executed with RScript
+#
+args = commandArgs(TRUE)
+onCMDLine = ! is.na(args[1])
+if ( onCMDLine ) {
+ inputFileName = args[1]
+ outputPDF = args[2]
+} else {
+ inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
+ outputPDF = NA
+}
+
+RUNTIME_UNITS = "(sec)"
+ORIGINAL_UNITS_TO_SECONDS = 1/1000
+
+#
+# Helper function to aggregate all of the jobs in the report across all tables
+#
+allJobsFromReport <- function(report) {
+ names <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts")
+ sub <- lapply(report, function(table) table[,names])
+ do.call("rbind", sub)
+}
+
+#
+# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
+#
+plotJobsGantt <- function(gatkReport, sortOverall) {
+ allJobs = allJobsFromReport(gatkReport)
+ if ( sortOverall ) {
+ title = "All jobs, by analysis, by start time"
+ allJobs = allJobs[order(allJobs$analysisName, allJobs$startTime, decreasing=T), ]
+ } else {
+ title = "All jobs, sorted by start time"
+ allJobs = allJobs[order(allJobs$startTime, decreasing=T), ]
+ }
+ allJobs$index = 1:nrow(allJobs)
+ minTime = min(allJobs$startTime)
+ allJobs$relStartTime = allJobs$startTime - minTime
+ allJobs$relDoneTime = allJobs$doneTime - minTime
+ allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
+ maxRelTime = max(allJobs$relDoneTime)
+ p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
+ p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
+ p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
+ p <- p + xlim(0, maxRelTime * 1.1)
+ p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
+ p <- p + ylab("Job")
+ p <- p + opts(title=title)
+ print(p)
+}
+
+#
+# Plots scheduling efficiency at job events
+#
+plotProgressByTime <- function(gatkReport) {
+ allJobs = allJobsFromReport(gatkReport)
+ nJobs = dim(allJobs)[1]
+ allJobs = allJobs[order(allJobs$startTime, decreasing=F),]
+ allJobs$index = 1:nrow(allJobs)
+
+ minTime = min(allJobs$startTime)
+ allJobs$relStartTime = allJobs$startTime - minTime
+ allJobs$relDoneTime = allJobs$doneTime - minTime
+
+ times = sort(c(allJobs$relStartTime, allJobs$relDoneTime))
+
+ countJobs <- function(p) {
+ s = allJobs$relStartTime
+ e = allJobs$relDoneTime
+ x = c() # I wish I knew how to make this work with apply
+ for ( time in times )
+ x = c(x, sum(p(s, e, time)))
+ x
+ }
+
+ pending = countJobs(function(s, e, t) s > t)
+ done = countJobs(function(s, e, t) e < t)
+ running = nJobs - pending - done
+
+ d = data.frame(times=times, pending=pending, running=running, done=done)
+
+ p <- ggplot(data=melt(d, id.vars=c("times")), aes(x=times, y=value, color=variable))
+ p <- p + facet_grid(variable ~ ., scales="free")
+ p <- p + geom_line(size=2)
+ p <- p + xlab(paste("Time since start of first job", RUNTIME_UNITS))
+ p <- p + opts(title = "Job scheduling")
+ print(p)
+}
+
+#
+# Creates tables for each job in this group
+#
+standardColumns = c("jobName", "startTime", "formattedStartTime", "analysisName", "intermediate", "exechosts", "formattedDoneTime", "doneTime", "runtime")
+plotGroup <- function(groupTable) {
+ name = unique(groupTable$analysisName)[1]
+ groupAnnotations = setdiff(names(groupTable), standardColumns)
+ sub = groupTable[,c("jobName", groupAnnotations, "runtime")]
+ sub = sub[order(sub$iteration, sub$jobName, decreasing=F), ]
+
+ # create a table showing each job and all annotations
+ textplot(sub, show.rownames=F)
+ title(paste("Job summary for", name, "full itemization"), cex=3)
+
+ # create the table for each combination of values in the group, listing iterations in the columns
+ sum = cast(melt(sub, id.vars=groupAnnotations, measure.vars=c("runtime")), ... ~ iteration, fun.aggregate=mean)
+ textplot(as.data.frame(sum), show.rownames=F)
+ title(paste("Job summary for", name, "itemizing each iteration"), cex=3)
+
+ # as above, but averaging over all iterations
+ groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
+ sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
+ textplot(as.data.frame(sum), show.rownames=F)
+ title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+}
+
+# print out some useful basic information
+print("Report")
+print(paste("Project :", inputFileName))
+
+convertUnits <- function(gatkReportData) {
+ convertGroup <- function(g) {
+ g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS
+ g
+ }
+ lapply(gatkReportData, convertGroup)
+}
+
+
+# read the table
+gatkReportData <- gsa.read.gatkreport(inputFileName)
+gatkReportData <- convertUnits(gatkReportData)
+#print(summary(gatkReportData))
+
+if ( ! is.na(outputPDF) ) {
+ pdf(outputPDF, height=8.5, width=11)
+}
+
+plotJobsGantt(gatkReportData, T)
+plotJobsGantt(gatkReportData, F)
+plotProgressByTime(gatkReportData)
+for ( group in gatkReportData ) {
+ plotGroup(group)
+}
+
+if ( ! is.na(outputPDF) ) {
+ dev.off()
+}
diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index 2ff8aa979..7ea515591 100755
--- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -31,6 +31,7 @@ import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
+import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
@@ -38,6 +39,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
@@ -91,12 +93,12 @@ import java.util.regex.Pattern;
* -resources resources/ \
* -ignoreQ 5
*
- *
+ *
*/
@DocumentedGATKFeature(
- groupName = "AnalyzeCovariates",
- summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
+ groupName = "AnalyzeCovariates",
+ summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
/////////////////////////////
@@ -118,7 +120,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
private String PATH_TO_RESOURCES = "public/R/";
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
private int IGNORE_QSCORES_LESS_THAN = 5;
- @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
+ @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
/**
@@ -323,13 +325,14 @@ public class AnalyzeCovariates extends CommandLineProgram {
}
private void callRScripts() {
+ RScriptExecutor.RScriptArgumentCollection argumentCollection =
+ new RScriptExecutor.RScriptArgumentCollection(PATH_TO_RSCRIPT, Arrays.asList(PATH_TO_RESOURCES));
+ RScriptExecutor executor = new RScriptExecutor(argumentCollection, true);
int numReadGroups = 0;
-
+
// for each read group
for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) {
-
- Process p;
if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {
String readGroup = readGroupKey.toString();
@@ -338,35 +341,19 @@ public class AnalyzeCovariates extends CommandLineProgram {
// for each covariate
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
Covariate cov = requestedCovariates.get(iii);
- try {
-
- if (DO_INDEL_QUALITY) {
- p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_indelQuality.R" + " " +
- OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
- cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
- p.waitFor();
-
- } else {
+ final String outputFilename = OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat";
+ if (DO_INDEL_QUALITY) {
+ executor.callRScripts("plot_indelQuality.R", outputFilename,
+ cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
+ } else {
if( iii == 1 ) {
- // Analyze reported quality
- p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
- OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
- IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
- p.waitFor();
- } else { // Analyze all other covariates
- p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
- OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
- cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
- p.waitFor();
- }
+ // Analyze reported quality
+ executor.callRScripts("plot_residualError_QualityScoreCovariate.R", outputFilename,
+ IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
+ } else { // Analyze all other covariates
+ executor.callRScripts("plot_residualError_OtherCovariate.R", outputFilename,
+ cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
}
- } catch (InterruptedException e) {
- e.printStackTrace();
- System.exit(-1);
- } catch (IOException e) {
- System.out.println("Fatal Exception: Perhaps RScript jobs are being spawned too quickly? One work around is to process fewer read groups using the -numRG option.");
- e.printStackTrace();
- System.exit(-1);
}
}
} else { // at the maximum number of read groups so break out
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index ff992d77d..16358d05f 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -379,10 +379,14 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
}
if ( tribbleType == null )
- throw new UserException.CommandLineException(
- String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
- "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
- manager.userFriendlyListOfAvailableFeatures(parameterType)));
+ if ( ! file.canRead() | !! file.isFile() ) {
+ throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
+ } else {
+ throw new UserException.CommandLineException(
+ String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
+ "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
+ manager.userFriendlyListOfAvailableFeatures(parameterType)));
+ }
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index 48fd73e0b..65ff27497 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -48,9 +48,10 @@ public class LinearMicroScheduler extends MicroScheduler {
walker.initialize();
Accumulator accumulator = Accumulator.create(engine,walker);
+ boolean done = walker.isDone();
int counter = 0;
for (Shard shard : shardStrategy ) {
- if ( shard == null ) // we ran out of shards that aren't owned
+ if ( done || shard == null ) // we ran out of shards that aren't owned
break;
if(shard.getShardType() == Shard.ShardType.LOCUS) {
@@ -61,6 +62,7 @@ public class LinearMicroScheduler extends MicroScheduler {
Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit());
accumulator.accumulate(dataProvider,result);
dataProvider.close();
+ if ( walker.isDone() ) break;
}
windowMaker.close();
}
@@ -70,6 +72,8 @@ public class LinearMicroScheduler extends MicroScheduler {
accumulator.accumulate(dataProvider,result);
dataProvider.close();
}
+
+ done = walker.isDone();
}
Object result = accumulator.finishTraversal();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index acee1a6a3..4d94130a8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -46,7 +46,6 @@ import org.simpleframework.xml.stream.Format;
import org.simpleframework.xml.stream.HyphenStyle;
import java.io.*;
-import java.net.InetAddress;
import java.security.NoSuchAlgorithmException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
@@ -230,22 +229,6 @@ public class GATKRunReport {
}
- /**
- * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails,
- * unresolvable gets returned instead.
- *
- * @return
- */
- private String resolveHostname() {
- try {
- return InetAddress.getLocalHost().getCanonicalHostName();
- }
- catch (java.net.UnknownHostException uhe) { // [beware typo in code sample -dmw]
- return "unresolvable";
- // handle exception
- }
- }
-
public void postReport(PhoneHomeOption type) {
logger.debug("Posting report of type " + type);
switch (type) {
@@ -325,7 +308,7 @@ public class GATKRunReport {
private void postReportToAWSS3() {
// modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html
- this.hostName = resolveHostname(); // we want to fill in the host name
+ this.hostName = Utils.resolveHostname(); // we want to fill in the host name
File localFile = postReportToLocalDisk(new File("./"));
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
if ( localFile != null ) { // we succeeded in creating the local file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java
index ba1ca674e..a97f3211c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java
@@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.util.CloseableIterator;
+import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureSource;
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
@@ -45,10 +46,11 @@ import java.io.IOException;
* the basics of what a reference metadata track must contain.
*/
public class RMDTrack {
+ private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
+ private final static boolean DEBUG = false;
// the basics of a track:
private final Class type; // our type
- private final Class recordType; // the underlying records that are produced by this track
private final String name; // the name
private final File file; // the associated file we create the reader from
@@ -90,7 +92,6 @@ public class RMDTrack {
*/
public RMDTrack(Class type, String name, File file, FeatureSource reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) {
this.type = type;
- this.recordType = codec.getFeatureType();
this.name = name;
this.file = file;
this.reader = reader;
@@ -112,19 +113,8 @@ public class RMDTrack {
}
public CloseableIterator query(GenomeLoc interval) throws IOException {
- return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(interval.getContig(),interval.getStart(),interval.getStop()),this.getName());
- }
-
- public CloseableIterator query(GenomeLoc interval, boolean contained) throws IOException {
- return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(interval.getContig(),interval.getStart(),interval.getStop()),this.getName());
- }
-
- public CloseableIterator query(String contig, int start, int stop) throws IOException {
- return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(contig,start,stop),this.getName());
- }
-
- public CloseableIterator query(String contig, int start, int stop, boolean contained) throws IOException {
- return new FeatureToGATKFeatureIterator(genomeLocParser,reader.query(contig,start,stop),this.getName());
+ if ( DEBUG ) logger.debug("Issuing query for %s: " + interval);
+ return new FeatureToGATKFeatureIterator(genomeLocParser, reader.query(interval.getContig(),interval.getStart(),interval.getStop()), this.getName());
}
public void close() {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
index 152e1a57b..3e3aa29a7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
@@ -92,6 +92,8 @@ import java.util.regex.Pattern;
* @author Khalid Shakir
*/
public class GATKReportTable {
+ /** REGEX that matches any table with an invalid name */
+ public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2;
private String tableName;
private String tableDescription;
@@ -111,7 +113,7 @@ public class GATKReportTable {
* @return true if the name is valid, false if otherwise
*/
private boolean isValidName(String name) {
- Pattern p = Pattern.compile("[^a-zA-Z0-9_\\-\\.]");
+ Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX);
Matcher m = p.matcher(name);
return !m.find();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
index 1ba48ca5f..046003154 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java
@@ -173,7 +173,9 @@ public class TraverseDuplicates extends TraversalEngine those with the same mate pair position, for paired reads
* -> those flagged as unpaired and duplicated but having the same start and end
*/
+ boolean done = walker.isDone();
for (SAMRecord read : iter) {
+ if ( done ) break;
// get the genome loc from the read
GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read);
@@ -194,6 +196,7 @@ public class TraverseDuplicates extends TraversalEngine extends TraversalEngine,Locu
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
LocusView locusView = getLocusView( walker, dataProvider );
+ boolean done = false;
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
@@ -46,7 +47,7 @@ public class TraverseLoci extends TraversalEngine,Locu
LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
// We keep processing while the next reference location is within the interval
- while( locusView.hasNext() ) {
+ while( locusView.hasNext() && ! done ) {
AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
@@ -76,15 +77,17 @@ public class TraverseLoci extends TraversalEngine,Locu
if (keepMeP) {
M x = walker.map(tracker, refContext, locus);
sum = walker.reduce(x, sum);
+ done = walker.isDone();
}
printProgress(dataProvider.getShard(),locus.getLocation());
}
}
- // We have a final map call to execute here to clean up the skipped based from the
- // last position in the ROD to that in the interval
- if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA ) {
+ // We have a final map call to execute here to clean up the skipped based from the
+ // last position in the ROD to that in the interval
+ if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
+ // only do this if the walker isn't done!
RodLocusView rodLocusView = (RodLocusView)locusView;
long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java
index 196d54036..dd4402d82 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java
@@ -50,7 +50,9 @@ public class TraverseReadPairs extends TraversalEngine pairs = new ArrayList();
+ boolean done = walker.isDone();
for(SAMRecord read: reads) {
+ if ( done ) break;
dataProvider.getShard().getReadMetrics().incrementNumReadsSeen();
if(pairs.size() == 0 || pairs.get(0).getReadName().equals(read.getReadName())) {
@@ -65,6 +67,8 @@ public class TraverseReadPairs extends TraversalEngine extends TraversalEngine,Read
// get the reference ordered data
ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
+ boolean done = walker.isDone();
// while we still have more reads
for (SAMRecord read : reads) {
+ if ( done ) break;
// ReferenceContext -- the reference bases covered by the read
ReferenceContext refContext = null;
@@ -106,6 +108,7 @@ public class TraverseReads extends TraversalEngine,Read
GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
printProgress(dataProvider.getShard(),locus);
+ done = walker.isDone();
}
return sum;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java
index 9e261a0b1..c88c7c3c4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java
@@ -126,6 +126,17 @@ public abstract class Walker {
public void initialize() { }
+ /**
+ * A function for overloading in subclasses providing a mechanism to abort early from a walker.
+ *
+ * If this ever returns true, then the Traversal engine will stop executing map calls
+ * and start the process of shutting down the walker in an orderly fashion.
+ * @return
+ */
+ public boolean isDone() {
+ return false;
+ }
+
/**
* Provide an initial value for reduce computations.
* @return Initial value of reduce.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
index 4e3342609..2159bc839 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
@@ -234,7 +234,7 @@ public class DiffEngine {
// now that we have a specific list of values we want to show, display them
GATKReport report = new GATKReport();
- final String tableName = "diffences";
+ final String tableName = "differences";
report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false);
GATKReportTable table = report.getTable(tableName);
table.addPrimaryKey("Difference", true);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index e7f89bf08..ae419a5c4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -156,7 +156,7 @@ public class UnifiedArgumentCollection {
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
- @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
+ @Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
public boolean dovit = false;
@Hidden
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index 55450486b..2d7969230 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -274,7 +274,7 @@ public class PairHMMIndelErrorModel {
this.doViterbi = dovit;
}
- public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
+ public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
@@ -754,7 +754,7 @@ public class PairHMMIndelErrorModel {
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
if (indelLikelihoodMap.containsKey(p)) {
- HashMap el = indelLikelihoodMap.get(p);
+ HashMap el = indelLikelihoodMap.get(p);
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readLikelihoods[readIdx][j++] = el.get(a);
@@ -1055,7 +1055,6 @@ public class PairHMMIndelErrorModel {
genotypeLikelihoods[i] -= maxElement;
return genotypeLikelihoods;
-
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
index 2b38afaf6..b616a0ebe 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
@@ -303,7 +303,10 @@ public class GenotypeAndValidateWalker extends RodWalker= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
- if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
+ if (deletions >= 0)
+ uac.MAX_DELETION_FRACTION = deletions;
+ else
+ uac.MAX_DELETION_FRACTION = 1.0;
if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
@@ -347,12 +350,14 @@ public class GenotypeAndValidateWalker extends RodWalker
*
*/
@@ -223,6 +241,18 @@ public class SelectVariants extends RodWalker {
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
+
+ /**
+ * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
+ * For example, a multiallelic record such as:
+ * 1 100 . A AAA,AAAAA
+ * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
+ * 1 100 . A T
+ * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
+ */
+ @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
+ private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
+
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@@ -266,11 +296,14 @@ public class SelectVariants extends RodWalker {
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
- @Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
- private boolean SELECT_SNPS = false;
+ /**
+ * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
+ * When specified one or more times, a particular type of variant is selected.
+ *
+ */
+ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
+ private List TYPES_TO_INCLUDE = new ArrayList();
- @Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
- private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@@ -290,6 +323,13 @@ public class SelectVariants extends RodWalker {
}
+ public enum NumberAlleleRestriction {
+ ALL,
+ BIALLELIC,
+ MULTIALLELIC
+ }
+
+ private ArrayList selectedTypes = new ArrayList();
private ArrayList selectNames = new ArrayList();
private List jexls = null;
@@ -354,6 +394,20 @@ public class SelectVariants extends RodWalker {
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
+
+
+ // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
+ if (TYPES_TO_INCLUDE.isEmpty()) {
+
+ for (VariantContext.Type t : VariantContext.Type.values())
+ selectedTypes.add(t);
+
+ }
+ else {
+ for (VariantContext.Type t : TYPES_TO_INCLUDE)
+ selectedTypes.add(t);
+
+ }
// Initialize VCF header
Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@@ -494,12 +548,13 @@ public class SelectVariants extends RodWalker {
return 0;
}
- // TODO - add ability to also select MNPs
- // TODO - move variant selection arguments to the engine so other walkers can also do this
- if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed()))
+ if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
continue;
- if (SELECT_SNPS && !vc.isSNP())
+ if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
+ continue;
+
+ if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
index 19db58e0c..2a877fb09 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker {
* multi-allelic INFO field values can be lists of values.
*/
@Advanced
- @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
- public boolean keepMultiAllelic = false;
+ @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
+ public boolean keepMultiAllelic = false;
+
+ @Hidden
+ @Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
+ public boolean logACSum = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
@@ -147,22 +151,21 @@ public class VariantsToTable extends RodWalker {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
- if ( ++nRecords < MAX_RECORDS || MAX_RECORDS == -1 ) {
- for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
- if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
- List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
- out.println(Utils.join("\t", vals));
- }
+ for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
+ if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
+ List vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
+ out.println(Utils.join("\t", vals));
}
-
- return 1;
- } else {
- if ( nRecords >= MAX_RECORDS ) {
- logger.warn("Calling sys exit to leave after " + nRecords + " records");
- System.exit(0); // todo -- what's the recommend way to abort like this?
- }
- return 0;
}
+
+ return 1;
+ }
+
+ @Override
+ public boolean isDone() {
+ boolean done = MAX_RECORDS != -1 && nRecords >= MAX_RECORDS;
+ if ( done) logger.warn("isDone() will return true to leave after " + nRecords + " records");
+ return done ;
}
private static final boolean isWildCard(String s) {
@@ -176,9 +179,11 @@ public class VariantsToTable extends RodWalker {
* @param fields a non-null list of fields to capture from VC
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
* provides a value of NA
+ * @param kma if true, multiallelic variants are to be kept
+ * @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
* @return
*/
- public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) {
+ private static List extractFields(VariantContext vc, List fields, boolean allowMissingData, boolean kma, boolean logsum) {
List vals = new ArrayList();
for ( String field : fields ) {
@@ -206,12 +211,31 @@ public class VariantsToTable extends RodWalker {
}
if (field.equals("AF") || field.equals("AC")) {
- if (val.contains(",")) {
- // strip [,] and spaces
- val = val.replace("[","");
- val = val.replace("]","");
- val = val.replace(" ","");
- }
+ String afo = val;
+
+ double af=0;
+ if (afo.contains(",")) {
+ String[] afs = afo.split(",");
+ afs[0] = afs[0].substring(1,afs[0].length());
+ afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
+
+ double[] afd = new double[afs.length];
+
+ for (int k=0; k < afd.length; k++)
+ afd[k] = Double.valueOf(afs[k]);
+
+ if (kma && logsum)
+ af = MathUtils.sum(afd);
+ else
+ af = MathUtils.arrayMax(afd);
+ //af = Double.valueOf(afs[0]);
+
+ }
+ else
+ if (!afo.equals("NA"))
+ af = Double.valueOf(afo);
+
+ val = Double.toString(af);
}
vals.add(val);
@@ -220,6 +244,9 @@ public class VariantsToTable extends RodWalker {
return vals;
}
+ public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) {
+ return extractFields(vc, fields, allowMissingData, false, false);
+ }
//
// default reduce -- doesn't do anything at all
//
@@ -243,7 +270,7 @@ public class VariantsToTable extends RodWalker {
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
- if ( vc.hasReferenceBaseForIndel() ) {
+ if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
@@ -255,7 +282,7 @@ public class VariantsToTable extends RodWalker {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
- if ( vc.hasReferenceBaseForIndel() ) {
+ if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java
new file mode 100644
index 000000000..c0493fe22
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.R;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.Advanced;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.ArgumentCollection;
+import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
+import org.broadinstitute.sting.utils.PathUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Generic service for executing RScripts in the GATK directory
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class RScriptExecutor {
+ /**
+ * our log
+ */
+ protected static Logger logger = Logger.getLogger(RScriptExecutor.class);
+
+ public static class RScriptArgumentCollection {
+ @Advanced
+ @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
+ public String PATH_TO_RSCRIPT = "Rscript";
+
+ @Advanced
+ @Argument(fullName = "path_to_Rresources", shortName = "Rresources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
+ public List PATH_TO_RESOURCES = Arrays.asList("public/R/", "private/R/");
+
+ public RScriptArgumentCollection() {}
+
+ /** For testing and convenience */
+ public RScriptArgumentCollection(final String PATH_TO_RSCRIPT, final List PATH_TO_RESOURCES) {
+ this.PATH_TO_RSCRIPT = PATH_TO_RSCRIPT;
+ this.PATH_TO_RESOURCES = PATH_TO_RESOURCES;
+ }
+ }
+
+ final RScriptArgumentCollection myArgs;
+ final boolean exceptOnError;
+
+ public RScriptExecutor(final RScriptArgumentCollection myArgs, final boolean exceptOnError) {
+ this.myArgs = myArgs;
+ this.exceptOnError = exceptOnError;
+ }
+
+ public void callRScripts(String scriptName, Object... scriptArgs) {
+ callRScripts(scriptName, Arrays.asList(scriptArgs));
+ }
+
+ public void callRScripts(String scriptName, List