diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index bbbd96cf1..f66e229bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -64,12 +64,35 @@ public class GATKArgumentCollection { @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) public File gatkKeyFile = null; + /** + * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be + * used to group together runs during later analysis. One use of this capability is to tag runs as GATK + * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // XXX + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index b5d5dfc0e..75af7976f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -304,7 +304,7 @@ public class LocusIteratorByState extends LocusIterator { final boolean isSingleElementCigar = nextElement == lastElement; final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator - final int readOffset = state.getReadOffset(); // the base offset on this read + int readOffset = state.getReadOffset(); // the base offset on this read final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; @@ -331,6 +331,9 @@ public class LocusIteratorByState extends LocusIterator { String insertedBaseString = null; if (nextOp == CigarOperator.I) { final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index b60a7845a..035252c14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -138,6 +138,9 @@ public class GATKRunReport { @Element(required = true, name = "iterations") private long nIterations; + @Element(required = true, name = "tag") + private String tag; + public enum PhoneHomeOption { /** Disable phone home */ NO_ET, @@ -186,6 +189,8 @@ public class GATKRunReport { nIterations = engine.getCumulativeMetrics().getNumIterations(); } + tag = engine.getArguments().tag; + // user and hostname -- information about the runner of the GATK userName = System.getProperty("user.name"); hostName = Utils.resolveHostname(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index aacd987d5..33a543e39 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -286,6 +286,7 @@ public class VariantDataManager { case INDEL: case MIXED: case SYMBOLIC: + case STRUCTURAL_INDEL: return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); default: return false; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 2d7f51c3f..8c95091a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils; +import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.util.Arrays; @@ -444,29 +445,8 @@ public class BaseUtils { * @param bases the bases * @return the upper cased version */ - static public byte[] convertToUpperCase(final byte[] bases) { - for ( int i = 0; i < bases.length; i++ ) { - if ( (char)bases[i] >= 'a' ) - bases[i] = toUpperCaseBase(bases[i]); - } - return bases; - } - - static public byte toUpperCaseBase(final byte base) { - switch (base) { - case 'a': - return 'A'; - case 'c': - return 'C'; - case 'g': - return 'G'; - case 't': - return 'T'; - case 'n': - return 'N'; - default: - return base; - } + static public void convertToUpperCase(final byte[] bases) { + StringUtil.toUpperCase(bases); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index 2c312678e..85c925204 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -105,7 +105,7 @@ public class Allele implements Comparable { if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); } else { - bases = BaseUtils.convertToUpperCase(bases); + BaseUtils.convertToUpperCase(bases); } this.isRef = isRef; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index e65a7fb27..edd97f17f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -208,19 +208,18 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) do - * not negatively influence the ordering of the pileup. + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly */ @Test - public void testWholeIndelReadTest2() { + public void testWholeIndelReadRepresentedTest() { final int firstLocus = 44367788, secondLocus = firstLocus + 1; - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read",0,secondLocus,1); - read.setReadBases(Utils.dupBytes((byte) 'A', 1)); - read.setBaseQualities(Utils.dupBytes((byte) '@', 1)); - read.setCigarString("1I"); + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); - List reads = Arrays.asList(read); + List reads = Arrays.asList(read1); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads, createTestReadProperties()); @@ -234,6 +233,26 @@ public class LocusIteratorByStateUnitTest extends BaseTest { Assert.assertFalse(pe.isAfterInsertion()); Assert.assertEquals(pe.getEventBases(), "A"); } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } } private static ReadProperties createTestReadProperties() { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 041e84a8c..775847ba9 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -105,8 +105,7 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) - - qGraph.settings = settings + qGraph.initializeWithSettings(settings) val allQScripts = pluginManager.createAllTypes(); for (script <- allQScripts) { @@ -137,26 +136,9 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) - if (!settings.disableJobReport) { - val jobStringName = { - if (settings.jobReportFile != null) - settings.jobReportFile - else - settings.qSettings.runName + ".jobreport.txt" - } - - if (!shuttingDown) { - val reportFile = IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) - logger.info("Writing JobLogging GATKReport to file " + reportFile) - QJobReport.printReport(functionsAndStatus, reportFile) - - if (settings.run) { - val pdfFile = IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) - } - } - } + // write the final complete job report + logger.info("Writing final jobs report...") + qGraph.writeJobsReport() if (!qGraph.success) { logger.info("Done with errors") diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index e3a1714ff..2c33596e1 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -39,7 +39,7 @@ import collection.immutable.{TreeSet, TreeMap} import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction} import java.util.Date import org.broadinstitute.sting.utils.Utils -import org.apache.commons.io.{FileUtils, IOUtils} +import org.apache.commons.io.{FilenameUtils, FileUtils, IOUtils} import java.io.{OutputStreamWriter, File} /** @@ -71,6 +71,16 @@ class QGraph extends Logging { private val inProcessManager = new InProcessJobManager private def managers = Seq[Any](inProcessManager, commandLineManager) + /** + * If true, we will write out incremental job reports + */ + private val INCREMENTAL_JOBS_REPORT = true + + /** + * Holds the optional jobInfoReporter structure + */ + private var jobInfoReporter: QJobsReporter = null + private class StatusCounts { var pending = 0 var running = 0 @@ -79,6 +89,19 @@ class QGraph extends Logging { } private val statusCounts = new StatusCounts + /** + * Final initialization step of this QGraph -- tell it runtime setting information + * + * The settings aren't necessarily available until after this QGraph object has been constructed, so + * this function must be called once the QGraphSettings have been filled in. + * + * @param settings + */ + def initializeWithSettings(settings: QGraphSettings) { + this.settings = settings + this.jobInfoReporter = createJobsReporter() + } + /** * Adds a QScript created CommandLineFunction to the graph. * @param command Function to add to the graph. @@ -467,6 +490,12 @@ class QGraph extends Logging { checkRetryJobs(failedJobs) } + // incremental + if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) { + logger.info("Writing incremental jobs reports...") + writeJobsReport(false) + } + readyJobs ++= getReadyJobs } @@ -1084,6 +1113,39 @@ class QGraph extends Logging { } } + /** + * Create the jobsReporter for this QGraph, based on the settings data. + * + * Must be called after settings has been initialized properly + * + * @return + */ + private def createJobsReporter(): QJobsReporter = { + val jobStringName = if (settings.jobReportFile != null) + settings.jobReportFile + else + settings.qSettings.runName + ".jobreport.txt" + + val reportFile = org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) + + val pdfFile = if ( settings.run ) + Some(org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf")) + else + None + + new QJobsReporter(settings.disableJobReport, reportFile, pdfFile) + } + + /** + * Write, if possible, the jobs report + */ + def writeJobsReport(plot: Boolean = true) { + // note: the previous logic didn't write the job report if the system was shutting down, but I don't + // see any reason not to write the job report + if ( jobInfoReporter != null ) + jobInfoReporter.write(this, plot) + } + /** * Returns true if the graph was shutdown instead of exiting on its own. */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index c69a310b3..0600f9ad5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -25,13 +25,8 @@ package org.broadinstitute.sting.queue.util import org.broadinstitute.sting.queue.function.QFunction -import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} -import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.gatk.report.GATKReportTable import org.broadinstitute.sting.queue.engine.JobRunInfo -import java.io.{PrintStream, File} -import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} -import org.broadinstitute.sting.utils.io.Resource -import org.apache.commons.io.{IOUtils, FileUtils} /** * A mixin to add Job info to the class @@ -98,31 +93,10 @@ trait QJobReport extends Logging { } object QJobReport { - val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" - // todo -- fixme to have a unique name for Scatter/gather jobs as well var seenCounter = 1 var seenNames = Set[String]() - def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { - val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) - jobs foreach {case (qf, info) => qf.setRunInfo(info)} - val stream = new PrintStream(FileUtils.openOutputStream(dest)) - try { - printJobLogging(jobs.keys.toSeq, stream) - } finally { - IOUtils.closeQuietly(stream) - } - } - - def plotReport(reportFile: File, pdfFile: File) { - val executor = new RScriptExecutor - executor.addLibrary(RScriptLibrary.GSALIB) - executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) - executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) - executor.exec() - } - def workAroundSameJobNames(func: QFunction):String = { if ( seenNames.apply(func.jobName) ) { seenCounter += 1 @@ -132,45 +106,4 @@ object QJobReport { func.jobName } } - - /** - * Prints the JobLogging logs to a GATKReport. First splits up the - * logs by group, and for each group generates a GATKReportTable - */ - private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { - // create the report - val report: GATKReport = new GATKReport - - // create a table for each group of logs - for ( (group, groupLogs) <- groupLogs(logs) ) { - val keys = logKeys(groupLogs) - report.addTable(group, "Job logs for " + group, keys.size) - val table: GATKReportTable = report.getTable(group) - - // add the columns - keys.foreach(table.addColumn(_)) - for (log <- groupLogs) { - for ( key <- keys ) - table.set(log.getReportName, key, log.getReportFeature(key)) - } - } - - report.print(stream) - } - - private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { - logs.groupBy(_.getReportGroup) - } - - private def logKeys(logs: Seq[QFunction]): Set[String] = { - // the keys should be the same for each log, but we will check that - val keys = Set[String](logs(0).getReportFeatureNames : _*) - - for ( log <- logs ) - if ( keys.sameElements(Set(log.getReportFeatureNames)) ) - throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + - "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) - - keys - } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala new file mode 100644 index 000000000..a23fe4485 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.util + +import java.io.{PrintStream, File} +import org.broadinstitute.sting.utils.io.{Resource} +import org.broadinstitute.sting.queue.engine.{JobRunInfo, QGraph} +import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} +import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.apache.commons.io.{FileUtils, IOUtils} + +/** + * Writes out RunInfo to a GATKReport + */ +class QJobsReporter(val disabled: Boolean, val reportFile: File, val pdfFile: Option[File]) extends Logging { + private val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" + + /** + * Write out a job report based on the finished jobs graph + * @param jobGraph + * @param enabledPlotting if true, we will plot the report as well with the JOB_REPORT_QUEUE_SCRIPT + */ + def write(jobGraph: QGraph, enabledPlotting: Boolean) { + if ( ! disabled ) { + logger.info("Writing JobLogging GATKReport to file " + reportFile) + printReport(jobGraph.getFunctionsAndStatus, reportFile) + + if ( enabledPlotting ) + pdfFile match { + case Some(file) => + logger.info("Plotting JobLogging GATKReport to file " + file) + plotReport(reportFile, file) + case None => + } + } + } + + private def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { + val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) + jobs foreach {case (qf, info) => qf.setRunInfo(info)} + val stream = new PrintStream(FileUtils.openOutputStream(dest)) + try { + printJobLogging(jobs.keys.toSeq, stream) + } finally { + IOUtils.closeQuietly(stream) + } + } + + private def plotReport(reportFile: File, pdfFile: File) { + val executor = new RScriptExecutor + executor.addLibrary(RScriptLibrary.GSALIB) + executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) + executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) + executor.exec() + } + + /** + * Prints the JobLogging logs to a GATKReport. First splits up the + * logs by group, and for each group generates a GATKReportTable + */ + private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { + // create the report + val report: GATKReport = new GATKReport + + // create a table for each group of logs + for ( (group, groupLogs) <- groupLogs(logs) ) { + val keys = logKeys(groupLogs) + report.addTable(group, "Job logs for " + group, keys.size) + val table: GATKReportTable = report.getTable(group) + + // add the columns + keys.foreach(table.addColumn(_)) + for (log <- groupLogs) { + for ( key <- keys ) + table.set(log.getReportName, key, log.getReportFeature(key)) + } + } + + report.print(stream) + } + + private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { + logs.groupBy(_.getReportGroup) + } + + private def logKeys(logs: Seq[QFunction]): Set[String] = { + // the keys should be the same for each log, but we will check that + val keys = Set[String](logs(0).getReportFeatureNames : _*) + + for ( log <- logs ) + if ( keys.sameElements(Set(log.getReportFeatureNames)) ) + throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + + "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) + + keys + } +}