From 2ec3852acd4aa250a8d024d861f713630c71fd5b Mon Sep 17 00:00:00 2001 From: kshakir Date: Sun, 4 Nov 2012 23:42:02 -0500 Subject: [PATCH 01/24] Scala classes were only returning direct subclasses (confirmed when inspected in debugger) so changed PluginManager to allow specifying the explicit subclass. Removed some generics from PluginManager for now until able to figure out syntax for requesting explicit subclass. QStatusMessenger uses a slightly more primitive Map[String, Seq[RemoteFile]] instead of Map[ArgumentSource, Seq[RemoteFile]]. Added an QCommandPlugin.initScript utility method for handling specialized script types. --- .../org/broadinstitute/sting/gatk/WalkerManager.java | 4 ++-- .../sting/utils/classloader/PluginManager.java | 11 ++++++----- .../org/broadinstitute/sting/queue/QCommandLine.scala | 11 ++++++++--- .../broadinstitute/sting/queue/QCommandPlugin.scala | 2 ++ .../src/org/broadinstitute/sting/queue/QScript.scala | 8 ++++++-- .../sting/queue/engine/QStatusMessenger.scala | 3 +-- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index fbacbddc4..28b5f918d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -350,11 +350,11 @@ public class WalkerManager extends PluginManager { * @return A name for this type of walker. */ @Override - public String getName(Class walkerType) { + public String getName(Class walkerType) { String walkerName = ""; if (walkerType.getAnnotation(WalkerName.class) != null) - walkerName = walkerType.getAnnotation(WalkerName.class).value().trim(); + walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); else walkerName = super.getName(walkerType); diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 43cc800d8..b39aae8ab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -101,7 +101,7 @@ public class PluginManager { * Create a new plugin manager. * @param pluginType Core type for a plugin. */ - public PluginManager(Class pluginType) { + public PluginManager(Class pluginType) { this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null); } @@ -110,7 +110,7 @@ public class PluginManager { * @param pluginType Core type for a plugin. * @param classpath Custom class path to search for classes. */ - public PluginManager(Class pluginType, List classpath) { + public PluginManager(Class pluginType, List classpath) { this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath); } @@ -120,7 +120,7 @@ public class PluginManager { * @param pluginCategory Provides a category name to the plugin. Must not be null. * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { this(pluginType, pluginCategory, pluginSuffix, null); } @@ -131,7 +131,7 @@ public class PluginManager { * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. * @param classpath Custom class path to search for classes. */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { this.pluginCategory = pluginCategory; this.pluginSuffix = pluginSuffix; @@ -149,6 +149,7 @@ public class PluginManager { } // Load all classes types filtering them by concrete. + @SuppressWarnings("unchecked") Set> allTypes = reflections.getSubTypesOf(pluginType); for( Class type: allTypes ) { // The plugin manager does not support anonymous classes; to be a plugin, a class must have a name. @@ -325,7 +326,7 @@ public class PluginManager { * @param pluginType The type of plugin. * @return A name for this type of plugin. */ - public String getName(Class pluginType) { + public String getName(Class pluginType) { String pluginName = ""; if (pluginName.length() == 0) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 65abaf7be..637174557 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -92,13 +92,19 @@ class QCommandLine extends CommandLineProgram with Logging { private lazy val qScriptPluginManager = { qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) - new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) + new PluginManager[QScript](qPluginType, Seq(qScriptClasses.toURI.toURL)) } private lazy val qCommandPlugin = { new PluginManager[QCommandPlugin](classOf[QCommandPlugin]) } + private lazy val allCommandPlugins = qCommandPlugin.createAllTypes() + + private lazy val qPluginType: Class[_ <: QScript] = { + allCommandPlugins.map(_.qScriptClass).headOption.getOrElse(classOf[QScript]) + } + /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. @@ -106,8 +112,6 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { ClassFieldCache.parsingEngine = this.parser - val allCommandPlugins = qCommandPlugin.createAllTypes() - if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) @@ -138,6 +142,7 @@ class QCommandLine extends CommandLineProgram with Logging { for (script <- allQScripts) { logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) + allCommandPlugins.foreach(_.initScript(script)) // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. //if (settings.run) script.pullInputs() diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala index 499c31554..eae6a6a92 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala @@ -6,4 +6,6 @@ import util.RemoteFileConverter trait QCommandPlugin { def statusMessenger: QStatusMessenger = null def remoteFileConverter: RemoteFileConverter = null + def qScriptClass: Class[_ <: QScript] = classOf[QScript] + def initScript(script: QScript) {} } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 8c834696c..eb8be183a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -149,13 +149,17 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteInputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(inputFields) + def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields)) /** * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(outputFields) + def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields)) + + private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = { + remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap + } private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = { fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala index c4151dafc..a1133b944 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -1,6 +1,5 @@ package org.broadinstitute.sting.queue.engine -import org.broadinstitute.sting.commandline.ArgumentSource import org.broadinstitute.sting.queue.util.RemoteFile /** @@ -8,7 +7,7 @@ import org.broadinstitute.sting.queue.util.RemoteFile */ trait QStatusMessenger { def started() - def done(inputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]], outputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) + def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]]) def exit(message: String) def started(job: String) From 2c0bf89961e653caa8c21316db22a0edc306f3bd Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Oct 2012 16:57:08 -0400 Subject: [PATCH 03/24] Co-Reduction implementation in ReduceReads ReduceReads now co-reduces bams if they're passed in toghether with multiple -I. Co-reduction forces every variant region in one sample to be a variant region in all samples. Also: * Added integrationtest for co-reduction * Fixed bug with new no-recalculation implementation of the marksites object where the last object wasn't being removed after finalizing a variant region (updated MD5's accordingly) DEV-200 #resolve #time 8m --- .../reducereads/CompressionStash.java | 38 ++++++++ .../reducereads/MultiSampleCompressor.java | 49 ++++++---- .../compression/reducereads/ReduceReads.java | 2 +- .../reducereads/SingleSampleCompressor.java | 38 ++++---- .../reducereads/SlidingWindow.java | 89 ++++++++++--------- .../ReduceReadsIntegrationTest.java | 10 +-- .../reducereads/SimpleGenomeLoc.java | 43 +++++++++ 7 files changed, 185 insertions(+), 84 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java index 714a4df18..a6e5b6c5b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.utils.GenomeLocComparator; +import java.util.Collection; import java.util.TreeSet; /** @@ -18,4 +19,41 @@ public class CompressionStash extends TreeSet { public CompressionStash() { super(new GenomeLocComparator()); } + + /** + * Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc + * in the stash. + * + * @param insertLoc the new loc to be inserted + * @return true if the loc, or it's merged version, wasn't present in the list before. + */ + @Override + public boolean add(SimpleGenomeLoc insertLoc) { + TreeSet removedLocs = new TreeSet(); + for (SimpleGenomeLoc existingLoc : this) { + if (existingLoc.isPast(insertLoc)) { + break; // if we're past the loc we're done looking for overlaps. + } + if (existingLoc.equals(insertLoc)) { + return false; // if this loc was already present in the stash, we don't need to insert it. + } + if (existingLoc.contiguousP(insertLoc)) { + removedLocs.add(existingLoc); // list the original loc for merging + } + } + for (SimpleGenomeLoc loc : removedLocs) { + this.remove(loc); // remove all locs that will be merged + } + removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged + return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash + } + + @Override + public boolean addAll(Collection locs) { + boolean result = false; + for (SimpleGenomeLoc loc : locs) { + result |= this.add(loc); + } + return result; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 2c3439010..f348225ca 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.HashMap; import java.util.Map; -import java.util.SortedSet; +import java.util.Set; import java.util.TreeSet; /* @@ -41,7 +42,7 @@ import java.util.TreeSet; * * @author depristo */ -public class MultiSampleCompressor implements Compressor { +public class MultiSampleCompressor { protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); protected Map compressorsPerSample = new HashMap(); @@ -55,30 +56,44 @@ public class MultiSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction, - final CompressionStash compressionStash) { + final boolean allowPolyploidReduction) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); } } - @Override - public Iterable addAlignment(GATKSAMRecord read) { - String sample = read.getReadGroup().getSample(); - SingleSampleCompressor compressor = compressorsPerSample.get(sample); + public Set addAlignment(GATKSAMRecord read) { + String sampleName = read.getReadGroup().getSample(); + SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); if ( compressor == null ) - throw new ReviewedStingException("No compressor for sample " + sample); - return compressor.addAlignment(read); + throw new ReviewedStingException("No compressor for sample " + sampleName); + Pair, CompressionStash> readsAndStash = compressor.addAlignment(read); + Set reads = readsAndStash.getFirst(); + CompressionStash regions = readsAndStash.getSecond(); + + reads.addAll(closeVariantRegionsInAllSamples(regions)); + + return reads; } - @Override - public Iterable close() { - SortedSet reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); - for ( SingleSampleCompressor comp : compressorsPerSample.values() ) - for ( GATKSAMRecord read : comp.close() ) - reads.add(read); + public Set close() { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { + Pair, CompressionStash> readsAndStash = sample.close(); + reads = readsAndStash.getFirst(); + } + return reads; + } + + private Set closeVariantRegionsInAllSamples(CompressionStash regions) { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + if (!regions.isEmpty()) { + for (SingleSampleCompressor sample : compressorsPerSample.values()) { + reads.addAll(sample.closeVariantRegions(regions)); + } + } return reads; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index b6761f4a6..a05992cb4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -330,7 +330,7 @@ public class ReduceReads extends ReadWalker, ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 82a433300..ac3388795 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.Set; import java.util.TreeSet; /** @@ -10,7 +12,7 @@ import java.util.TreeSet; * @author carneiro, depristo * @version 3.0 */ -public class SingleSampleCompressor implements Compressor { +public class SingleSampleCompressor { final private int contextSize; final private int downsampleCoverage; final private int minMappingQuality; @@ -20,11 +22,11 @@ public class SingleSampleCompressor implements Compressor { final private ReduceReads.DownsampleStrategy downsampleStrategy; final private int nContigs; final private boolean allowPolyploidReduction; - final CompressionStash compressionStash; private SlidingWindow slidingWindow; private int slidingWindowCounter; + public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new TreeSet(), new CompressionStash()); public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, @@ -34,8 +36,7 @@ public class SingleSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction, - final CompressionStash compressionStash) { + final boolean allowPolyploidReduction) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -46,15 +47,11 @@ public class SingleSampleCompressor implements Compressor { this.downsampleStrategy = downsampleStrategy; this.nContigs = nContigs; this.allowPolyploidReduction = allowPolyploidReduction; - this.compressionStash = compressionStash; } - /** - * @{inheritDoc} - */ - @Override - public Iterable addAlignment( GATKSAMRecord read ) { - TreeSet result = new TreeSet(new AlignmentStartWithNoTiesComparator()); + public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + CompressionStash stash = new CompressionStash(); int readOriginalStart = read.getUnclippedStart(); // create a new window if: @@ -63,22 +60,27 @@ public class SingleSampleCompressor implements Compressor { (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window // close the current sliding window - result.addAll(slidingWindow.close()); + Pair, CompressionStash> readsAndStash = slidingWindow.close(); + reads = readsAndStash.getFirst(); + stash = readsAndStash.getSecond(); slidingWindow = null; // so we create a new one on the next if } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction); slidingWindowCounter++; } - result.addAll(slidingWindow.addRead(read)); - return result; + stash.addAll(slidingWindow.addRead(read)); + return new Pair, CompressionStash>(reads, stash); } - @Override - public Iterable close() { - return (slidingWindow != null) ? slidingWindow.close() : new TreeSet(); + public Pair, CompressionStash> close() { + return (slidingWindow != null) ? slidingWindow.close() : emptyPair; + } + + public Set closeVariantRegions(CompressionStash regions) { + return slidingWindow.closeVariantRegions(regions); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24cacd997..24a3ba3cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -6,8 +6,10 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -55,7 +57,8 @@ public class SlidingWindow { private final int nContigs; private boolean allowPolyploidReductionInGeneral; - private CompressionStash compressionStash; + + private static CompressionStash emptyRegions = new CompressionStash(); /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -87,7 +90,7 @@ public class SlidingWindow { } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -124,7 +127,6 @@ public class SlidingWindow { this.nContigs = nContigs; this.allowPolyploidReductionInGeneral = allowPolyploidReduction; - this.compressionStash = compressionStash; } /** @@ -138,7 +140,7 @@ public class SlidingWindow { * @param read the read * @return a list of reads that have been finished by sliding the window. */ - public List addRead(GATKSAMRecord read) { + public CompressionStash addRead(GATKSAMRecord read) { addToHeader(windowHeader, read); // update the window header counts readsInWindow.add(read); // add read to sliding reads return slideWindow(read.getUnclippedStart()); @@ -152,8 +154,9 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. */ - private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) { + private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) { boolean foundStart = false; + final int windowHeaderStart = getStartLocation(windowHeader); int variantRegionStartIndex = 0; for (int i=from; i slideWindow(final int incomingReadUnclippedStart) { - List finalizedReads = new LinkedList(); - + protected CompressionStash slideWindow(final int incomingReadUnclippedStart) { final int windowHeaderStartLocation = getStartLocation(windowHeader); + CompressionStash regions = emptyRegions; + boolean forceClose = true; if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) { markSites(incomingReadUnclippedStart); int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation; int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, markedSites.getVariantSiteBitSet()); - finalizedReads = closeVariantRegions(regions, false); - - while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.pollFirst(); - } + regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - return finalizedReads; + // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions + while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { + readsInWindow.pollFirst(); + } + + return regions; } @@ -623,31 +629,27 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - return result; // finalized reads will be downsampled if necessary + return result; // finalized reads will be downsampled if necessary } - - private List closeVariantRegions(CompressionStash regions, boolean forceClose) { - List allReads = new LinkedList(); + public Set closeVariantRegions(CompressionStash regions) { + TreeSet allReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { int lastStop = -1; + int windowHeaderStart = getStartLocation(windowHeader); + for (SimpleGenomeLoc region : regions) { - int start = region.getStart(); - int stop = region.getStop(); + if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) { + int start = region.getStart() - windowHeaderStart; + int stop = region.getStop() - windowHeaderStart; - if (!region.isFinished()) { - if(forceClose) // region is unfinished but we're forcing the close of this window - stop = windowHeader.size() - 1; - else - continue; // region is unfinished and we're not forcing the close of this window + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track + lastStop = stop; } - - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); - lastStop = stop; } - for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) - windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + for (int i = 0; i <= lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); } return allReads; } @@ -681,23 +683,24 @@ public class SlidingWindow { * * @return All reads generated */ - public List close() { + public Pair, CompressionStash> close() { // mark variant regions - List finalizedReads = new LinkedList(); + Set finalizedReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + CompressionStash regions = new CompressionStash(); + boolean forceCloseUnfinishedRegions = true; if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); - CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), markedSites.getVariantSiteBitSet()); - finalizedReads = closeVariantRegions(regions, true); + regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); + finalizedReads = closeVariantRegions(regions); if (!windowHeader.isEmpty()) { finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } - } - return finalizedReads; + return new Pair, CompressionStash>(finalizedReads, regions); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 50500536f..1e539dc9d 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -26,23 +26,23 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "46ea88e32bae3072f5cd68a0db4b55f1"); + RRTest("testDefaultCompression ", L, "98080d3c53f441564796fc143cf510da"); } @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c3784a0b42f5456b705f9b152a4b697a"); + RRTest("testMultipleIntervals ", intervals, "c5dcdf4edf368b5b897d66f76034d9f0"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e385eb0ae5768f8507671d5303a212d5"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "27cb99e87eda5e46187e56f50dd37f26"); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "6b5546be9363e493b9838542f5dc8cae"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "4e7f111688d49973c35669855b7a2eaf"); } @Test(enabled = true) @@ -83,7 +83,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList(""))); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("5c30fde961a1357bf72c15144c01981b"))); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java index 45e105751..51d8aad63 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -1,6 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.SortedSet; /** * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations @@ -27,4 +31,43 @@ public class SimpleGenomeLoc extends GenomeLoc { public boolean isFinished() { return finished; } + + @Requires("a != null && b != null") + public static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException { + if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) { + throw new ReviewedStingException("Tried to merge unmapped genome locs"); + } + + if (!(a.contiguousP(b))) { + throw new ReviewedStingException("The two genome locs need to be contiguous"); + } + + + return new SimpleGenomeLoc(a.getContig(), a.contigIndex, + Math.min(a.getStart(), b.getStart()), + Math.max(a.getStop(), b.getStop()), + a.isFinished()); + } + + /** + * Merges a list of *sorted* *contiguous* locs into one + * + * @param sortedLocs a sorted list of contiguous locs + * @return one merged loc + */ + public static SimpleGenomeLoc merge(SortedSet sortedLocs) { + SimpleGenomeLoc previousLoc = null; + for (SimpleGenomeLoc loc : sortedLocs) { + if (loc.isUnmapped()) { + throw new ReviewedStingException("Tried to merge unmapped genome locs"); + } + if (previousLoc != null && !previousLoc.contiguousP(loc)) { + throw new ReviewedStingException("The genome locs need to be contiguous"); + } + previousLoc = loc; + } + SimpleGenomeLoc firstLoc = sortedLocs.first(); + SimpleGenomeLoc lastLoc = sortedLocs.last(); + return merge(firstLoc, lastLoc); + } } From 95a4ba57bf1d13ee48402231495c19a0eed35546 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 13 Nov 2012 01:18:37 -0500 Subject: [PATCH 09/24] Implementation of BySampleSAMFileWriter ReduceReads now works with the n-way-out capability, splitting by sample. DEV-27 #resolve #time 3m --- .../compression/reducereads/ReduceReads.java | 37 +- .../utils/sam/BySampleSAMFileWriter.java | 70 ++++ .../sting/utils/sam/NWaySAMFileWriter.java | 374 +++++++++--------- 3 files changed, 290 insertions(+), 191 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index a05992cb4..3712e4524 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -25,6 +25,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.util.SequenceUtil; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; @@ -45,6 +48,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -86,7 +90,8 @@ import java.util.*; public class ReduceReads extends ReadWalker, ReduceReadsStash> { @Output - private StingSAMFileWriter out; + private StingSAMFileWriter out = null; + private SAMFileWriter writerToUse = null; /** * The number of bases to keep around mismatches (potential variation) @@ -196,6 +201,10 @@ public class ReduceReads extends ReadWalker, ReduceRea @Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false) private int nContigs = 2; + @Hidden + @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) + private boolean nwayout = false; + @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) private int debugLevel = 0; @@ -227,6 +236,7 @@ public class ReduceReads extends ReadWalker, ReduceRea SortedSet intervalList; private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag + private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; /** * Basic generic initialization of the readNameHash and the intervalList. Output initialization @@ -242,10 +252,24 @@ public class ReduceReads extends ReadWalker, ReduceRea if (toolkit.getIntervals() != null) intervalList.addAll(toolkit.getIntervals()); - if (!NO_PG_TAG) - Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME); - else + + // todo -- rework the whole NO_PG_TAG thing + final boolean preSorted = true; + final boolean indexOnTheFly = true; + final boolean generateMD5 = true; + final boolean keep_records = true; + final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; + if (nwayout) { + SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); + writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true); + } + else { + writerToUse = out; out.setPresorted(false); + if (!NO_PG_TAG) { + Utils.setupWriter(out, toolkit, !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + } + } } /** @@ -386,6 +410,9 @@ public class ReduceReads extends ReadWalker, ReduceRea // output any remaining reads in the compressor for (GATKSAMRecord read : stash.close()) outputRead(read); + + if (nwayout) + writerToUse.close(); } /** @@ -554,7 +581,7 @@ public class ReduceReads extends ReadWalker, ReduceRea if (!DONT_COMPRESS_READ_NAMES) compressReadName(read); - out.addAlignment(read); + writerToUse.addAlignment(read); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java new file mode 100644 index 000000000..6bad58d9f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: Nov 13 + */ +public class BySampleSAMFileWriter extends NWaySAMFileWriter{ + + private final Map sampleToWriterMap; + + public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records); + + sampleToWriterMap = new HashMap(toolkit.getSAMFileHeader().getReadGroups().size() * 2); + + for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) { + for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) { + String sample = rg.getSample(); + if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) { + throw new ReviewedStingException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try NWaySAMFileWriter instead."); + } + else { + sampleToWriterMap.put(sample, readerID); + } + } + } + } + + @Override + public void addAlignment(SAMRecord samRecord) { + super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index fa07523f3..83d1c99bf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -1,186 +1,188 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.io.File; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: May 31, 2011 - * Time: 3:52:49 PM - * To change this template use File | Settings | File Templates. - */ -public class NWaySAMFileWriter implements SAMFileWriter { - - private Map writerMap = null; - private boolean presorted ; - GenomeAnalysisEngine toolkit; - boolean KEEP_ALL_PG_RECORDS = false; - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5) { - this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5) { - this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The in2out map must contain an entry for each input filename and map it - * onto a unique output file name. - * @param toolkit - * @param in2out - */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - if ( ! in2out.containsKey(fName) ) - throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); - outName = in2out.get(fName); - - if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ - "map file likely contains multiple entries for this input file"); - - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the - * input file name and adding ext instead (e.g. ".cleaned.bam"). - * onto a unique output file name. - * @param toolkit - * @param ext - */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - int pos ; - if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); - else { - if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); - else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); - } - String prefix = fName.substring(0,pos); - outName = prefix+ext; - - if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, - boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { - File f = new File(outName); - SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); - header.setSortOrder(order); - - if ( programRecord != null ) { - // --->> add program record - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - // <-- add program record ends here - } - SAMFileWriterFactory factory = new SAMFileWriterFactory(); - factory.setCreateIndex(indexOnTheFly); - factory.setCreateMd5File(generateMD5); - SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); - writerMap.put(id,sw); - } - - public Collection getWriters() { - return writerMap.values(); - } - - public void addAlignment(SAMRecord samRecord) { - final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); - String rg = samRecord.getStringAttribute("RG"); - if ( rg != null ) { - String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); - samRecord.setAttribute("RG",rg_orig); - } - writerMap.get(id).addAlignment(samRecord); - } - - public SAMFileHeader getFileHeader() { - return toolkit.getSAMFileHeader(); - } - - public void close() { - for ( SAMFileWriter w : writerMap.values() ) w.close(); - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: May 31, 2011 + * Time: 3:52:49 PM + * To change this template use File | Settings | File Templates. + */ +public class NWaySAMFileWriter implements SAMFileWriter { + + private Map writerMap = null; + private boolean presorted ; + GenomeAnalysisEngine toolkit; + boolean KEEP_ALL_PG_RECORDS = false; + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The in2out map must contain an entry for each input filename and map it + * onto a unique output file name. + * @param toolkit + * @param in2out + */ + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + if ( ! in2out.containsKey(fName) ) + throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); + outName = in2out.get(fName); + + if ( writerMap.containsKey( rid ) ) + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ + "map file likely contains multiple entries for this input file"); + + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the + * input file name and adding ext instead (e.g. ".cleaned.bam"). + * onto a unique output file name. + * @param toolkit + * @param ext + */ + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + int pos ; + if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); + else { + if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); + else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); + } + String prefix = fName.substring(0,pos); + outName = prefix+ext; + + if ( writerMap.containsKey( rid ) ) + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, + boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { + File f = new File(outName); + SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); + header.setSortOrder(order); + + if ( programRecord != null ) { + // --->> add program record + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + // <-- add program record ends here + } + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); + writerMap.put(id,sw); + } + + public Collection getWriters() { + return writerMap.values(); + } + + public void addAlignment(SAMRecord samRecord) { + final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); + String rg = samRecord.getStringAttribute("RG"); + if ( rg != null ) { + String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); + samRecord.setAttribute("RG",rg_orig); + } + addAlignment(samRecord, id); + } + + public void addAlignment(SAMRecord samRecord, SAMReaderID readerID) { + writerMap.get(readerID).addAlignment(samRecord); + } + + public SAMFileHeader getFileHeader() { + return toolkit.getSAMFileHeader(); + } + + public void close() { + for ( SAMFileWriter w : writerMap.values() ) w.close(); + } +} From cab8ba7c7528b212012fe8955df2fbaab75b7c8b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 13 Nov 2012 15:21:57 -0500 Subject: [PATCH 10/24] Breaking the utility to write @PG tags for SAMFileWriters and StingSAMFileWriters --- .../compression/reducereads/ReduceReads.java | 3 +- .../org/broadinstitute/sting/utils/Utils.java | 59 +++++++++++++++++-- .../sting/utils/sam/NWaySAMFileWriter.java | 21 ++----- 3 files changed, 59 insertions(+), 24 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3712e4524..3cdf3d75e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -256,7 +256,6 @@ public class ReduceReads extends ReadWalker, ReduceRea // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; - final boolean generateMD5 = true; final boolean keep_records = true; final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; if (nwayout) { @@ -267,7 +266,7 @@ public class ReduceReads extends ReadWalker, ReduceRea writerToUse = out; out.setPresorted(false); if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f4a200af0..b780d0966 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -687,23 +687,69 @@ public class Utils { array[i] = value; } - public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { - final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - - SAMFileHeader header = toolkit.getSAMFileHeader(); + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param toolkit the engine + * @param originalHeader original header + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param programRecord the program record for this program + */ + public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) { + SAMFileHeader header = originalHeader.clone(); List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); newRecords.add(programRecord); header.setProgramRecords(newRecords); + return header; + } + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns + * the new header to be added to the BAM writer. + * + * @param toolkit the engine + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a pre-filled header for the bam writer + */ + public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); + return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param writer BAM file writer + * @param toolkit the engine + * @param preSorted whether or not the writer can assume reads are going to be added are already sorted + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + */ + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME); writer.writeHeader(header); writer.setPresorted(preSorted); } - + + + /** + * Creates a program record (@PG) tag + * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a program record for the tool + */ public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); @@ -858,4 +904,5 @@ public class Utils { } return subLists; } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 83d1c99bf..cdf70884c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -28,11 +28,14 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; /** * Created by IntelliJ IDEA. @@ -138,21 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); - SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); - header.setSortOrder(order); - - if ( programRecord != null ) { - // --->> add program record - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - // <-- add program record ends here - } + SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord); SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5); From 8b749673bce448d1b92cae97649f161d0d04eef0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Nov 2012 13:59:34 -0500 Subject: [PATCH 11/24] centralize header element removal in reduce reads --- .../compression/reducereads/ReduceReads.java | 1 - .../compression/reducereads/SlidingWindow.java | 16 +++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3cdf3d75e..629a27c48 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -253,7 +253,6 @@ public class ReduceReads extends ReadWalker, ReduceRea intervalList.addAll(toolkit.getIntervals()); - // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; final boolean keep_records = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24a3ba3cb..fff1c20a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -220,7 +220,6 @@ public class SlidingWindow { regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { readsInWindow.pollFirst(); } @@ -607,9 +606,7 @@ public class SlidingWindow { toRemove.add(read); } } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); } return allReads; } @@ -805,9 +802,8 @@ public class SlidingWindow { hetReads.add(finalizeRunningConsensus()); } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); + return hetReads; } @@ -924,5 +920,11 @@ public class SlidingWindow { } } } + + private void removeReadsFromWindow (List readsToRemove) { + for (GATKSAMRecord read : readsToRemove) { + readsInWindow.remove(read); + } + } } From a3f59325016b9ffb78f71941aa9fd54bb9e4b3d6 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 26 Nov 2012 11:12:27 -0500 Subject: [PATCH 17/24] Fixed null pointer exception in Integration Tests When running Utils.setupWriter with NO_PG_TAG set, the writer was attempting to create a program record with the null pointer. Fixed. --- public/java/src/org/broadinstitute/sting/utils/Utils.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..544030f73 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -701,11 +701,13 @@ public class Utils { List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); - newRecords.add(programRecord); - header.setProgramRecords(newRecords); + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } return header; } From c3b7dd1374ece9cb8bffedd6518f5d6d9582eec0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:19:11 -0500 Subject: [PATCH 18/24] Misc cleanup in the HaplotypeCaller. Cleaning up unused arguments after recent changes to HC-GenotypingEngine --- .../haplotypecaller/GenotypingEngine.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 66 ++----------------- .../LikelihoodCalculationEngine.java | 1 - .../HaplotypeCallerIntegrationTest.java | 17 ++--- .../annotator/MappingQualityRankSumTest.java | 5 +- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../walkers/annotator/ReadPosRankSumTest.java | 2 +- 7 files changed, 21 insertions(+), 76 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index beec8a92e..4fc2dc8f7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -41,13 +41,11 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG ) { this.DEBUG = DEBUG; - this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 2b739a321..24b3309f1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -131,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; - @Advanced - @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) - protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; - - @Advanced - @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) - protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; - @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @@ -248,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -273,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); - // header lines for the experimental HaplotypeCaller-specific annotations - headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); - headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); - headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); - headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); - headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); - headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); - headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); - headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used @@ -298,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG ); } //--------------------------------------------------------------------------------------------------------------- @@ -428,43 +412,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - - if( !GENOTYPE_FULL_ACTIVE_REGION ) { - // add some custom annotations to the calls - - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } - } - } - } - // Calculate the event length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } - - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); - - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); - } - } - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); } @@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage + // BUGBUG: remove when positinal downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 304f8d5cb..29622ca17 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -169,7 +169,6 @@ public class LikelihoodCalculationEngine { } // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a57462d1d..007df3602 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,18 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3"); + HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); + HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "541aa8291f03ba33bd1ad3d731fd5657"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -43,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -54,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -65,7 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762"); } @Test @@ -78,7 +79,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e")); executeTest("HCTestStructuralIndels: ", spec); } @@ -92,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("425f1a0fb00d7145edf1c55e54346fae")); + Arrays.asList("40bf739fb2b1743642498efe79ea6342")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 82596a501..2679a169b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn final List refQuals, final List altQuals) { if (pileup != null && likelihoodMap == null) { - // no per-read likelihoods available: + // old UG snp-only path through the annotations for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { @@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + // BUGBUG: There needs to be a comparable isUsableBase check here if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) altQuals.add((double)el.getKey().getMappingQuality()); - - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 0df7aff71..e7c0e6b14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR ReadBackedPileup pileup = null; - if (stratifiedContexts != null) { + if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context != null ) pileup = context.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d01233bb2..334b89f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio final List refQuals, final List altQuals) { if (alleleLikelihoodMap == null) { - // use fast SNP-based version if we don't have per-read allele likelihoods + // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); From 59cef880d195937e2e2eee9344a7bcc0d2ff016b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:20:07 -0500 Subject: [PATCH 19/24] Updating HC integration tests because experimental, HC-specific annotations have been removed. --- .../walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 007df3602..f8ba1f4cc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } From 405f3c675d9daa589942e830db0870931741f113 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 01:07:00 -0500 Subject: [PATCH 20/24] Fix for GSA-649: GenomeLocSortedSet.overlaps is crazy slow. Also improved GenomeLocSortedSet.sizeBeforeLoc. --- .../traversals/TraverseActiveRegions.java | 1 - .../sting/utils/GenomeLocSortedSet.java | 49 +++++++++++++++---- .../utils/GenomeLocSortedSetUnitTest.java | 36 ++++++++++++++ 3 files changed, 76 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a2c37944a..3f20db0af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -80,7 +80,6 @@ public class TraverseActiveRegions extends TraversalEngine { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() ) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex > 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index 3d21e654f..6138e7396 100755 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; @@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(loc.getContigIndex() == 1); } + @Test + public void overlap() { + for ( int i = 1; i < 6; i++ ) { + final int start = i * 10; + mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1)); + } + + // test matches in and around interval + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 9, 9))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 10, 10))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 11))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 12))); + + // test matches spanning intervals + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 14, 20))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 15))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 30, 40))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + + // test miss + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 19))); + + // test exact match after miss + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 40, 41))); + + // test matches at beginning of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 5, 6))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 0, 10))); + + // test matches at end of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + } + @Test public void mergingOverlappingAbove() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50); From cc72aaefebfed89723c40403ae0dfd3a932ff78d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 01:11:23 -0500 Subject: [PATCH 21/24] Minor efficiency: use >= instead of > in test --- .../src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index ca1d385a2..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -149,7 +149,7 @@ public class GenomeLocSortedSet extends AbstractSet { previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); // if it matches an interval exactly, we are done - if ( previousOverlapSearchIndex > 0 ) + if ( previousOverlapSearchIndex >= 0 ) return true; // check whether it overlaps the interval before or after the insertion point From b1969a66bdcf755757517e77582b9d4e55caeb54 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 08:24:41 -0500 Subject: [PATCH 22/24] Update docs --- .../gatk/walkers/fasta/FastaAlternateReferenceMaker.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 2b9744b89..22c6097cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -47,6 +47,12 @@ import java.util.List; *

* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * + * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * * Several important notes: * 1) if there are multiple variants that start at a site, it chooses one of them randomly. * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. From e199562c2521032abd003bd315cbfad294208e93 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 10:26:17 -0500 Subject: [PATCH 23/24] I have pulled out all of the documentation URLs and put them into the HelpUtils class as static variables; this way, Appistry can change links as needed to point commercial users to their own internal forum without having to muck things up all over our source. Added some TODOs for Geraldine to update links in the GATK docs that still point to the old wiki. Sorry that I am pushing into stable, but that's what Appistry is pulling from for their release next week (and unstable has been failing forever). --- .../sting/commandline/CommandLineProgram.java | 3 ++- .../sting/gatk/CommandLineGATK.java | 24 ++++++++++++------- .../sting/gatk/filters/FilterManager.java | 6 ++--- .../gatk/walkers/diffengine/DiffEngine.java | 3 ++- .../gatk/walkers/diffengine/DiffObjects.java | 3 ++- .../stratifications/JexlExpression.java | 2 +- .../VariantDataManager.java | 3 ++- .../VariantValidationAssessor.java | 3 +-- .../sting/utils/exceptions/UserException.java | 5 ++-- .../sting/utils/help/ForumAPIUtils.java | 8 +++---- .../sting/utils/help/GATKDocUtils.java | 2 +- .../sting/utils/help/HelpUtils.java | 9 +++++++ .../SortingVariantContextWriterBase.java | 1 - .../utils/codecs/vcf/VCFIntegrationTest.java | 1 - 14 files changed, 43 insertions(+), 30 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index d77ae67cf..fb15a3722 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.ApplicationDetails; import org.broadinstitute.sting.utils.help.HelpFormatter; +import org.broadinstitute.sting.utils.help.HelpUtils; import java.io.IOException; import java.util.*; @@ -288,7 +289,7 @@ public abstract class CommandLineProgram { */ private static void printDocumentationReference() { errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); - errorPrintf("commonly asked questions http://www.broadinstitute.org/gatk%n"); + errorPrintf("commonly asked questions " + HelpUtils.BASE_GATK_URL + "%n"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 0daad2c2b..d1711ba4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.ApplicationDetails; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.GATKDocUtils; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; @@ -118,17 +119,24 @@ public class CommandLineGATK extends CommandLineExecutable { public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error final String message = t.getMessage(); if ( message == null ) return; - // we know what to do about the common "Too many open files" error + // too many open files error if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || - message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing @@ -138,12 +146,10 @@ public class CommandLineGATK extends CommandLineExecutable { // disk is full if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - // masked out of memory error - if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); } /** @@ -155,7 +161,7 @@ public class CommandLineGATK extends CommandLineExecutable { List header = new ArrayList(); header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to http://www.broadinstitute.org/gatk"); + header.add("For support and documentation go to " + HelpUtils.BASE_GATK_URL); return header; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index 5ca8a1779..89099c587 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -25,11 +25,9 @@ package org.broadinstitute.sting.gatk.filters; -import com.google.common.base.Function; -import com.google.common.collect.Collections2; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.help.GATKDocUtils; +import org.broadinstitute.sting.utils.help.HelpUtils; import java.util.Collection; import java.util.List; @@ -73,7 +71,7 @@ public class FilterManager extends PluginManager { return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, userFriendlyListofReadFilters(availableFilters), - "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); + "Please consult the GATK Documentation (" + HelpUtils.GATK_DOCS_URL + ") for more information."); } private String userFriendlyListofReadFilters(List> filters) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index 40ed26608..579b84d96 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -282,7 +282,8 @@ public class DiffEngine { // now that we have a specific list of values we want to show, display them GATKReport report = new GATKReport(); final String tableName = "differences"; - report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", 3); + // TODO for Geraldine -- link needs to be updated below + report.addTable(tableName, "Summarized differences between the master and test files. See [ask Geraldine to fix link to DiffEngine wiki] for more information", 3); final GATKReportTable table = report.getTable(tableName); table.addColumn("Difference"); table.addColumn("NumberOfOccurrences"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java index 92e2e2dc4..5951ee7d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjects.java @@ -138,7 +138,8 @@ public class DiffObjects extends RodWalker { /** * Writes out a file of the DiffEngine format: * - * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine + * TODO for Geraldine -- link needs to be updated below (and also in SelectVariants and RefSeqCodec GATK docs) + * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine */ @Output(doc="File to which results should be written",required=true) protected PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java index dc5438358..c89c4be66 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/JexlExpression.java @@ -13,7 +13,7 @@ import java.util.Set; /** * Stratifies the eval RODs by user-supplied JEXL expressions * - * See http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions for more details + * See http://gatkforums.broadinstitute.org/discussion/1255/what-are-jexl-expressions-and-how-can-i-use-them-with-the-gatk for more details */ public class JexlExpression extends VariantStratifier implements StandardStratification { // needs to know the jexl expressions diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index aacd987d5..3382a1d9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -80,7 +81,7 @@ public class VariantDataManager { final double theSTD = standardDeviation(theMean, iii); logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); if( Double.isNaN(theMean) ) { - throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See http://www.broadinstitute.org/gsa/wiki/index.php/VariantAnnotator"); + throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpUtils.GATK_FORUM_URL + "discussion/49/using-variant-annotator"); } foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index a301867fc..9236247f1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -51,8 +51,7 @@ import java.util.*; * The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). * The tool produces a VCF that is annotated with information pertaining to plate quality control and by * default is soft-filtered by high no-call rate or low Hardy-Weinberg probability. - * If you have .ped files, please first convert them to VCF format - * (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf). + * If you have .ped files, please first convert them to VCF format. * *

Input

*

diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index a49a12292..a2ec35ae2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -267,7 +268,7 @@ public class UserException extends ReviewedStingException { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); + super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpUtils.GATK_FORUM_URL + "discussion/59/companion-utilities-replacereadgroups to fix this problem", read.getReadName())); } } @@ -343,7 +344,7 @@ public class UserException extends ReviewedStingException { super(String.format("Lexicographically sorted human genome sequence detected in %s." + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: http://www.broadinstitute.org/gsa/wiki/index.php/ReorderSam" + + "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.GATK_FORUM_URL + "discussion/58/companion-utilities-reordersam" + "\n %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index fe5f48a48..64238dc73 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -44,14 +44,13 @@ public class ForumAPIUtils { /** * How we post to the forum */ - private final static String API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; final private static String ACCESS_TOKEN = "access_token="; public static List getPostedTools(String forumKey) { Gson gson = new Gson(); List output = new ArrayList(); - String text = httpGet(API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); + String text = httpGet(HelpUtils.GATK_FORUM_API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); APIQuery details = gson.fromJson(text, APIQuery.class); ForumDiscussion[] discussions = details.Discussions; @@ -159,7 +158,7 @@ public class ForumAPIUtils { Gson gson = new Gson(); String data = gson.toJson(post.getPostData()); - httpPost(data, API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); + httpPost(data, HelpUtils.GATK_FORUM_API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); } @@ -167,8 +166,7 @@ public class ForumAPIUtils { class APIQuery { ForumDiscussion[] Discussions; - public APIQuery() { - } + public APIQuery() {} } } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java index 4ec2ac6d7..21054a794 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -28,7 +28,7 @@ public class GATKDocUtils { /** * The URL root for RELEASED GATKDOC units */ - public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gatk/gatkdocs/"; + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = HelpUtils.GATK_DOCS_URL; /** * The URL root for STABLE GATKDOC units */ diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 645ab34c1..1bc20d5a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -32,6 +32,15 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import java.lang.reflect.Field; public class HelpUtils { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + + + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { try { Class type = getClassForDoc(classDoc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/SortingVariantContextWriterBase.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/SortingVariantContextWriterBase.java index 18d91ef3f..1f3cdd0fe 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/SortingVariantContextWriterBase.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/SortingVariantContextWriterBase.java @@ -71,7 +71,6 @@ abstract class SortingVariantContextWriterBase implements VariantContextWriter { this.takeOwnershipOfInner = takeOwnershipOfInner; // has to be PriorityBlockingQueue to be thread-safe - // see http://getsatisfaction.com/gsa/topics/missing_loci_output_in_multi_thread_mode_when_implement_sortingvcfwriterbase?utm_content=topic_link&utm_medium=email&utm_source=new_topic this.queue = new PriorityBlockingQueue(50, new VariantContextComparator()); this.mostUpstreamWritableLoc = BEFORE_MOST_UPSTREAM_LOC; diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index b2a4ac2da..b9ce58992 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -28,7 +28,6 @@ public class VCFIntegrationTest extends WalkerTest { } @Test(enabled = true) - // See https://getsatisfaction.com/gsa/topics/support_vcf_4_1_structural_variation_breakend_alleles?utm_content=topic_link&utm_medium=email&utm_source=new_topic public void testReadingAndWritingBreakpointAlleles() { String testVCF = privateTestDir + "breakpoint-example.vcf"; From 4543ece0889cc07acc9d2a9cbefda7882ae340ef Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 11:00:33 -0500 Subject: [PATCH 24/24] Fixing parsing of genomelocs that contain colons in the contig names (which is allowed by the spec) as reported on the forum. Added unit test for this case. --- .../sting/utils/GenomeLocParser.java | 2 +- .../sting/utils/GenomeLocParserUnitTest.java | 21 ++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index a3ffe708c..bf60b4a80 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -374,7 +374,7 @@ public final class GenomeLocParser { int start = 1; int stop = -1; - final int colonIndex = str.indexOf(":"); + final int colonIndex = str.lastIndexOf(":"); if(colonIndex == -1) { contig = str.substring(0, str.length()); // chr1 stop = Integer.MAX_VALUE; diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index e9f138a0e..e4313b30a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -74,6 +76,23 @@ public class GenomeLocParserUnitTest extends BaseTest { genomeLocParser.parseGenomeLoc("Bad:0-1"); } + @Test + public void testContigHasColon() { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + SAMSequenceRecord rec = new SAMSequenceRecord("c:h:r1", 10); + rec.setSequenceLength(10); + dict.addSequence(rec); + header.setSequenceDictionary(dict); + + final GenomeLocParser myGenomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + GenomeLoc loc = myGenomeLocParser.parseGenomeLoc("c:h:r1:4-5"); + assertEquals(0, loc.getContigIndex()); + assertEquals(loc.getStart(), 4); + assertEquals(loc.getStop(), 5); + } + @Test public void testParseGoodString() { GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-10"); @@ -81,7 +100,7 @@ public class GenomeLocParserUnitTest extends BaseTest { assertEquals(loc.getStop(), 10); assertEquals(loc.getStart(), 1); } - + @Test public void testCreateGenomeLoc1() { GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 1, 100);