diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java index 81a17b6ae..38fa060cc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectHeaders.java @@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker implements TreeRe headerLines = new LinkedHashSet(getSelectedHeaders(headerLines)); // Optionally add in the intervals. - if (includeIntervals && getToolkit().getArguments().intervals != null) { - for (IntervalBinding intervalBinding : getToolkit().getArguments().intervals) { - String source = intervalBinding.getSource(); - if (source == null) - continue; - File file = new File(source); - if (file.exists()) { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); - } else { - headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + if (includeIntervals) { + IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments; + if (intervalArguments.intervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.intervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); + } } } + + if (intervalArguments.excludeIntervals != null) { + for (IntervalBinding intervalBinding : intervalArguments.excludeIntervals) { + String source = intervalBinding.getSource(); + if (source == null) + continue; + File file = new File(source); + if (file.exists()) { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); + } else { + headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source)); + } + } + } + + if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging))); + } + + if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule))); + } + + if (intervalArguments.intervalPadding != 0) { + headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding))); + } } TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java new file mode 100644 index 000000000..3f76ae652 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalArgumentCollection.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.commandline; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; + +import java.util.List; + +public class IntervalArgumentCollection { + /** + * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). + * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). + * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> intervals = null; + + /** + * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). + * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). + */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> excludeIntervals = null; + + /** + * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions + * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * Should abutting (but not overlapping) intervals be treated as separate intervals? + */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; + + /** + * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. + */ + @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) + public int intervalPadding = 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f9d6955c0..9b801be7d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -361,7 +360,6 @@ public class GenomeAnalysisEngine { * Returns a list of active, initialized read transformers * * @param walker the walker we need to apply read transformers too - * @return a non-null list of read transformers */ public void initializeReadTransformers(final Walker walker) { final List activeTransformers = new ArrayList(); @@ -672,41 +670,7 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ protected void initializeIntervals() { - // return if no interval arguments at all - if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) - return; - - // Note that the use of '-L all' is no longer supported. - - // if include argument isn't given, create new set of all possible intervals - - final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( - this.referenceDataSource, - argCollection.intervals, - argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, - argCollection.excludeIntervals); - - final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); - - // if no exclude arguments, can return parseIntervalArguments directly - if ( excludeSortedSet == null ) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - final long toPruneSize = includeSortedSet.coveredSize(); - final long toExcludeSize = excludeSortedSet.coveredSize(); - final long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); - } - - logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index ab09064dd..62ca38ad2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -26,11 +26,7 @@ package org.broadinstitute.sting.gatk.arguments; import net.sf.samtools.SAMFileReader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.IntervalBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; @@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; import java.util.ArrayList; @@ -100,41 +94,8 @@ public class GATKArgumentCollection { @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); - /** - * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). - * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). - * To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped. - */ - @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> intervals = null; - - /** - * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. - * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). - * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). - */ - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) - public List> excludeIntervals = null; - - /** - * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions - * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). - */ - @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) - public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; - - /** - * Should abutting (but not overlapping) intervals be treated as separate intervals? - */ - @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - /** - * For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'. - */ - @Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false) - public int intervalPadding = 0; + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c647a7b80..7374dda14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.IntervalArgumentCollection; import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; @@ -534,6 +535,47 @@ public class IntervalUtils { } } + public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) { + GenomeLocSortedSet intervals = null; + + // return if no interval arguments at all + if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) + return intervals; + + // Note that the use of '-L all' is no longer supported. + + // if include argument isn't given, create new set of all possible intervals + + final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + referenceDataSource, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + // if no exclude arguments, can return parseIntervalArguments directly + if ( excludeSortedSet == null ) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + final long toPruneSize = includeSortedSet.coveredSize(); + final long toExcludeSize = excludeSortedSet.coveredSize(); + final long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize())); + return intervals; + } + public static Pair parseIntervalBindingsPair( final ReferenceDataSource referenceDataSource, final List> intervals, diff --git a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java index 583a01417..9bdb86a48 100644 --- a/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/variant/vcf/VCFHeader.java @@ -73,6 +73,10 @@ public class VCFHeader { public static final String REFERENCE_KEY = "reference"; public static final String CONTIG_KEY = "contig"; public static final String INTERVALS_KEY = "intervals"; + public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals"; + public static final String INTERVAL_MERGING_KEY = "interval_merging"; + public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule"; + public static final String INTERVAL_PADDING_KEY = "interval_padding"; // were the input samples sorted originally (or are we sorting them)? private boolean samplesWereAlreadySorted = true; diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 35f9d4137..2be2745de 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") @@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List> intervalArgs = new ArrayList>(1); intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); + IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); } private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala index 28be82136..23a99b586 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala @@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util import java.io.File import org.broadinstitute.sting.utils.io.FileExtension +import java.util.Date +import java.net.URL /** * An extension of java.io.File that can be pulled from or pushed to a remote location. @@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension { def pullToLocal() def pushToRemote() def deleteRemote() + def createUrl(expiration: Date): URL def remoteDescription: String }