Merge branch 'master' of github.com:broadinstitute/gsa-unstable
This commit is contained in:
commit
e47a389b26
|
|
@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
|
|
@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker<Integer, Integer> implements TreeRe
|
|||
headerLines = new LinkedHashSet<VCFHeaderLine>(getSelectedHeaders(headerLines));
|
||||
|
||||
// Optionally add in the intervals.
|
||||
if (includeIntervals && getToolkit().getArguments().intervals != null) {
|
||||
for (IntervalBinding<Feature> intervalBinding : getToolkit().getArguments().intervals) {
|
||||
String source = intervalBinding.getSource();
|
||||
if (source == null)
|
||||
continue;
|
||||
File file = new File(source);
|
||||
if (file.exists()) {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
|
||||
} else {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source));
|
||||
if (includeIntervals) {
|
||||
IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments;
|
||||
if (intervalArguments.intervals != null) {
|
||||
for (IntervalBinding<Feature> intervalBinding : intervalArguments.intervals) {
|
||||
String source = intervalBinding.getSource();
|
||||
if (source == null)
|
||||
continue;
|
||||
File file = new File(source);
|
||||
if (file.exists()) {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
|
||||
} else {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (intervalArguments.excludeIntervals != null) {
|
||||
for (IntervalBinding<Feature> intervalBinding : intervalArguments.excludeIntervals) {
|
||||
String source = intervalBinding.getSource();
|
||||
if (source == null)
|
||||
continue;
|
||||
File file = new File(source);
|
||||
if (file.exists()) {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
|
||||
} else {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging)));
|
||||
}
|
||||
|
||||
if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule)));
|
||||
}
|
||||
|
||||
if (intervalArguments.intervalPadding != 0) {
|
||||
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding)));
|
||||
}
|
||||
}
|
||||
|
||||
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class IntervalArgumentCollection {
|
||||
/**
|
||||
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
|
||||
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
|
||||
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
|
||||
*/
|
||||
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> intervals = null;
|
||||
|
||||
/**
|
||||
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
|
||||
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
|
||||
*/
|
||||
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> excludeIntervals = null;
|
||||
|
||||
/**
|
||||
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
|
||||
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
|
||||
*/
|
||||
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
|
||||
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
|
||||
|
||||
/**
|
||||
* Should abutting (but not overlapping) intervals be treated as separate intervals?
|
||||
*/
|
||||
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
|
||||
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
|
||||
|
||||
/**
|
||||
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
|
||||
*/
|
||||
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
|
||||
public int intervalPadding = 0;
|
||||
}
|
||||
|
|
@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
|
@ -361,7 +360,6 @@ public class GenomeAnalysisEngine {
|
|||
* Returns a list of active, initialized read transformers
|
||||
*
|
||||
* @param walker the walker we need to apply read transformers to
|
||||
* @return a non-null list of read transformers
|
||||
*/
|
||||
public void initializeReadTransformers(final Walker walker) {
|
||||
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
|
||||
|
|
@ -672,41 +670,7 @@ public class GenomeAnalysisEngine {
|
|||
* Setup the intervals to be processed
|
||||
*/
|
||||
protected void initializeIntervals() {
|
||||
// return if no interval arguments at all
|
||||
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
|
||||
return;
|
||||
|
||||
// Note that the use of '-L all' is no longer supported.
|
||||
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
|
||||
final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
|
||||
this.referenceDataSource,
|
||||
argCollection.intervals,
|
||||
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
|
||||
argCollection.excludeIntervals);
|
||||
|
||||
final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
|
||||
final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if ( excludeSortedSet == null )
|
||||
intervals = includeSortedSet;
|
||||
|
||||
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||
else {
|
||||
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
final long toPruneSize = includeSortedSet.coveredSize();
|
||||
final long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
final long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
|
||||
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
|
||||
intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -26,11 +26,7 @@
|
|||
package org.broadinstitute.sting.gatk.arguments;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
|
|
@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
|||
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -100,41 +94,8 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
|
||||
public List<String> readFilters = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
|
||||
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
|
||||
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
|
||||
*/
|
||||
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> intervals = null;
|
||||
|
||||
/**
|
||||
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
|
||||
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
|
||||
*/
|
||||
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> excludeIntervals = null;
|
||||
|
||||
/**
|
||||
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
|
||||
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
|
||||
*/
|
||||
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
|
||||
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
|
||||
|
||||
/**
|
||||
* Should abutting (but not overlapping) intervals be treated as separate intervals?
|
||||
*/
|
||||
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
|
||||
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
|
||||
|
||||
/**
|
||||
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
|
||||
*/
|
||||
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
|
||||
public int intervalPadding = 0;
|
||||
@ArgumentCollection
|
||||
public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection();
|
||||
|
||||
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
|
||||
public File referenceFile = null;
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList;
|
|||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.IntervalArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -534,6 +535,47 @@ public class IntervalUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) {
|
||||
GenomeLocSortedSet intervals = null;
|
||||
|
||||
// return if no interval arguments at all
|
||||
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
|
||||
return intervals;
|
||||
|
||||
// Note that the use of '-L all' is no longer supported.
|
||||
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
|
||||
final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
|
||||
referenceDataSource,
|
||||
argCollection.intervals,
|
||||
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
|
||||
argCollection.excludeIntervals);
|
||||
|
||||
final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
|
||||
final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if ( excludeSortedSet == null )
|
||||
intervals = includeSortedSet;
|
||||
|
||||
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||
else {
|
||||
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
final long toPruneSize = includeSortedSet.coveredSize();
|
||||
final long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
final long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
|
||||
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
|
||||
return intervals;
|
||||
}
|
||||
|
||||
public static Pair<GenomeLocSortedSet, GenomeLocSortedSet> parseIntervalBindingsPair(
|
||||
final ReferenceDataSource referenceDataSource,
|
||||
final List<IntervalBinding<Feature>> intervals,
|
||||
|
|
|
|||
|
|
@ -73,6 +73,10 @@ public class VCFHeader {
|
|||
public static final String REFERENCE_KEY = "reference";
|
||||
public static final String CONTIG_KEY = "contig";
|
||||
public static final String INTERVALS_KEY = "intervals";
|
||||
public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
|
||||
public static final String INTERVAL_MERGING_KEY = "interval_merging";
|
||||
public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
|
||||
public static final String INTERVAL_PADDING_KEY = "interval_padding";
|
||||
|
||||
// were the input samples sorted originally (or are we sorting them)?
|
||||
private boolean samplesWereAlreadySorted = true;
|
||||
|
|
|
|||
|
|
@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1);
|
||||
intervalArgs.add(new IntervalBinding<Feature>(picardIntervalFile.getAbsolutePath()));
|
||||
|
||||
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser);
|
||||
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
|
||||
|
|
@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1);
|
||||
intervalArgs.add(new IntervalBinding<Feature>(gatkIntervalFile.getAbsolutePath()));
|
||||
|
||||
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser);
|
||||
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser);
|
||||
}
|
||||
|
||||
private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception {
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util
|
|||
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.utils.io.FileExtension
|
||||
import java.util.Date
|
||||
import java.net.URL
|
||||
|
||||
/**
|
||||
* An extension of java.io.File that can be pulled from or pushed to a remote location.
|
||||
|
|
@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension {
|
|||
def pullToLocal()
|
||||
def pushToRemote()
|
||||
def deleteRemote()
|
||||
def createUrl(expiration: Date): URL
|
||||
def remoteDescription: String
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue