Merge branch 'master' of github.com:broadinstitute/gsa-unstable

This commit is contained in:
Eric Banks 2013-01-16 14:59:11 -05:00
commit e47a389b26
8 changed files with 166 additions and 91 deletions

View File

@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.vcf.*;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker<Integer, Integer> implements TreeRe
headerLines = new LinkedHashSet<VCFHeaderLine>(getSelectedHeaders(headerLines));
// Optionally add in the intervals.
if (includeIntervals && getToolkit().getArguments().intervals != null) {
for (IntervalBinding<Feature> intervalBinding : getToolkit().getArguments().intervals) {
String source = intervalBinding.getSource();
if (source == null)
continue;
File file = new File(source);
if (file.exists()) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
} else {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source));
if (includeIntervals) {
IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments;
if (intervalArguments.intervals != null) {
for (IntervalBinding<Feature> intervalBinding : intervalArguments.intervals) {
String source = intervalBinding.getSource();
if (source == null)
continue;
File file = new File(source);
if (file.exists()) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
} else {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source));
}
}
}
if (intervalArguments.excludeIntervals != null) {
for (IntervalBinding<Feature> intervalBinding : intervalArguments.excludeIntervals) {
String source = intervalBinding.getSource();
if (source == null)
continue;
File file = new File(source);
if (file.exists()) {
headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
} else {
headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source));
}
}
}
if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging)));
}
if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule)));
}
if (intervalArguments.intervalPadding != 0) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding)));
}
}
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.commandline;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.util.List;
/**
* Groups the command-line arguments that describe which genomic intervals to operate over:
* the intervals to include (-L), the intervals to exclude (-XL), and the set rule, merging rule
* and padding that control how those inputs are combined.
*
* All fields are public and carry {@code @Input} / {@code @Argument} annotations. Both interval
* lists default to {@code null}; the remaining fields default to {@link IntervalSetRule#UNION},
* {@link IntervalMergingRule#ALL} and a padding of 0.
*/
public class IntervalArgumentCollection {
/**
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
*/
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
/**
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
*/
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> excludeIntervals = null;
/**
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
* Defaults to UNION.
*/
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
/**
* Should abutting (but not overlapping) intervals be treated as separate intervals?
* Defaults to ALL.
*/
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
/**
* How many basepairs of padding to include around each of the intervals specified with -L/--intervals. Defaults to 0.
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
*/
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
public int intervalPadding = 0;
}

View File

@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
@ -361,7 +360,6 @@ public class GenomeAnalysisEngine {
* Returns a list of active, initialized read transformers
*
* @param walker the walker we need to apply read transformers to
* @return a non-null list of read transformers
*/
public void initializeReadTransformers(final Walker walker) {
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
@ -672,41 +670,7 @@ public class GenomeAnalysisEngine {
* Setup the intervals to be processed
*/
protected void initializeIntervals() {
// return if no interval arguments at all
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
return;
// Note that the use of '-L all' is no longer supported.
// if include argument isn't given, create new set of all possible intervals
final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
this.referenceDataSource,
argCollection.intervals,
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
argCollection.excludeIntervals);
final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
// if no exclude arguments, can return parseIntervalArguments directly
if ( excludeSortedSet == null )
intervals = includeSortedSet;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
final long toPruneSize = includeSortedSet.coveredSize();
final long toExcludeSize = excludeSortedSet.coveredSize();
final long intervalSize = intervals.coveredSize();
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
}
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments);
}
/**

View File

@ -26,11 +26,7 @@
package org.broadinstitute.sting.gatk.arguments;
import net.sf.samtools.SAMFileReader;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.io.File;
import java.util.ArrayList;
@ -100,41 +94,8 @@ public class GATKArgumentCollection {
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List<String> readFilters = new ArrayList<String>();
/**
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
*/
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
/**
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
*/
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> excludeIntervals = null;
/**
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
*/
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
/**
* Should abutting (but not overlapping) intervals be treated as separate intervals?
*/
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
/**
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
*/
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
public int intervalPadding = 0;
@ArgumentCollection
public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection();
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;

View File

@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.IntervalArgumentCollection;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -534,6 +535,47 @@ public class IntervalUtils {
}
}
/**
 * Builds the set of intervals to process from the interval-related command-line arguments.
 *
 * The include and exclude bindings are parsed together via
 * {@link #parseIntervalBindingsPair}; when an exclude set comes back, it is subtracted from
 * the include set and the size of the reduction is logged. The covered size of the final
 * set is always logged before returning.
 *
 * Note that the use of '-L all' is no longer supported.
 *
 * @param referenceDataSource reference passed through to the interval binding parser
 * @param argCollection       the interval arguments (include/exclude bindings, set rule, merging rule, padding)
 * @return the resulting interval set, or null when neither include nor exclude intervals were given
 */
public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) {
    // no interval arguments at all => nothing to parse
    final boolean noIncludes = argCollection.intervals == null;
    final boolean noExcludes = argCollection.excludeIntervals == null;
    if ( noIncludes && noExcludes )
        return null;

    // parse the include and exclude bindings in a single pass
    final Pair<GenomeLocSortedSet, GenomeLocSortedSet> parsedPair = IntervalUtils.parseIntervalBindingsPair(
            referenceDataSource,
            argCollection.intervals,
            argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
            argCollection.excludeIntervals);
    final GenomeLocSortedSet included = parsedPair.getFirst();
    final GenomeLocSortedSet excluded = parsedPair.getSecond();

    final GenomeLocSortedSet result;
    if ( excluded != null ) {
        // exclude arguments were given => remove the excluded regions from the include set
        result = included.subtractRegions(excluded);

        // these messages are only printed when exclude (-XL) arguments are given
        final long includedSize = included.coveredSize();
        final long excludedSize = excluded.coveredSize();
        final long remainingSize = result.coveredSize();
        final long removedSize = includedSize - remainingSize;
        logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", includedSize, excludedSize));
        logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
                removedSize, removedSize / (0.01 * includedSize)));
    } else {
        // no exclude arguments => the include set is the answer as-is
        result = included;
    }

    logger.info(String.format("Processing %d bp from intervals", result.coveredSize()));
    return result;
}
public static Pair<GenomeLocSortedSet, GenomeLocSortedSet> parseIntervalBindingsPair(
final ReferenceDataSource referenceDataSource,
final List<IntervalBinding<Feature>> intervals,

View File

@ -73,6 +73,10 @@ public class VCFHeader {
public static final String REFERENCE_KEY = "reference";
public static final String CONTIG_KEY = "contig";
public static final String INTERVALS_KEY = "intervals";
public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
public static final String INTERVAL_MERGING_KEY = "interval_merging";
public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
public static final String INTERVAL_PADDING_KEY = "interval_padding";
// were the input samples sorted originally (or are we sorting them)?
private boolean samplesWereAlreadySorted = true;

View File

@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1);
intervalArgs.add(new IntervalBinding<Feature>(picardIntervalFile.getAbsolutePath()));
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser);
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser);
}
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1);
intervalArgs.add(new IntervalBinding<Feature>(gatkIntervalFile.getAbsolutePath()));
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser);
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser);
}
private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception {

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util
import java.io.File
import org.broadinstitute.sting.utils.io.FileExtension
import java.util.Date
import java.net.URL
/**
* An extension of java.io.File that can be pulled from or pushed to a remote location.
@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension {
/** Abstract, parameterless operation. NOTE(review): presumably copies the remote file to the local path -- confirm against implementations. */
def pullToLocal()
/** Abstract, parameterless operation. NOTE(review): presumably uploads the local file to the remote location -- confirm against implementations. */
def pushToRemote()
/** Abstract, parameterless operation. NOTE(review): presumably removes the remote copy -- confirm against implementations. */
def deleteRemote()
/**
 * Abstract; takes an expiration date and returns a URL.
 * NOTE(review): presumably the URL addresses the remote copy and stops working after `expiration` -- confirm.
 */
def createUrl(expiration: Date): URL
/** Abstract String member. NOTE(review): presumably a human-readable description of the remote location -- confirm. */
def remoteDescription: String
}