Merge branch 'master' of github.com:broadinstitute/gsa-unstable

This commit is contained in:
Eric Banks 2013-01-16 14:59:11 -05:00
commit e47a389b26
8 changed files with 166 additions and 91 deletions

View File

@ -57,6 +57,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.vcf.*; import org.broadinstitute.variant.vcf.*;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
@ -180,18 +182,47 @@ public class SelectHeaders extends RodWalker<Integer, Integer> implements TreeRe
headerLines = new LinkedHashSet<VCFHeaderLine>(getSelectedHeaders(headerLines)); headerLines = new LinkedHashSet<VCFHeaderLine>(getSelectedHeaders(headerLines));
// Optionally add in the intervals. // Optionally add in the intervals.
if (includeIntervals && getToolkit().getArguments().intervals != null) { if (includeIntervals) {
for (IntervalBinding<Feature> intervalBinding : getToolkit().getArguments().intervals) { IntervalArgumentCollection intervalArguments = getToolkit().getArguments().intervalArguments;
String source = intervalBinding.getSource(); if (intervalArguments.intervals != null) {
if (source == null) for (IntervalBinding<Feature> intervalBinding : intervalArguments.intervals) {
continue; String source = intervalBinding.getSource();
File file = new File(source); if (source == null)
if (file.exists()) { continue;
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName()))); File file = new File(source);
} else { if (file.exists()) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source)); headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
} else {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVALS_KEY, source));
}
} }
} }
if (intervalArguments.excludeIntervals != null) {
for (IntervalBinding<Feature> intervalBinding : intervalArguments.excludeIntervals) {
String source = intervalBinding.getSource();
if (source == null)
continue;
File file = new File(source);
if (file.exists()) {
headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, FilenameUtils.getBaseName(file.getName())));
} else {
headerLines.add(new VCFHeaderLine(VCFHeader.EXCLUDE_INTERVALS_KEY, source));
}
}
}
if (intervalArguments.intervalMerging != IntervalMergingRule.ALL) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_MERGING_KEY, String.valueOf(intervalArguments.intervalMerging)));
}
if (intervalArguments.intervalSetRule != IntervalSetRule.UNION) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_SET_RULE_KEY, String.valueOf(intervalArguments.intervalSetRule)));
}
if (intervalArguments.intervalPadding != 0) {
headerLines.add(new VCFHeaderLine(VCFHeader.INTERVAL_PADDING_KEY, String.valueOf(intervalArguments.intervalPadding)));
}
} }
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.commandline;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.util.List;
public class IntervalArgumentCollection {
/**
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
*/
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
/**
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
*/
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> excludeIntervals = null;
/**
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
*/
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
/**
* Should abutting (but not overlapping) intervals be treated as separate intervals?
*/
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
/**
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
*/
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
public int intervalPadding = 0;
}

View File

@ -55,7 +55,6 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.IntervalUtils;
@ -361,7 +360,6 @@ public class GenomeAnalysisEngine {
* Returns a list of active, initialized read transformers * Returns a list of active, initialized read transformers
* *
* @param walker the walker we need to apply read transformers too * @param walker the walker we need to apply read transformers too
* @return a non-null list of read transformers
*/ */
public void initializeReadTransformers(final Walker walker) { public void initializeReadTransformers(final Walker walker) {
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>(); final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
@ -672,41 +670,7 @@ public class GenomeAnalysisEngine {
* Setup the intervals to be processed * Setup the intervals to be processed
*/ */
protected void initializeIntervals() { protected void initializeIntervals() {
// return if no interval arguments at all intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments);
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
return;
// Note that the use of '-L all' is no longer supported.
// if include argument isn't given, create new set of all possible intervals
final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
this.referenceDataSource,
argCollection.intervals,
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
argCollection.excludeIntervals);
final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
// if no exclude arguments, can return parseIntervalArguments directly
if ( excludeSortedSet == null )
intervals = includeSortedSet;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
final long toPruneSize = includeSortedSet.coveredSize();
final long toExcludeSize = excludeSortedSet.coveredSize();
final long intervalSize = intervals.coveredSize();
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
}
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
} }
/** /**

View File

@ -26,11 +26,7 @@
package org.broadinstitute.sting.gatk.arguments; package org.broadinstitute.sting.gatk.arguments;
import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader;
import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
@ -38,8 +34,6 @@ import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
@ -100,41 +94,8 @@ public class GATKArgumentCollection {
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List<String> readFilters = new ArrayList<String>(); public List<String> readFilters = new ArrayList<String>();
/** @ArgumentCollection
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection();
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
*/
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> intervals = null;
/**
* Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times.
* One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals).
* Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf).
*/
@Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
public List<IntervalBinding<Feature>> excludeIntervals = null;
/**
* How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions
* for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION).
*/
@Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false)
public IntervalSetRule intervalSetRule = IntervalSetRule.UNION;
/**
* Should abutting (but not overlapping) intervals be treated as separate intervals?
*/
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
/**
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
*/
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
public int intervalPadding = 0;
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null; public File referenceFile = null;

View File

@ -32,6 +32,7 @@ import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.IntervalArgumentCollection;
import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
@ -534,6 +535,47 @@ public class IntervalUtils {
} }
} }
public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) {
GenomeLocSortedSet intervals = null;
// return if no interval arguments at all
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
return intervals;
// Note that the use of '-L all' is no longer supported.
// if include argument isn't given, create new set of all possible intervals
final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
referenceDataSource,
argCollection.intervals,
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
argCollection.excludeIntervals);
final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
// if no exclude arguments, can return parseIntervalArguments directly
if ( excludeSortedSet == null )
intervals = includeSortedSet;
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
final long toPruneSize = includeSortedSet.coveredSize();
final long toExcludeSize = excludeSortedSet.coveredSize();
final long intervalSize = intervals.coveredSize();
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
}
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
return intervals;
}
public static Pair<GenomeLocSortedSet, GenomeLocSortedSet> parseIntervalBindingsPair( public static Pair<GenomeLocSortedSet, GenomeLocSortedSet> parseIntervalBindingsPair(
final ReferenceDataSource referenceDataSource, final ReferenceDataSource referenceDataSource,
final List<IntervalBinding<Feature>> intervals, final List<IntervalBinding<Feature>> intervals,

View File

@ -73,6 +73,10 @@ public class VCFHeader {
public static final String REFERENCE_KEY = "reference"; public static final String REFERENCE_KEY = "reference";
public static final String CONTIG_KEY = "contig"; public static final String CONTIG_KEY = "contig";
public static final String INTERVALS_KEY = "intervals"; public static final String INTERVALS_KEY = "intervals";
public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
public static final String INTERVAL_MERGING_KEY = "interval_merging";
public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
public static final String INTERVAL_PADDING_KEY = "interval_padding";
// were the input samples sorted originally (or are we sorting them)? // were the input samples sorted originally (or are we sorting them)?
private boolean samplesWereAlreadySorted = true; private boolean samplesWereAlreadySorted = true;

View File

@ -1068,7 +1068,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1); List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1);
intervalArgs.add(new IntervalBinding<Feature>(picardIntervalFile.getAbsolutePath())); intervalArgs.add(new IntervalBinding<Feature>(picardIntervalFile.getAbsolutePath()));
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser);
} }
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
@ -1081,7 +1081,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1); List<IntervalBinding<Feature>> intervalArgs = new ArrayList<IntervalBinding<Feature>>(1);
intervalArgs.add(new IntervalBinding<Feature>(gatkIntervalFile.getAbsolutePath())); intervalArgs.add(new IntervalBinding<Feature>(gatkIntervalFile.getAbsolutePath()));
IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, genomeLocParser); IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser);
} }
private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception {

View File

@ -27,6 +27,8 @@ package org.broadinstitute.sting.queue.util
import java.io.File import java.io.File
import org.broadinstitute.sting.utils.io.FileExtension import org.broadinstitute.sting.utils.io.FileExtension
import java.util.Date
import java.net.URL
/** /**
* An extension of java.io.File that can be pulled from or pushed to a remote location. * An extension of java.io.File that can be pulled from or pushed to a remote location.
@ -35,5 +37,6 @@ trait RemoteFile extends File with FileExtension {
def pullToLocal() def pullToLocal()
def pushToRemote() def pushToRemote()
def deleteRemote() def deleteRemote()
def createUrl(expiration: Date): URL
def remoteDescription: String def remoteDescription: String
} }