Feature request from Tim that could be useful to all: there's now an --interval_padding argument that specifies how many basepairs to add to each of the intervals provided with -L (on both ends). This is particularly useful when trying to run over the exome plus flanks and don't want to have to pre-compute the flanks (just use e.g. --interval_padding 50). Added integration test to cover this feature.

This commit is contained in:
Eric Banks 2012-06-18 21:36:27 -04:00
parent 4393adf9e7
commit 62cee2fb5b
6 changed files with 91 additions and 13 deletions

View File

@ -29,17 +29,14 @@ import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureReader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
/**
@ -73,7 +70,11 @@ public final class IntervalBinding<T extends Feature> {
return stringIntervals;
}
public List<GenomeLoc> getIntervals(GenomeAnalysisEngine toolkit) {
public List<GenomeLoc> getIntervals(final GenomeAnalysisEngine toolkit) {
return getIntervals(toolkit.getGenomeLocParser());
}
public List<GenomeLoc> getIntervals(final GenomeLocParser genomeLocParser) {
List<GenomeLoc> intervals;
if ( featureIntervals != null ) {
@ -83,17 +84,17 @@ public final class IntervalBinding<T extends Feature> {
final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
if ( codec instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser);
try {
FeatureReader<Feature> reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false);
for ( Feature feature : reader.iterator() )
intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(feature));
intervals.add(genomeLocParser.createGenomeLoc(feature));
} catch (Exception e) {
throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e);
}
} else {
intervals = IntervalUtils.parseIntervalArguments(toolkit.getGenomeLocParser(), stringIntervals);
intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals);
}
return intervals;

View File

@ -592,7 +592,7 @@ public class GenomeAnalysisEngine {
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
loadIntervals(argCollection.intervals, argCollection.intervalSetRule));
loadIntervals(argCollection.intervals, argCollection.intervalSetRule, argCollection.intervalPadding));
// if no exclude arguments, can return parseIntervalArguments directly
if ( argCollection.excludeIntervals == null )
@ -619,16 +619,31 @@ public class GenomeAnalysisEngine {
* @param rule interval merging rule
* @return A sorted, merged list of all intervals specified in this arg list.
*/
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
protected GenomeLocSortedSet loadIntervals( final List<IntervalBinding<Feature>> argList, final IntervalSetRule rule ) {
return loadIntervals(argList, rule, 0);
}
/**
* Loads the intervals relevant to the current execution
* @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above
* @param rule interval merging rule
* @param padding how much to pad the intervals
* @return A sorted, merged list of all intervals specified in this arg list.
*/
protected GenomeLocSortedSet loadIntervals( final List<IntervalBinding<Feature>> argList, final IntervalSetRule rule, final int padding ) {
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
for ( IntervalBinding intervalBinding : argList ) {
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
List<GenomeLoc> intervals = intervalBinding.getIntervals(this.getGenomeLocParser());
if ( intervals.isEmpty() ) {
logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
}
if ( padding > 0 ) {
intervals = IntervalUtils.getIntervalsWithFlanks(this.getGenomeLocParser(), intervals, padding);
}
allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule);
}

View File

@ -104,6 +104,12 @@ public class GATKArgumentCollection {
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
/**
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
*/
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
public int intervalPadding = 0;
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;

View File

@ -587,6 +587,27 @@ public final class GenomeLocParser {
return createGenomeLoc(contigName, contigIndex, start, stop, true);
}
/**
* Creates a loc padded in both directions by maxBasePairs size (if possible).
* @param loc The original loc
* @param padding The number of base pairs to pad on either end
* @return The contiguous loc of length up to the original length + 2*padding (depending on the start/end of the contig).
*/
@Requires({"loc != null", "padding > 0"})
public GenomeLoc createPaddedGenomeLoc(final GenomeLoc loc, final int padding) {
if (GenomeLoc.isUnmapped(loc))
return loc;
final String contigName = loc.getContig();
final SAMSequenceRecord contig = contigInfo.getSequence(contigName);
final int contigIndex = contig.getSequenceIndex();
final int contigLength = contig.getSequenceLength();
final int start = Math.max(1, loc.getStart() - padding);
final int stop = Math.min(contigLength, loc.getStop() + padding);
return createGenomeLoc(contigName, contigIndex, start, stop, true);
}
/**
* Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
* @param loc The original loc

View File

@ -648,6 +648,26 @@ public class IntervalUtils {
return expanded;
}
/**
* Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs.
* @param parser A genome loc parser for creating the new intervals
* @param locs Original genome locs
* @param basePairs Number of base pairs on each side of loc
* @return The list of intervals between the locs
*/
public static List<GenomeLoc> getIntervalsWithFlanks(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
if (locs.size() == 0)
return Collections.emptyList();
final List<GenomeLoc> expanded = new ArrayList<GenomeLoc>();
for ( final GenomeLoc loc : locs ) {
expanded.add(parser.createPaddedGenomeLoc(loc, basePairs));
}
return sortAndMergeIntervals(parser, expanded, IntervalMergingRule.ALL).toList();
}
private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(List<GenomeLoc> sorted) {
LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<String, List<GenomeLoc>>();
GenomeLoc last = null;

View File

@ -207,11 +207,26 @@ public class IntervalIntegrationTest extends WalkerTest {
" -R " + hg18Reference +
" -o %s" +
" -L " + validationDataLocation + "intervalTest.3.vcf",
1, // just one output file
Arrays.asList(md5));
1, // just one output file
Arrays.asList(md5));
executeTest("testComplexVCF", spec);
}
@Test(enabled = true)
public void testComplexVCFWithPadding() {
String md5 = "649ee93d50739c656e94ec88a32c7ffe";
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CountLoci" +
" --interval_padding 2" +
" -I " + validationDataLocation + "OV-0930.normal.chunk.bam" +
" -R " + hg18Reference +
" -o %s" +
" -L " + validationDataLocation + "intervalTest.3.vcf",
1, // just one output file
Arrays.asList(md5));
executeTest("testComplexVCFWithPadding", spec);
}
@Test(enabled = true)
public void testMergingWithComplexVCF() {
String md5 = "6d7fce9fee471194aa8b5b6e47267f03";