providing a way to specify how you'd like -BTI combined with your -L options; set BTIMR to either UNION (default) or INTERSECTION.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3983 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-08-09 14:00:52 +00:00
parent f3883585d0
commit 30178c05c5
7 changed files with 172 additions and 14 deletions

View File

@ -227,16 +227,14 @@ public class GenomeAnalysisEngine {
* @param additionalIntervals a list of additional intervals to add to the returned set. Can be null.
* @return A sorted, merged list of all intervals specified in this arg list.
*/
private GenomeLocSortedSet loadIntervals(List<String> argList, IntervalMergingRule mergingRule, List<GenomeLoc> additionalIntervals) {
List<GenomeLoc> rawIntervals = (additionalIntervals == null) ? new ArrayList<GenomeLoc>() : additionalIntervals; // running list of raw GenomeLocs
private GenomeLocSortedSet loadIntervals(List<String> argList,
IntervalMergingRule mergingRule,
List<GenomeLoc> additionalIntervals) {
rawIntervals.addAll(IntervalUtils.parseIntervalArguments(argList));
// redundant check => default no arguments is null, not empty list
if (rawIntervals.size() == 0)
return null;
return IntervalUtils.sortAndMergeIntervals(rawIntervals,mergingRule);
return IntervalUtils.sortAndMergeIntervals(IntervalUtils.mergeListsBySetOperator(additionalIntervals,
IntervalUtils.parseIntervalArguments(argList),
argCollection.BTIMergeRule),
mergingRule);
}
/**

View File

@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.simpleframework.xml.*;
import org.simpleframework.xml.core.Persister;
import org.simpleframework.xml.stream.Format;
@ -92,7 +93,11 @@ public class GATKArgumentCollection {
@Element(required = false)
@Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false)
public String RODToInterval = null;
public String RODToInterval = null;
@Element(required = false)
@Argument(fullName = "BTI_merge_rule", shortName = "BTIMR", doc = "Indicates the merging approach the interval parser should use to combine the BTI track with other -L options", required = false)
public IntervalSetRule BTIMergeRule = IntervalSetRule.UNION;
@Element(required = false)
@Argument(fullName = "DBSNP", shortName = "D", doc = "DBSNP file", required = false)
@ -351,6 +356,9 @@ public class GATKArgumentCollection {
(other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) {
return false;
}
if (BTIMergeRule != other.BTIMergeRule)
return false;
// if (other.enableRodWalkers != this.enableRodWalkers) {
// return false;
// }

View File

@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.util.CloseableIterator;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureSource;
import org.broad.tribble.source.BasicFeatureSource;
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
@ -51,20 +52,23 @@ public class TribbleTrack extends RMDTrack implements QueryableTrack {
// our sequence dictionary, which can be null
private final SAMSequenceDictionary dictionary;
// our codec type
private final FeatureCodec codec;
/**
* Create a track
*
* @param type the type of track, used for track lookup
* @param recordType the type of record we produce
* @param name the name of this specific track
* @param file the associated file, for reference or recreating the reader
* @param reader the feature reader to use as the underlying data source
* @param dict the sam sequence dictionary
* @param codec the feature codec we use to decode this type
*/
public TribbleTrack(Class type, Class recordType, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict) {
super(type, recordType, name, file);
public TribbleTrack(Class type, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict, FeatureCodec codec) {
super(type, codec.getFeatureType(), name, file);
this.reader = reader;
this.dictionary = dict;
this.codec = codec;
}
/**
@ -135,4 +139,8 @@ public class TribbleTrack extends RMDTrack implements QueryableTrack {
public Object getHeader() {
return reader.getHeader();
}
public FeatureCodec getCodec() {
return codec;
}
}

View File

@ -105,7 +105,7 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
// return a feature reader track
Pair<BasicFeatureSource, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile);
return new TribbleTrack(targetClass, createCodec(targetClass, name).getFeatureType(), name, inputFile, pair.first, pair.second);
return new TribbleTrack(targetClass, name, inputFile, pair.first, pair.second, createCodec(targetClass, name));
}
public Pair<BasicFeatureSource, SAMSequenceDictionary> createFeatureReader(Class targetClass, File inputFile) {

View File

@ -0,0 +1,9 @@
package org.broadinstitute.sting.utils.interval;
/**
* set operators for combining lists of intervals
*/
public enum IntervalSetRule {
UNION,
INTERSECTION;
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.LinkedList;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
@ -69,6 +70,49 @@ public class IntervalUtils {
return rawIntervals;
}
/**
* merge two interval lists, using an interval set rule
* @param setOne a list of genomeLocs, in order (cannot be NULL)
* @param setTwo a list of genomeLocs, also in order (cannot be NULL)
* @param rule the rule to use for merging, i.e. union, intersection, etc
* @return a list, correctly merged using the specified rule
*/
public static List<GenomeLoc> mergeListsBySetOperator(List<GenomeLoc> setOne, List<GenomeLoc> setTwo, IntervalSetRule rule) {
// shortcut, if either set is zero, return the other set
if (setOne.size() == 0 || setTwo.size() == 0) return (setOne.size() == 0) ? setTwo : setOne;
// if we're set to UNION, just add them all
if (rule == IntervalSetRule.UNION) {
setOne.addAll(setTwo);
return setOne;
}
// else we're INTERSECTION, create two indexes into the lists
int iOne = 0;
int iTwo = 0;
// our master list, since we can't guarantee removal time in a generic list
LinkedList<GenomeLoc> retList = new LinkedList<GenomeLoc>();
// merge the second into the first using the rule
while (iTwo < setTwo.size() && iOne < setOne.size())
// if the first list is ahead, drop items off the second until we overlap
if (setTwo.get(iTwo).isBefore(setOne.get(iOne)))
iTwo++;
// if the second is ahead, drop intervals off the first until we overlap
else if (setOne.get(iOne).isBefore(setTwo.get(iTwo)))
iOne++;
// we overlap, intersect the two intervals and add the result. Then remove the interval that ends first.
else {
retList.add(setOne.get(iOne).intersect(setTwo.get(iTwo)));
if (setOne.get(iOne).getStop() < setTwo.get(iTwo).getStop()) iOne++;
else iTwo++;
}
// we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have
return retList;
}
/**
* Sorts and merges an interval list. Multiple techniques are available for merging: ALL, which combines
* all overlapping and abutting intervals into an interval that spans the union of all covered bases, and

View File

@ -0,0 +1,91 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
* test out the interval utility methods
*/
public class IntervalUtilsTest extends BaseTest {
// used to seed the genome loc parser with a sequence dictionary
private static ReferenceSequenceFile seq;
@BeforeClass
public static void init() throws FileNotFoundException {
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
GenomeLocParser.setupRefContigOrdering(seq);
}
@Test
public void testMergeListsBySetOperatorNoOverlap() {
// a couple of lists we'll use for the testing
List<GenomeLoc> listEveryTwoFromOne = new ArrayList<GenomeLoc>();
List<GenomeLoc> listEveryTwoFromTwo = new ArrayList<GenomeLoc>();
// create the two lists we'll use
for (int x = 1; x < 101; x++) {
if (x % 2 == 0)
listEveryTwoFromTwo.add(GenomeLocParser.createGenomeLoc("chr1",x,x));
else
listEveryTwoFromOne.add(GenomeLocParser.createGenomeLoc("chr1",x,x));
}
List<GenomeLoc> ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION);
Assert.assertEquals(100,ret.size());
ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION);
Assert.assertEquals(0,ret.size());
}
@Test
public void testMergeListsBySetOperatorAllOverlap() {
// a couple of lists we'll use for the testing
List<GenomeLoc> allSites = new ArrayList<GenomeLoc>();
List<GenomeLoc> listEveryTwoFromTwo = new ArrayList<GenomeLoc>();
// create the two lists we'll use
for (int x = 1; x < 101; x++) {
if (x % 2 == 0)
listEveryTwoFromTwo.add(GenomeLocParser.createGenomeLoc("chr1",x,x));
allSites.add(GenomeLocParser.createGenomeLoc("chr1",x,x));
}
List<GenomeLoc> ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION);
Assert.assertEquals(150,ret.size());
ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION);
Assert.assertEquals(50,ret.size());
}
@Test
public void testMergeListsBySetOperator() {
// a couple of lists we'll use for the testing
List<GenomeLoc> allSites = new ArrayList<GenomeLoc>();
List<GenomeLoc> listEveryTwoFromTwo = new ArrayList<GenomeLoc>();
// create the two lists we'll use
for (int x = 1; x < 101; x++) {
if (x % 5 == 0) {
listEveryTwoFromTwo.add(GenomeLocParser.createGenomeLoc("chr1",x,x));
allSites.add(GenomeLocParser.createGenomeLoc("chr1",x,x));
}
}
List<GenomeLoc> ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION);
Assert.assertEquals(40,ret.size());
ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION);
Assert.assertEquals(20,ret.size());
}
}