diff --git a/ivy.xml b/ivy.xml
index 96c1de844..ee24bc367 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -76,7 +76,7 @@
-
+
diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
index e10bcbaa0..8cba183da 100644
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
@@ -554,4 +554,54 @@ public class GenomeLocParser {
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
}
+ /**
+ * Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size.
+ * @param loc The original loc
+ * @param maxBasePairs The maximum number of basePairs
+ * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
+ */
+ @Requires({"loc != null", "maxBasePairs > 0"})
+ public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
+ if (GenomeLoc.isUnmapped(loc))
+ return null;
+ String contigName = loc.getContig();
+ SAMSequenceRecord contig = contigInfo.getSequence(contigName);
+ int contigIndex = contig.getSequenceIndex();
+
+ int start = loc.getStart() - maxBasePairs;
+ int stop = loc.getStart() - 1;
+
+ if (start < 1)
+ start = 1;
+ if (stop < 1)
+ return null;
+
+ return createGenomeLoc(contigName, contigIndex, start, stop, true);
+ }
+
+ /**
+ * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
+ * @param loc The original loc
+ * @param maxBasePairs The maximum number of basePairs
+ * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
+ */
+ @Requires({"loc != null", "maxBasePairs > 0"})
+ public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
+ if (GenomeLoc.isUnmapped(loc))
+ return null;
+ String contigName = loc.getContig();
+ SAMSequenceRecord contig = contigInfo.getSequence(contigName);
+ int contigIndex = contig.getSequenceIndex();
+ int contigLength = contig.getSequenceLength();
+
+ int start = loc.getStop() + 1;
+ int stop = loc.getStop() + maxBasePairs;
+
+ if (start > contigLength)
+ return null;
+ if (stop > contigLength)
+ stop = contigLength;
+
+ return createGenomeLoc(contigName, contigIndex, start, stop, true);
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
index f0e164c87..159b145a0 100644
--- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
@@ -233,8 +233,12 @@ public class IntervalUtils {
*
* Returns a null string if there are no differences, otherwise returns a string describing the difference
* (useful for UnitTests). Assumes both lists are sorted
+ *
+ * @param masterArg sorted master genome locs
+ * @param testArg sorted test genome locs
+ * @return null string if there are no difference, otherwise a string describing the difference
*/
- public static final String equateIntervals(List masterArg, List testArg) {
+ public static String equateIntervals(List masterArg, List testArg) {
LinkedList master = new LinkedList(masterArg);
LinkedList test = new LinkedList(testArg);
@@ -317,23 +321,6 @@ public class IntervalUtils {
return lengths;
}
- /**
- * Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
- * @param locs The genome locs.
- * @return The maximum number of parts the intervals can be split into.
- */
- public static int countContigIntervals(List locs) {
- int maxFiles = 0;
- String contig = null;
- for (GenomeLoc loc: locs) {
- if (contig == null || !contig.equals(loc.getContig())) {
- maxFiles++;
- contig = loc.getContig();
- }
- }
- return maxFiles;
- }
-
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
@@ -373,7 +360,6 @@ public class IntervalUtils {
* @return A list of lists of genome locs, split according to splits
*/
public static List> splitIntervalsToSubLists(List locs, List splits) {
- int locIndex = 1;
int start = 0;
List> sublists = new ArrayList>(splits.size());
for (Integer stop: splits) {
@@ -465,7 +451,7 @@ public class IntervalUtils {
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
@Ensures({"result != null"})
- final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) {
+ static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) {
final List split = new ArrayList();
long size = 0;
@@ -579,10 +565,101 @@ public class IntervalUtils {
}
}
- public static final long intervalSize(final List locs) {
+ public static long intervalSize(final List locs) {
long size = 0;
for ( final GenomeLoc loc : locs )
size += loc.size();
return size;
}
+
+ public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
+ ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
+ GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
+ List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());
+
+ if (originalList.isEmpty())
+ throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");
+
+ List flankingList = getFlankingIntervals(parser, originalList, basePairs);
+
+ if (flankingList.isEmpty())
+ throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");
+
+ SAMFileHeader samFileHeader = new SAMFileHeader();
+ samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
+ IntervalList intervalList = new IntervalList(samFileHeader);
+ int i = 0;
+ for (GenomeLoc loc: flankingList)
+ intervalList.add(toInterval(loc, ++i));
+ intervalList.write(flankingIntervals);
+ }
+
+ /**
+ * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs.
+ * @param parser A genome loc parser for creating the new intervals
+ * @param locs Original genome locs
+ * @param basePairs Number of base pairs on each side of loc
+ * @return The list of intervals between the locs
+ */
+ public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) {
+ List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList();
+
+ if (sorted.size() == 0)
+ return Collections.emptyList();
+
+ LinkedHashMap> locsByContig = splitByContig(sorted);
+ List expanded = new ArrayList();
+ for (String contig: locsByContig.keySet()) {
+ List contigLocs = locsByContig.get(contig);
+ int contigLocsSize = contigLocs.size();
+
+ GenomeLoc startLoc, stopLoc;
+
+ // Create loc at start of the list
+ startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs);
+ if (startLoc != null)
+ expanded.add(startLoc);
+
+ // Create locs between each loc[i] and loc[i+1]
+ for (int i = 0; i < contigLocsSize - 1; i++) {
+ stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs);
+ startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs);
+ if (stopLoc.getStop() + 1 >= startLoc.getStart()) {
+ // NOTE: This is different than GenomeLoc.merge()
+ // merge() returns a loc which covers the entire range of stop and start,
+ // possibly returning positions inside loc(i) or loc(i+1)
+ // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc
+ GenomeLoc merged = parser.createGenomeLoc(
+ stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop());
+ expanded.add(merged);
+ } else {
+ expanded.add(stopLoc);
+ expanded.add(startLoc);
+ }
+ }
+
+ // Create loc at the end of the list
+ stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs);
+ if (stopLoc != null)
+ expanded.add(stopLoc);
+ }
+ return expanded;
+ }
+
+ private static LinkedHashMap> splitByContig(List sorted) {
+ LinkedHashMap> splits = new LinkedHashMap>();
+ GenomeLoc last = null;
+ List contigLocs = null;
+ for (GenomeLoc loc: sorted) {
+ if (GenomeLoc.isUnmapped(loc))
+ continue;
+ if (last == null || !last.onSameContig(loc)) {
+ contigLocs = new ArrayList();
+ splits.put(loc.getContig(), contigLocs);
+ }
+ contigLocs.add(loc);
+ last = loc;
+ }
+ return splits;
+ }
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
index f1f849bf5..e9f138a0e 100644
--- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java
@@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
@@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testGetContigIndexValid() {
- SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
}
@@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testGetContigInfoKnownContig() {
- SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
}
@@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest {
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
}
+
+ private static class FlankingGenomeLocTestData extends TestDataProvider {
+ final GenomeLocParser parser;
+ final int basePairs;
+ final GenomeLoc original, flankStart, flankStop;
+
+ private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
+ super(FlankingGenomeLocTestData.class, name);
+ this.parser = parser;
+ this.basePairs = basePairs;
+ this.original = parse(parser, original);
+ this.flankStart = flankStart == null ? null : parse(parser, flankStart);
+ this.flankStop = flankStop == null ? null : parse(parser, flankStop);
+ }
+
+ private static GenomeLoc parse(GenomeLocParser parser, String str) {
+ return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str);
+ }
+ }
+
+ @DataProvider(name = "flankingGenomeLocs")
+ public Object[][] getFlankingGenomeLocs() {
+ int contigLength = 10000;
+ SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
+ GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary());
+
+ new FlankingGenomeLocTestData("atStartBase1", parser, 1,
+ "chr1:1", null, "chr1:2");
+
+ new FlankingGenomeLocTestData("atStartBase50", parser, 50,
+ "chr1:1", null, "chr1:2-51");
+
+ new FlankingGenomeLocTestData("atStartRange50", parser, 50,
+ "chr1:1-10", null, "chr1:11-60");
+
+ new FlankingGenomeLocTestData("atEndBase1", parser, 1,
+ "chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
+
+ new FlankingGenomeLocTestData("atEndBase50", parser, 50,
+ "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
+
+ new FlankingGenomeLocTestData("atEndRange50", parser, 50,
+ String.format("chr1:%d-%d", contigLength - 10, contigLength),
+ String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
+ null);
+
+ new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
+ "chr1:2", "chr1:1", "chr1:3");
+
+ new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
+ "chr1:21-30", "chr1:1-20", "chr1:31-80");
+
+ new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
+ "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
+
+ new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
+ String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
+ String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
+ String.format("chr1:%d-%d", contigLength - 20, contigLength));
+
+ new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
+ "chr1:3", "chr1:2", "chr1:4");
+
+ new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
+ "chr1:101-200", "chr1:51-100", "chr1:201-250");
+
+ new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
+ "chr1:" + (contigLength - 3),
+ "chr1:" + (contigLength - 4),
+ "chr1:" + (contigLength - 2));
+
+ new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
+ String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
+ String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
+ String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));
+
+ new FlankingGenomeLocTestData("unmapped", parser, 50,
+ "unmapped", null, null);
+
+ new FlankingGenomeLocTestData("fullContig", parser, 50,
+ "chr1", null, null);
+
+ return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
+ }
+
+ @Test(dataProvider = "flankingGenomeLocs")
+ public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) {
+ GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
+ data.toString(), data.original, actual, data.flankStart);
+ assertEquals(actual, data.flankStart, description);
+ }
+
+ @Test(dataProvider = "flankingGenomeLocs")
+ public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) {
+ GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
+ data.toString(), data.original, actual, data.flankStop);
+ assertEquals(actual, data.flankStop, description);
+ }
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
index 9c3b905c2..03d33d2c5 100644
--- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
@@ -1,8 +1,8 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.reference.ReferenceSequenceFile;
-import net.sf.picard.util.IntervalUtil;
import net.sf.samtools.SAMFileHeader;
+import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest {
List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
Assert.assertEquals(merged.size(), 1);
}
+
+ /*
+ Split into tests that can be written to files and tested by writeFlankingIntervals,
+ and lists that cannot but are still handled by getFlankingIntervals.
+ */
+ private static abstract class FlankingIntervalsTestData extends TestDataProvider {
+ final public File referenceFile;
+ final public GenomeLocParser parser;
+ final int basePairs;
+ final List original;
+ final List expected;
+
+ protected FlankingIntervalsTestData(Class> clazz, String name, File referenceFile, GenomeLocParser parser,
+ int basePairs, List original, List expected) {
+ super(clazz, name);
+ this.referenceFile = referenceFile;
+ this.parser = parser;
+ this.basePairs = basePairs;
+ this.original = parse(parser, original);
+ this.expected = parse(parser, expected);
+ }
+
+ private static List parse(GenomeLocParser parser, List locs) {
+ List parsed = new ArrayList();
+ for (String loc: locs)
+ parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc));
+ return parsed;
+ }
+ }
+
+ private static class FlankingIntervalsFile extends FlankingIntervalsTestData {
+ public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser,
+ int basePairs, List original, List expected) {
+ super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected);
+ }
+ }
+
+ private static class FlankingIntervalsList extends FlankingIntervalsTestData {
+ public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser,
+ int basePairs, List original, List expected) {
+ super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected);
+ }
+ }
+
+ /* Intervals where the original and the flanks can be written to files. */
+ @DataProvider(name = "flankingIntervalsFiles")
+ public Object[][] getFlankingIntervalsFiles() {
+ File hg19ReferenceFile = new File(BaseTest.hg19Reference);
+ int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength();
+
+ new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:1"),
+ Arrays.asList("1:2"));
+
+ new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:1"),
+ Arrays.asList("1:2-51"));
+
+ new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:1-10"),
+ Arrays.asList("1:11-60"));
+
+ new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:" + hg19Length1),
+ Arrays.asList("1:" + (hg19Length1 - 1)));
+
+ new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:" + hg19Length1),
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1)));
+
+ new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)),
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11)));
+
+ new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:2"),
+ Arrays.asList("1:1", "1:3"));
+
+ new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:21-30"),
+ Arrays.asList("1:1-20", "1:31-80"));
+
+ new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:" + (hg19Length1 - 1)),
+ Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1));
+
+ new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)),
+ Arrays.asList(
+ String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31),
+ String.format("1:%d-%d", hg19Length1 - 20, hg19Length1)));
+
+ new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:3"),
+ Arrays.asList("1:2", "1:4"));
+
+ new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200"),
+ Arrays.asList("1:51-100", "1:201-250"));
+
+ new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
+ Arrays.asList("1:" + (hg19Length1 - 3)),
+ Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2)));
+
+ new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)),
+ Arrays.asList(
+ String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201),
+ String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51)));
+
+ new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:401-500"),
+ Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550"));
+
+ new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:301-400"),
+ Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
+
+ new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:271-400"),
+ Arrays.asList("1:51-100", "1:201-270", "1:401-450"));
+
+ new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:221-400"),
+ Arrays.asList("1:51-100", "1:201-220", "1:401-450"));
+
+ new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:121-400"),
+ Arrays.asList("1:51-100", "1:401-450"));
+
+ new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "2:301-400"),
+ Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450"));
+
+ // Explicit testing a problematic agilent target pair
+ new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("2:74756257-74756411", "2:74756487-74756628"),
+ // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679")
+ Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678"));
+
+ return TestDataProvider.getTests(FlankingIntervalsFile.class);
+ }
+
+ /* Intervals where either the original and/or the flanks cannot be written to a file. */
+ @DataProvider(name = "flankingIntervalsLists")
+ public Object[][] getFlankingIntervalsLists() {
+ File hg19ReferenceFile = new File(BaseTest.hg19Reference);
+ List empty = Collections.emptyList();
+
+ new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ empty,
+ empty);
+
+ new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("unmapped"),
+ empty);
+
+ new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1"),
+ empty);
+
+ new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1", "2", "3"),
+ empty);
+
+ new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
+ Arrays.asList("1:101-200", "1:301-400", "unmapped"),
+ Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
+
+ return TestDataProvider.getTests(FlankingIntervalsList.class);
+ }
+
+ @Test(dataProvider = "flankingIntervalsFiles")
+ public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
+ File originalFile = createTempFile("original.", ".intervals");
+ File flankingFile = createTempFile("flanking.", ".intervals");
+ try {
+ List lines = new ArrayList();
+ for (GenomeLoc loc: data.original)
+ lines.add(loc.toString());
+ FileUtils.writeLines(originalFile, lines);
+
+ IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
+
+ List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath());
+
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
+ data.toString(), data.original, actual, data.expected);
+ Assert.assertEquals(actual, data.expected, description);
+ } finally {
+ FileUtils.deleteQuietly(originalFile);
+ FileUtils.deleteQuietly(flankingFile);
+ }
+ }
+
+ @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class)
+ public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
+ File originalFile = createTempFile("original.", ".intervals");
+ File flankingFile = createTempFile("flanking.", ".intervals");
+ try {
+ List lines = new ArrayList();
+ for (GenomeLoc loc: data.original)
+ lines.add(loc.toString());
+ FileUtils.writeLines(originalFile, lines);
+
+ // Should throw a user exception on bad input if either the original
+ // intervals are empty or if the flanking intervals are empty
+ IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
+ } finally {
+ FileUtils.deleteQuietly(originalFile);
+ FileUtils.deleteQuietly(flankingFile);
+ }
+ }
+
+ @Test(dataProvider = "flankingIntervalsLists")
+ public void testGetFlankingIntervals(FlankingIntervalsTestData data) {
+ List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs);
+ String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
+ data.toString(), data.original, actual, data.expected);
+ Assert.assertEquals(actual, data.expected, description);
+ }
}
diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala
new file mode 100644
index 000000000..d90db0de4
--- /dev/null
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.queue.extensions.gatk
+
+import org.broadinstitute.sting.queue.function.InProcessFunction
+import org.broadinstitute.sting.commandline.{Output, Argument, Input}
+import java.io.File
+import org.broadinstitute.sting.utils.interval.IntervalUtils
+
+class WriteFlankingIntervalsFunction extends InProcessFunction {
+ @Input(doc="The reference sequence")
+ var reference : File = _
+
+ @Input(doc="The interval list to flank")
+ var inputIntervals : File = _
+
+ @Output(doc="The output intervals file to write to")
+ var outputIntervals: File = _
+
+ @Argument(doc="Number of base pair to flank the input intervals")
+ var flankSize : Int = _
+
+ def run() {
+ IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize)
+ }
+}
diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
deleted file mode 100755
index 77eb3ccbc..000000000
--- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.broadinstitute.sting.queue.library.ipf.intervals
-
-import org.broadinstitute.sting.queue.function.InProcessFunction
-import org.broadinstitute.sting.commandline._
-import java.io.{PrintStream, File}
-import collection.JavaConversions._
-import org.broadinstitute.sting.utils.text.XReadLines
-import net.sf.picard.reference.FastaSequenceFile
-import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
-import collection.immutable.TreeSet
-
-// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
-class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
- @Input(doc="The interval list to expand") val inList : File = in
- @Input(doc="The reference sequence") val refDict : File = ref
- @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
- @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
- @Output(doc="The output intervals file to write to") val outList : File = out
- @Argument(doc="The output format for the intervals") val outTypeStr = opType
- @Argument(doc="The input format for the intervals") val inTypeStr = ipType
-
- var output : PrintStream = _
- var parser : GenomeLocParser = _
- var xrl : XReadLines = _
- val outType = IntervalFormatType.convert(outTypeStr)
- val inType = IntervalFormatType.convert(inTypeStr)
-
- var offsetIn : Int = 0
- var offsetOut : Int = 0
-
- var first : Boolean = true
- var lastTwo : (GenomeLoc,GenomeLoc) = _
-
- var intervalCache : TreeSet[GenomeLoc] = _
- val LINES_TO_CACHE : Int = 1000
-
- def run = {
- output = new PrintStream(outList)
- intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
- def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
- })
- parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
- xrl = new XReadLines(inList)
- offsetIn = if (isBed(inType)) 1 else 0
- offsetOut = if( isBed(outType)) 1 else 0
- var line : String = xrl.next
- while ( line.startsWith("@") ) {
- line = xrl.next
- }
- var prevLoc: GenomeLoc = null
- var curLoc: GenomeLoc = null
- var nextLoc : GenomeLoc = parseGenomeInterval(line)
- var linesProcessed : Int = 1
- while ( prevLoc != null || curLoc != null || nextLoc != null ) {
- prevLoc = curLoc
- curLoc = nextLoc
- nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
- if ( curLoc != null ) {
- val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc)
- val right: GenomeLoc = refine(expandRight(curLoc),nextLoc)
- if ( left != null ) {
- intervalCache += left
- }
- if ( right != null ) {
- intervalCache += right
- }
- }
- linesProcessed += 1
- if ( linesProcessed % LINES_TO_CACHE == 0 ) {
- val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
- intervalCache = intervalCache -- toPrint
- toPrint.foreach(u => output.print("%s%n".format(repr(u))))
- }
- //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
- }
-
- intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
-
- output.close()
- }
-
- def expandLeft(g: GenomeLoc) : GenomeLoc = {
- parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
- }
-
- def expandRight(g: GenomeLoc) : GenomeLoc = {
- parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
- }
-
- def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
- if ( borderG == null || ! newG.overlapsP(borderG) ) {
- return newG
- } else {
- if ( newG.getStart < borderG.getStart ) {
- if ( borderG.getStart - startInt > newG.getStart ) {
- return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
- }
- } else {
- if ( borderG.getStop + startInt < newG.getStop ){
- return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
- }
- }
- }
-
- null
- }
-
- def repr(loc : GenomeLoc) : String = {
- if ( loc == null ) return "null"
- if ( outType == IntervalFormatType.INTERVALS ) {
- return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
- } else {
- return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
- }
- }
-
- def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
- t == IntervalFormatType.BED
- }
-
- def parseGenomeInterval( s : String ) : GenomeLoc = {
- val sp = s.split("\\s+")
- // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
- if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
- }
-
- object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
- type IntervalFormatType = Value
- val INTERVALS,BED,TDF = Value
-
- def convert(s : String) : IntervalFormatType = {
- if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
- }
- }
-}
\ No newline at end of file
diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala
deleted file mode 100755
index e929477a1..000000000
--- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-package org.broadinstitute.sting.queue.library.ipf.intervals
-
-import org.broadinstitute.sting.queue.function.InProcessFunction
-import collection.JavaConversions._
-import org.broadinstitute.sting.commandline._
-import java.io.{PrintStream, File}
-import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
-import org.broadinstitute.sting.utils.text.XReadLines
-import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
-
-class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
- @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
- @Output(doc="Output interval file to which to write") val output : File = outFile
- @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
- @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed
-
-
- var outStream : PrintStream = _
- var contigs : List[String] = Nil
- var dict : SAMSequenceDictionary = _
- var parser : GenomeLocParser = _
-
- def run = {
- outStream = new PrintStream(output)
- dict = new SAMSequenceDictionary
- // note: memory hog
- val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
- sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
- parser = new GenomeLocParser(dict)
- sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
- outStream.close()
- }
-
- def writeOut(g : GenomeLoc) : Unit = {
- outStream.print("%s%n".format(g.toString))
- }
-
- def parse(s : String) : (String,Int,Int) = {
- if ( s.contains(":") ) {
- val split1 = s.split(":")
- val split2 = split1(1).split("-")
- return (split1(0),split2(0).toInt,split2(1).toInt)
- } else {
- val split = s.split("\\s+")
- return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
- }
- }
-
- def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
- parser.createGenomeLoc(coords._1,coords._2,coords._3)
- }
-
- def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
- if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
- a.init :+ (a.last :+ b)
- } else {
- a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
- }
- }
-
- def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
- if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for
- null
- }
- u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
- }
-
- def range(a: Int, b: Int) : Range = new Range(a,b+1,1)
-
-}
\ No newline at end of file
diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar
old mode 100755
new mode 100644
similarity index 88%
rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.jar
rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar
index bfd06f97f..ee5d02367
Binary files a/settings/repository/net.sf.snpeff/snpeff-2.0.2.jar and b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.jar differ
diff --git a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
similarity index 77%
rename from settings/repository/net.sf.snpeff/snpeff-2.0.2.xml
rename to settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
index f0568def4..5417641d3 100644
--- a/settings/repository/net.sf.snpeff/snpeff-2.0.2.xml
+++ b/settings/repository/net.sf.snpeff/snpeff-2.0.4rc3.xml
@@ -1,3 +1,3 @@
-
+