From c50274e02e8bd381a2f7995a77b22f0ac8e0b00b Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 17 Nov 2011 13:53:46 -0500 Subject: [PATCH] During flanking interval creation merging overlapping flanks so that on scatter the list doesn't accidentally genotype the same site twice. Moved flanking interval utilies to IntervalUtils with UnitTests. --- .../sting/utils/GenomeLocParser.java | 50 ++++ .../sting/utils/interval/IntervalUtils.java | 119 ++++++++-- .../sting/utils/GenomeLocParserUnitTest.java | 104 +++++++- .../utils/interval/IntervalUtilsUnitTest.java | 223 +++++++++++++++++- .../gatk/WriteFlankingIntervalsFunction.scala | 48 ++++ .../ipf/intervals/ExpandIntervals.scala | 135 ----------- .../ipf/intervals/IntersectIntervals.scala | 70 ------ 7 files changed, 519 insertions(+), 230 deletions(-) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala delete mode 100755 public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala delete mode 100755 public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index e10bcbaa0..8cba183da 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -554,4 +554,54 @@ public class GenomeLocParser { return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } + /** + * Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + + int start = loc.getStart() - maxBasePairs; + int stop = loc.getStart() - 1; + + if (start < 1) + start = 1; + if (stop < 1) + return null; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } + + /** + * Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size. + * @param loc The original loc + * @param maxBasePairs The maximum number of basePairs + * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. + */ + @Requires({"loc != null", "maxBasePairs > 0"}) + public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + if (GenomeLoc.isUnmapped(loc)) + return null; + String contigName = loc.getContig(); + SAMSequenceRecord contig = contigInfo.getSequence(contigName); + int contigIndex = contig.getSequenceIndex(); + int contigLength = contig.getSequenceLength(); + + int start = loc.getStop() + 1; + int stop = loc.getStop() + maxBasePairs; + + if (start > contigLength) + return null; + if (stop > contigLength) + stop = contigLength; + + return createGenomeLoc(contigName, contigIndex, start, stop, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f0e164c87..159b145a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -233,8 +233,12 @@ public class IntervalUtils { * * Returns a null string if there are no differences, otherwise returns a string describing the difference * (useful for UnitTests). Assumes both lists are sorted + * + * @param masterArg sorted master genome locs + * @param testArg sorted test genome locs + * @return null string if there are no difference, otherwise a string describing the difference */ - public static final String equateIntervals(List masterArg, List testArg) { + public static String equateIntervals(List masterArg, List testArg) { LinkedList master = new LinkedList(masterArg); LinkedList test = new LinkedList(testArg); @@ -317,23 +321,6 @@ public class IntervalUtils { return lengths; } - /** - * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param locs The genome locs. - * @return The maximum number of parts the intervals can be split into. - */ - public static int countContigIntervals(List locs) { - int maxFiles = 0; - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } - } - return maxFiles; - } - /** * Splits an interval list into multiple files. * @param fileHeader The sam file header. @@ -373,7 +360,6 @@ public class IntervalUtils { * @return A list of lists of genome locs, split according to splits */ public static List> splitIntervalsToSubLists(List locs, List splits) { - int locIndex = 1; int start = 0; List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { @@ -465,7 +451,7 @@ public class IntervalUtils { @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) @Ensures({"result != null"}) - final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { final List split = new ArrayList(); long size = 0; @@ -579,10 +565,101 @@ public class IntervalUtils { } } - public static final long intervalSize(final List locs) { + public static long intervalSize(final List locs) { long size = 0; for ( final GenomeLoc loc : locs ) size += loc.size(); return size; } + + public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference()); + List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); + + if (originalList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); + + List flankingList = getFlankingIntervals(parser, originalList, basePairs); + + if (flankingList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); + + SAMFileHeader samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + IntervalList intervalList = new IntervalList(samFileHeader); + int i = 0; + for (GenomeLoc loc: flankingList) + intervalList.add(toInterval(loc, ++i)); + intervalList.write(flankingIntervals); + } + + /** + * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. + * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { + List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); + + if (sorted.size() == 0) + return Collections.emptyList(); + + LinkedHashMap> locsByContig = splitByContig(sorted); + List expanded = new ArrayList(); + for (String contig: locsByContig.keySet()) { + List contigLocs = locsByContig.get(contig); + int contigLocsSize = contigLocs.size(); + + GenomeLoc startLoc, stopLoc; + + // Create loc at start of the list + startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); + if (startLoc != null) + expanded.add(startLoc); + + // Create locs between each loc[i] and loc[i+1] + for (int i = 0; i < contigLocsSize - 1; i++) { + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); + startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); + if (stopLoc.getStop() + 1 >= startLoc.getStart()) { + // NOTE: This is different than GenomeLoc.merge() + // merge() returns a loc which covers the entire range of stop and start, + // possibly returning positions inside loc(i) or loc(i+1) + // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc + GenomeLoc merged = parser.createGenomeLoc( + stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop()); + expanded.add(merged); + } else { + expanded.add(stopLoc); + expanded.add(startLoc); + } + } + + // Create loc at the end of the list + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); + if (stopLoc != null) + expanded.add(stopLoc); + } + return expanded; + } + + private static LinkedHashMap> splitByContig(List sorted) { + LinkedHashMap> splits = new LinkedHashMap>(); + GenomeLoc last = null; + List contigLocs = null; + for (GenomeLoc loc: sorted) { + if (GenomeLoc.isUnmapped(loc)) + continue; + if (last == null || !last.onSameContig(loc)) { + contigLocs = new ArrayList(); + splits.put(loc.getContig(), contigLocs); + } + contigLocs.add(loc); + last = loc; + } + return splits; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index f1f849bf5..e9f138a0e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; /** @@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigIndexValid() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference } @@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testGetContigInfoKnownContig() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference } @@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end } + + private static class FlankingGenomeLocTestData extends TestDataProvider { + final GenomeLocParser parser; + final int basePairs; + final GenomeLoc original, flankStart, flankStop; + + private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) { + super(FlankingGenomeLocTestData.class, name); + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.flankStart = flankStart == null ? null : parse(parser, flankStart); + this.flankStop = flankStop == null ? null : parse(parser, flankStop); + } + + private static GenomeLoc parse(GenomeLocParser parser, String str) { + return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str); + } + } + + @DataProvider(name = "flankingGenomeLocs") + public Object[][] getFlankingGenomeLocs() { + int contigLength = 10000; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength); + GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary()); + + new FlankingGenomeLocTestData("atStartBase1", parser, 1, + "chr1:1", null, "chr1:2"); + + new FlankingGenomeLocTestData("atStartBase50", parser, 50, + "chr1:1", null, "chr1:2-51"); + + new FlankingGenomeLocTestData("atStartRange50", parser, 50, + "chr1:1-10", null, "chr1:11-60"); + + new FlankingGenomeLocTestData("atEndBase1", parser, 1, + "chr1:" + contigLength, "chr1:" + (contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndBase50", parser, 50, + "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null); + + new FlankingGenomeLocTestData("atEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 10, contigLength), + String.format("chr1:%d-%d", contigLength - 60, contigLength - 11), + null); + + new FlankingGenomeLocTestData("nearStartBase1", parser, 1, + "chr1:2", "chr1:1", "chr1:3"); + + new FlankingGenomeLocTestData("nearStartRange50", parser, 50, + "chr1:21-30", "chr1:1-20", "chr1:31-80"); + + new FlankingGenomeLocTestData("nearEndBase1", parser, 1, + "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength); + + new FlankingGenomeLocTestData("nearEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 30, contigLength - 21), + String.format("chr1:%d-%d", contigLength - 80, contigLength - 31), + String.format("chr1:%d-%d", contigLength - 20, contigLength)); + + new FlankingGenomeLocTestData("beyondStartBase1", parser, 1, + "chr1:3", "chr1:2", "chr1:4"); + + new FlankingGenomeLocTestData("beyondStartRange50", parser, 50, + "chr1:101-200", "chr1:51-100", "chr1:201-250"); + + new FlankingGenomeLocTestData("beyondEndBase1", parser, 1, + "chr1:" + (contigLength - 3), + "chr1:" + (contigLength - 4), + "chr1:" + (contigLength - 2)); + + new FlankingGenomeLocTestData("beyondEndRange50", parser, 50, + String.format("chr1:%d-%d", contigLength - 200, contigLength - 101), + String.format("chr1:%d-%d", contigLength - 250, contigLength - 201), + String.format("chr1:%d-%d", contigLength - 100, contigLength - 51)); + + new FlankingGenomeLocTestData("unmapped", parser, 50, + "unmapped", null, null); + + new FlankingGenomeLocTestData("fullContig", parser, 50, + "chr1", null, null); + + return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStart); + assertEquals(actual, data.flankStart, description); + } + + @Test(dataProvider = "flankingGenomeLocs") + public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) { + GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.flankStop); + assertEquals(actual, data.flankStop, description); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 9c3b905c2..03d33d2c5 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,8 +1,8 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; +import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest { List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); Assert.assertEquals(merged.size(), 1); } + + /* + Split into tests that can be written to files and tested by writeFlankingIntervals, + and lists that cannot but are still handled by getFlankingIntervals. + */ + private static abstract class FlankingIntervalsTestData extends TestDataProvider { + final public File referenceFile; + final public GenomeLocParser parser; + final int basePairs; + final List original; + final List expected; + + protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(clazz, name); + this.referenceFile = referenceFile; + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.expected = parse(parser, expected); + } + + private static List parse(GenomeLocParser parser, List locs) { + List parsed = new ArrayList(); + for (String loc: locs) + parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); + return parsed; + } + } + + private static class FlankingIntervalsFile extends FlankingIntervalsTestData { + public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + private static class FlankingIntervalsList extends FlankingIntervalsTestData { + public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + /* Intervals where the original and the flanks can be written to files. */ + @DataProvider(name = "flankingIntervalsFiles") + public Object[][] getFlankingIntervalsFiles() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); + + new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:1"), + Arrays.asList("1:2")); + + new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1"), + Arrays.asList("1:2-51")); + + new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1-10"), + Arrays.asList("1:11-60")); + + new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + hg19Length1), + Arrays.asList("1:" + (hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:" + hg19Length1), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); + + new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:2"), + Arrays.asList("1:1", "1:3")); + + new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:21-30"), + Arrays.asList("1:1-20", "1:31-80")); + + new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 1)), + Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); + + new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), + String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); + + new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:3"), + Arrays.asList("1:2", "1:4")); + + new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200"), + Arrays.asList("1:51-100", "1:201-250")); + + new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 3)), + Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); + + new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), + String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); + + new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:401-500"), + Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); + + new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:271-400"), + Arrays.asList("1:51-100", "1:201-270", "1:401-450")); + + new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:221-400"), + Arrays.asList("1:51-100", "1:201-220", "1:401-450")); + + new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:121-400"), + Arrays.asList("1:51-100", "1:401-450")); + + new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "2:301-400"), + Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); + + // Explicit testing a problematic agilent target pair + new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), + // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") + Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); + + return TestDataProvider.getTests(FlankingIntervalsFile.class); + } + + /* Intervals where either the original and/or the flanks cannot be written to a file. */ + @DataProvider(name = "flankingIntervalsLists") + public Object[][] getFlankingIntervalsLists() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + List empty = Collections.emptyList(); + + new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, + empty, + empty); + + new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("unmapped"), + empty); + + new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1"), + empty); + + new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1", "2", "3"), + empty); + + new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400", "unmapped"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + return TestDataProvider.getTests(FlankingIntervalsList.class); + } + + @Test(dataProvider = "flankingIntervalsFiles") + public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + + List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); + + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) + public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + // Should throw a user exception on bad input if either the original + // intervals are empty or if the flanking intervals are empty + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists") + public void testGetFlankingIntervals(FlankingIntervalsTestData data) { + List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala new file mode 100644 index 000000000..d90db0de4 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/WriteFlankingIntervalsFunction.scala @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.extensions.gatk + +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline.{Output, Argument, Input} +import java.io.File +import org.broadinstitute.sting.utils.interval.IntervalUtils + +class WriteFlankingIntervalsFunction extends InProcessFunction { + @Input(doc="The reference sequence") + var reference : File = _ + + @Input(doc="The interval list to flank") + var inputIntervals : File = _ + + @Output(doc="The output intervals file to write to") + var outputIntervals: File = _ + + @Argument(doc="Number of base pair to flank the input intervals") + var flankSize : Int = _ + + def run() { + IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize) + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala deleted file mode 100755 index 77eb3ccbc..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import collection.JavaConversions._ -import org.broadinstitute.sting.utils.text.XReadLines -import net.sf.picard.reference.FastaSequenceFile -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} -import collection.immutable.TreeSet - -// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created -class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction { - @Input(doc="The interval list to expand") val inList : File = in - @Input(doc="The reference sequence") val refDict : File = ref - @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start - @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size - @Output(doc="The output intervals file to write to") val outList : File = out - @Argument(doc="The output format for the intervals") val outTypeStr = opType - @Argument(doc="The input format for the intervals") val inTypeStr = ipType - - var output : PrintStream = _ - var parser : GenomeLocParser = _ - var xrl : XReadLines = _ - val outType = IntervalFormatType.convert(outTypeStr) - val inType = IntervalFormatType.convert(inTypeStr) - - var offsetIn : Int = 0 - var offsetOut : Int = 0 - - var first : Boolean = true - var lastTwo : (GenomeLoc,GenomeLoc) = _ - - var intervalCache : TreeSet[GenomeLoc] = _ - val LINES_TO_CACHE : Int = 1000 - - def run = { - output = new PrintStream(outList) - intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{ - def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) } - }) - parser = new GenomeLocParser(new FastaSequenceFile(ref,true)) - xrl = new XReadLines(inList) - offsetIn = if (isBed(inType)) 1 else 0 - offsetOut = if( isBed(outType)) 1 else 0 - var line : String = xrl.next - while ( line.startsWith("@") ) { - line = xrl.next - } - var prevLoc: GenomeLoc = null - var curLoc: GenomeLoc = null - var nextLoc : GenomeLoc = parseGenomeInterval(line) - var linesProcessed : Int = 1 - while ( prevLoc != null || curLoc != null || nextLoc != null ) { - prevLoc = curLoc - curLoc = nextLoc - nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null - if ( curLoc != null ) { - val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc) - val right: GenomeLoc = refine(expandRight(curLoc),nextLoc) - if ( left != null ) { - intervalCache += left - } - if ( right != null ) { - intervalCache += right - } - } - linesProcessed += 1 - if ( linesProcessed % LINES_TO_CACHE == 0 ) { - val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt)) - intervalCache = intervalCache -- toPrint - toPrint.foreach(u => output.print("%s%n".format(repr(u)))) - } - //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc))) - } - - intervalCache.foreach(u => output.print("%s%n".format(repr(u)))) - - output.close() - } - - def expandLeft(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt) - } - - def expandRight(g: GenomeLoc) : GenomeLoc = { - parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt) - } - - def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = { - if ( borderG == null || ! newG.overlapsP(borderG) ) { - return newG - } else { - if ( newG.getStart < borderG.getStart ) { - if ( borderG.getStart - startInt > newG.getStart ) { - return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt) - } - } else { - if ( borderG.getStop + startInt < newG.getStop ){ - return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop) - } - } - } - - null - } - - def repr(loc : GenomeLoc) : String = { - if ( loc == null ) return "null" - if ( outType == IntervalFormatType.INTERVALS ) { - return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop) - } else { - return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut) - } - } - - def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = { - t == IntervalFormatType.BED - } - - def parseGenomeInterval( s : String ) : GenomeLoc = { - val sp = s.split("\\s+") - // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted - if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) - } - - object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { - type IntervalFormatType = Value - val INTERVALS,BED,TDF = Value - - def convert(s : String) : IntervalFormatType = { - if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF} - } - } -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala deleted file mode 100755 index e929477a1..000000000 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/IntersectIntervals.scala +++ /dev/null @@ -1,70 +0,0 @@ -package org.broadinstitute.sting.queue.library.ipf.intervals - -import org.broadinstitute.sting.queue.function.InProcessFunction -import collection.JavaConversions._ -import org.broadinstitute.sting.commandline._ -import java.io.{PrintStream, File} -import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary} -import org.broadinstitute.sting.utils.text.XReadLines -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} - -class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction { - @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals - @Output(doc="Output interval file to which to write") val output : File = outFile - @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false - @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed - - - var outStream : PrintStream = _ - var contigs : List[String] = Nil - var dict : SAMSequenceDictionary = _ - var parser : GenomeLocParser = _ - - def run = { - outStream = new PrintStream(output) - dict = new SAMSequenceDictionary - // note: memory hog - val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex - sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE))) - parser = new GenomeLocParser(dict) - sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h)) - outStream.close() - } - - def writeOut(g : GenomeLoc) : Unit = { - outStream.print("%s%n".format(g.toString)) - } - - def parse(s : String) : (String,Int,Int) = { - if ( s.contains(":") ) { - val split1 = s.split(":") - val split2 = split1(1).split("-") - return (split1(0),split2(0).toInt,split2(1).toInt) - } else { - val split = s.split("\\s+") - return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) ) - } - } - - def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = { - parser.createGenomeLoc(coords._1,coords._2,coords._3) - } - - def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = { - if ( a.last.forall(u => u._1.overlapsP(b._1)) ) { - a.init :+ (a.last :+ b) - } else { - a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b) - } - } - - def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = { - if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for - null - } - u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) ) - } - - def range(a: Int, b: Int) : Range = new Range(a,b+1,1) - -} \ No newline at end of file