Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2011-11-18 12:29:33 -05:00
commit 8710673a97
10 changed files with 521 additions and 232 deletions

View File

@ -76,7 +76,7 @@
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
<!-- snpEff annotator for pipelines -->
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.2" />
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.4rc3" />
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
<exclude org="javax.servlet" />

View File

@ -554,4 +554,54 @@ public class GenomeLocParser {
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
}
/**
* Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size.
* @param loc The original loc
* @param maxBasePairs The maximum number of basePairs
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
*/
@Requires({"loc != null", "maxBasePairs > 0"})
public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
if (GenomeLoc.isUnmapped(loc))
return null;
String contigName = loc.getContig();
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
int contigIndex = contig.getSequenceIndex();
int start = loc.getStart() - maxBasePairs;
int stop = loc.getStart() - 1;
if (start < 1)
start = 1;
if (stop < 1)
return null;
return createGenomeLoc(contigName, contigIndex, start, stop, true);
}
/**
* Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
* @param loc The original loc
* @param maxBasePairs The maximum number of basePairs
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
*/
@Requires({"loc != null", "maxBasePairs > 0"})
public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
if (GenomeLoc.isUnmapped(loc))
return null;
String contigName = loc.getContig();
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
int contigIndex = contig.getSequenceIndex();
int contigLength = contig.getSequenceLength();
int start = loc.getStop() + 1;
int stop = loc.getStop() + maxBasePairs;
if (start > contigLength)
return null;
if (stop > contigLength)
stop = contigLength;
return createGenomeLoc(contigName, contigIndex, start, stop, true);
}
}

View File

@ -233,8 +233,12 @@ public class IntervalUtils {
*
* Returns a null string if there are no differences, otherwise returns a string describing the difference
* (useful for UnitTests). Assumes both lists are sorted
*
* @param masterArg sorted master genome locs
* @param testArg sorted test genome locs
* @return null string if there are no difference, otherwise a string describing the difference
*/
public static final String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
public static String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
@ -317,23 +321,6 @@ public class IntervalUtils {
return lengths;
}
/**
* Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
* @param locs The genome locs.
* @return The maximum number of parts the intervals can be split into.
*/
public static int countContigIntervals(List<GenomeLoc> locs) {
int maxFiles = 0;
String contig = null;
for (GenomeLoc loc: locs) {
if (contig == null || !contig.equals(loc.getContig())) {
maxFiles++;
contig = loc.getContig();
}
}
return maxFiles;
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
@ -373,7 +360,6 @@ public class IntervalUtils {
* @return A list of lists of genome locs, split according to splits
*/
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
int locIndex = 1;
int start = 0;
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
for (Integer stop: splits) {
@ -465,7 +451,7 @@ public class IntervalUtils {
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
@Ensures({"result != null"})
final static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
long size = 0;
@ -579,10 +565,101 @@ public class IntervalUtils {
}
}
public static final long intervalSize(final List<GenomeLoc> locs) {
public static long intervalSize(final List<GenomeLoc> locs) {
long size = 0;
for ( final GenomeLoc loc : locs )
size += loc.size();
return size;
}
public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());
if (originalList.isEmpty())
throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");
List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);
if (flankingList.isEmpty())
throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");
SAMFileHeader samFileHeader = new SAMFileHeader();
samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
IntervalList intervalList = new IntervalList(samFileHeader);
int i = 0;
for (GenomeLoc loc: flankingList)
intervalList.add(toInterval(loc, ++i));
intervalList.write(flankingIntervals);
}
/**
* Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs.
* @param parser A genome loc parser for creating the new intervals
* @param locs Original genome locs
* @param basePairs Number of base pairs on each side of loc
* @return The list of intervals between the locs
*/
public static List<GenomeLoc> getFlankingIntervals(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
List<GenomeLoc> sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList();
if (sorted.size() == 0)
return Collections.emptyList();
LinkedHashMap<String, List<GenomeLoc>> locsByContig = splitByContig(sorted);
List<GenomeLoc> expanded = new ArrayList<GenomeLoc>();
for (String contig: locsByContig.keySet()) {
List<GenomeLoc> contigLocs = locsByContig.get(contig);
int contigLocsSize = contigLocs.size();
GenomeLoc startLoc, stopLoc;
// Create loc at start of the list
startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs);
if (startLoc != null)
expanded.add(startLoc);
// Create locs between each loc[i] and loc[i+1]
for (int i = 0; i < contigLocsSize - 1; i++) {
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs);
startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs);
if (stopLoc.getStop() + 1 >= startLoc.getStart()) {
// NOTE: This is different than GenomeLoc.merge()
// merge() returns a loc which covers the entire range of stop and start,
// possibly returning positions inside loc(i) or loc(i+1)
// We want to make sure that the start of the stopLoc is used, and the stop of the startLoc
GenomeLoc merged = parser.createGenomeLoc(
stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop());
expanded.add(merged);
} else {
expanded.add(stopLoc);
expanded.add(startLoc);
}
}
// Create loc at the end of the list
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs);
if (stopLoc != null)
expanded.add(stopLoc);
}
return expanded;
}
private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(List<GenomeLoc> sorted) {
LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<String, List<GenomeLoc>>();
GenomeLoc last = null;
List<GenomeLoc> contigLocs = null;
for (GenomeLoc loc: sorted) {
if (GenomeLoc.isUnmapped(loc))
continue;
if (last == null || !last.onSameContig(loc)) {
contigLocs = new ArrayList<GenomeLoc>();
splits.put(loc.getContig(), contigLocs);
}
contigLocs.add(loc);
last = loc;
}
return splits;
}
}

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testGetContigIndexValid() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
}
@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test
public void testGetContigInfoKnownContig() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
}
@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest {
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
}
private static class FlankingGenomeLocTestData extends TestDataProvider {
final GenomeLocParser parser;
final int basePairs;
final GenomeLoc original, flankStart, flankStop;
private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
super(FlankingGenomeLocTestData.class, name);
this.parser = parser;
this.basePairs = basePairs;
this.original = parse(parser, original);
this.flankStart = flankStart == null ? null : parse(parser, flankStart);
this.flankStop = flankStop == null ? null : parse(parser, flankStop);
}
private static GenomeLoc parse(GenomeLocParser parser, String str) {
return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str);
}
}
@DataProvider(name = "flankingGenomeLocs")
public Object[][] getFlankingGenomeLocs() {
int contigLength = 10000;
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary());
new FlankingGenomeLocTestData("atStartBase1", parser, 1,
"chr1:1", null, "chr1:2");
new FlankingGenomeLocTestData("atStartBase50", parser, 50,
"chr1:1", null, "chr1:2-51");
new FlankingGenomeLocTestData("atStartRange50", parser, 50,
"chr1:1-10", null, "chr1:11-60");
new FlankingGenomeLocTestData("atEndBase1", parser, 1,
"chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
new FlankingGenomeLocTestData("atEndBase50", parser, 50,
"chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
new FlankingGenomeLocTestData("atEndRange50", parser, 50,
String.format("chr1:%d-%d", contigLength - 10, contigLength),
String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
null);
new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
"chr1:2", "chr1:1", "chr1:3");
new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
"chr1:21-30", "chr1:1-20", "chr1:31-80");
new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
"chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
String.format("chr1:%d-%d", contigLength - 20, contigLength));
new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
"chr1:3", "chr1:2", "chr1:4");
new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
"chr1:101-200", "chr1:51-100", "chr1:201-250");
new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
"chr1:" + (contigLength - 3),
"chr1:" + (contigLength - 4),
"chr1:" + (contigLength - 2));
new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));
new FlankingGenomeLocTestData("unmapped", parser, 50,
"unmapped", null, null);
new FlankingGenomeLocTestData("fullContig", parser, 50,
"chr1", null, null);
return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
}
@Test(dataProvider = "flankingGenomeLocs")
public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) {
GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
data.toString(), data.original, actual, data.flankStart);
assertEquals(actual, data.flankStart, description);
}
@Test(dataProvider = "flankingGenomeLocs")
public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) {
GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
data.toString(), data.original, actual, data.flankStop);
assertEquals(actual, data.flankStop, description);
}
}

View File

@ -1,8 +1,8 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.util.IntervalUtil;
import net.sf.samtools.SAMFileHeader;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<GenomeLoc> merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
Assert.assertEquals(merged.size(), 1);
}
/*
Split into tests that can be written to files and tested by writeFlankingIntervals,
and lists that cannot but are still handled by getFlankingIntervals.
*/
private static abstract class FlankingIntervalsTestData extends TestDataProvider {
final public File referenceFile;
final public GenomeLocParser parser;
final int basePairs;
final List<GenomeLoc> original;
final List<GenomeLoc> expected;
protected FlankingIntervalsTestData(Class<?> clazz, String name, File referenceFile, GenomeLocParser parser,
int basePairs, List<String> original, List<String> expected) {
super(clazz, name);
this.referenceFile = referenceFile;
this.parser = parser;
this.basePairs = basePairs;
this.original = parse(parser, original);
this.expected = parse(parser, expected);
}
private static List<GenomeLoc> parse(GenomeLocParser parser, List<String> locs) {
List<GenomeLoc> parsed = new ArrayList<GenomeLoc>();
for (String loc: locs)
parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc));
return parsed;
}
}
private static class FlankingIntervalsFile extends FlankingIntervalsTestData {
public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser,
int basePairs, List<String> original, List<String> expected) {
super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected);
}
}
private static class FlankingIntervalsList extends FlankingIntervalsTestData {
public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser,
int basePairs, List<String> original, List<String> expected) {
super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected);
}
}
/* Intervals where the original and the flanks can be written to files. */
@DataProvider(name = "flankingIntervalsFiles")
public Object[][] getFlankingIntervalsFiles() {
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength();
new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
Arrays.asList("1:1"),
Arrays.asList("1:2"));
new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:1"),
Arrays.asList("1:2-51"));
new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:1-10"),
Arrays.asList("1:11-60"));
new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
Arrays.asList("1:" + hg19Length1),
Arrays.asList("1:" + (hg19Length1 - 1)));
new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:" + hg19Length1),
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1)));
new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)),
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11)));
new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
Arrays.asList("1:2"),
Arrays.asList("1:1", "1:3"));
new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:21-30"),
Arrays.asList("1:1-20", "1:31-80"));
new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
Arrays.asList("1:" + (hg19Length1 - 1)),
Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1));
new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)),
Arrays.asList(
String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31),
String.format("1:%d-%d", hg19Length1 - 20, hg19Length1)));
new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
Arrays.asList("1:3"),
Arrays.asList("1:2", "1:4"));
new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200"),
Arrays.asList("1:51-100", "1:201-250"));
new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
Arrays.asList("1:" + (hg19Length1 - 3)),
Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2)));
new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)),
Arrays.asList(
String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201),
String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51)));
new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "1:401-500"),
Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550"));
new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "1:301-400"),
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "1:271-400"),
Arrays.asList("1:51-100", "1:201-270", "1:401-450"));
new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "1:221-400"),
Arrays.asList("1:51-100", "1:201-220", "1:401-450"));
new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "1:121-400"),
Arrays.asList("1:51-100", "1:401-450"));
new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "2:301-400"),
Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450"));
// Explicit testing a problematic agilent target pair
new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("2:74756257-74756411", "2:74756487-74756628"),
// wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679")
Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678"));
return TestDataProvider.getTests(FlankingIntervalsFile.class);
}
/* Intervals where either the original and/or the flanks cannot be written to a file. */
@DataProvider(name = "flankingIntervalsLists")
public Object[][] getFlankingIntervalsLists() {
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
List<String> empty = Collections.emptyList();
new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50,
empty,
empty);
new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("unmapped"),
empty);
new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1"),
empty);
new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1", "2", "3"),
empty);
new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
Arrays.asList("1:101-200", "1:301-400", "unmapped"),
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
return TestDataProvider.getTests(FlankingIntervalsList.class);
}
@Test(dataProvider = "flankingIntervalsFiles")
public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
File originalFile = createTempFile("original.", ".intervals");
File flankingFile = createTempFile("flanking.", ".intervals");
try {
List<String> lines = new ArrayList<String>();
for (GenomeLoc loc: data.original)
lines.add(loc.toString());
FileUtils.writeLines(originalFile, lines);
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
List<GenomeLoc> actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath());
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
data.toString(), data.original, actual, data.expected);
Assert.assertEquals(actual, data.expected, description);
} finally {
FileUtils.deleteQuietly(originalFile);
FileUtils.deleteQuietly(flankingFile);
}
}
@Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class)
public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
File originalFile = createTempFile("original.", ".intervals");
File flankingFile = createTempFile("flanking.", ".intervals");
try {
List<String> lines = new ArrayList<String>();
for (GenomeLoc loc: data.original)
lines.add(loc.toString());
FileUtils.writeLines(originalFile, lines);
// Should throw a user exception on bad input if either the original
// intervals are empty or if the flanking intervals are empty
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
} finally {
FileUtils.deleteQuietly(originalFile);
FileUtils.deleteQuietly(flankingFile);
}
}
@Test(dataProvider = "flankingIntervalsLists")
public void testGetFlankingIntervals(FlankingIntervalsTestData data) {
List<GenomeLoc> actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs);
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
data.toString(), data.original, actual, data.expected);
Assert.assertEquals(actual, data.expected, description);
}
}

View File

@ -0,0 +1,48 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.commandline.{Output, Argument, Input}
import java.io.File
import org.broadinstitute.sting.utils.interval.IntervalUtils
class WriteFlankingIntervalsFunction extends InProcessFunction {
@Input(doc="The reference sequence")
var reference : File = _
@Input(doc="The interval list to flank")
var inputIntervals : File = _
@Output(doc="The output intervals file to write to")
var outputIntervals: File = _
@Argument(doc="Number of base pair to flank the input intervals")
var flankSize : Int = _
def run() {
IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize)
}
}

View File

@ -1,135 +0,0 @@
package org.broadinstitute.sting.queue.library.ipf.intervals
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.commandline._
import java.io.{PrintStream, File}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.text.XReadLines
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import collection.immutable.TreeSet
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
@Input(doc="The interval list to expand") val inList : File = in
@Input(doc="The reference sequence") val refDict : File = ref
@Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
@Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
@Output(doc="The output intervals file to write to") val outList : File = out
@Argument(doc="The output format for the intervals") val outTypeStr = opType
@Argument(doc="The input format for the intervals") val inTypeStr = ipType
var output : PrintStream = _
var parser : GenomeLocParser = _
var xrl : XReadLines = _
val outType = IntervalFormatType.convert(outTypeStr)
val inType = IntervalFormatType.convert(inTypeStr)
var offsetIn : Int = 0
var offsetOut : Int = 0
var first : Boolean = true
var lastTwo : (GenomeLoc,GenomeLoc) = _
var intervalCache : TreeSet[GenomeLoc] = _
val LINES_TO_CACHE : Int = 1000
def run = {
output = new PrintStream(outList)
intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
})
parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
xrl = new XReadLines(inList)
offsetIn = if (isBed(inType)) 1 else 0
offsetOut = if( isBed(outType)) 1 else 0
var line : String = xrl.next
while ( line.startsWith("@") ) {
line = xrl.next
}
var prevLoc: GenomeLoc = null
var curLoc: GenomeLoc = null
var nextLoc : GenomeLoc = parseGenomeInterval(line)
var linesProcessed : Int = 1
while ( prevLoc != null || curLoc != null || nextLoc != null ) {
prevLoc = curLoc
curLoc = nextLoc
nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
if ( curLoc != null ) {
val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc)
val right: GenomeLoc = refine(expandRight(curLoc),nextLoc)
if ( left != null ) {
intervalCache += left
}
if ( right != null ) {
intervalCache += right
}
}
linesProcessed += 1
if ( linesProcessed % LINES_TO_CACHE == 0 ) {
val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
intervalCache = intervalCache -- toPrint
toPrint.foreach(u => output.print("%s%n".format(repr(u))))
}
//System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
}
intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
output.close()
}
def expandLeft(g: GenomeLoc) : GenomeLoc = {
parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
}
def expandRight(g: GenomeLoc) : GenomeLoc = {
parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
}
def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
if ( borderG == null || ! newG.overlapsP(borderG) ) {
return newG
} else {
if ( newG.getStart < borderG.getStart ) {
if ( borderG.getStart - startInt > newG.getStart ) {
return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
}
} else {
if ( borderG.getStop + startInt < newG.getStop ){
return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
}
}
}
null
}
def repr(loc : GenomeLoc) : String = {
if ( loc == null ) return "null"
if ( outType == IntervalFormatType.INTERVALS ) {
return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
} else {
return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
}
}
def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
t == IntervalFormatType.BED
}
def parseGenomeInterval( s : String ) : GenomeLoc = {
val sp = s.split("\\s+")
// todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
}
object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
type IntervalFormatType = Value
val INTERVALS,BED,TDF = Value
def convert(s : String) : IntervalFormatType = {
if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
}
}
}

View File

@ -1,70 +0,0 @@
package org.broadinstitute.sting.queue.library.ipf.intervals
import org.broadinstitute.sting.queue.function.InProcessFunction
import collection.JavaConversions._
import org.broadinstitute.sting.commandline._
import java.io.{PrintStream, File}
import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
import org.broadinstitute.sting.utils.text.XReadLines
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
@Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
@Output(doc="Output interval file to which to write") val output : File = outFile
@Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
@Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed
var outStream : PrintStream = _
var contigs : List[String] = Nil
var dict : SAMSequenceDictionary = _
var parser : GenomeLocParser = _
def run = {
outStream = new PrintStream(output)
dict = new SAMSequenceDictionary
// note: memory hog
val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
parser = new GenomeLocParser(dict)
sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
outStream.close()
}
def writeOut(g : GenomeLoc) : Unit = {
outStream.print("%s%n".format(g.toString))
}
def parse(s : String) : (String,Int,Int) = {
if ( s.contains(":") ) {
val split1 = s.split(":")
val split2 = split1(1).split("-")
return (split1(0),split2(0).toInt,split2(1).toInt)
} else {
val split = s.split("\\s+")
return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
}
}
def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
parser.createGenomeLoc(coords._1,coords._2,coords._3)
}
def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
a.init :+ (a.last :+ b)
} else {
a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
}
}
def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for
null
}
u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
}
def range(a: Int, b: Int) : Range = new Range(a,b+1,1)
}

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.2" status="release" />
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.4rc3" status="release" />
</ivy-module>