Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
8710673a97
2
ivy.xml
2
ivy.xml
|
|
@ -76,7 +76,7 @@
|
||||||
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
|
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
|
||||||
|
|
||||||
<!-- snpEff annotator for pipelines -->
|
<!-- snpEff annotator for pipelines -->
|
||||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.2" />
|
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.4rc3" />
|
||||||
|
|
||||||
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
||||||
<exclude org="javax.servlet" />
|
<exclude org="javax.servlet" />
|
||||||
|
|
|
||||||
|
|
@ -554,4 +554,54 @@ public class GenomeLocParser {
|
||||||
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
|
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a loc to the left of the original loc (ending at the loc start - 1) of up to maxBasePairs size.
|
||||||
|
* @param loc The original loc
|
||||||
|
* @param maxBasePairs The maximum number of basePairs
|
||||||
|
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
|
||||||
|
*/
|
||||||
|
@Requires({"loc != null", "maxBasePairs > 0"})
|
||||||
|
public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
|
||||||
|
if (GenomeLoc.isUnmapped(loc))
|
||||||
|
return null;
|
||||||
|
String contigName = loc.getContig();
|
||||||
|
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
|
||||||
|
int contigIndex = contig.getSequenceIndex();
|
||||||
|
|
||||||
|
int start = loc.getStart() - maxBasePairs;
|
||||||
|
int stop = loc.getStart() - 1;
|
||||||
|
|
||||||
|
if (start < 1)
|
||||||
|
start = 1;
|
||||||
|
if (stop < 1)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return createGenomeLoc(contigName, contigIndex, start, stop, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
|
||||||
|
* @param loc The original loc
|
||||||
|
* @param maxBasePairs The maximum number of basePairs
|
||||||
|
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
|
||||||
|
*/
|
||||||
|
@Requires({"loc != null", "maxBasePairs > 0"})
|
||||||
|
public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
|
||||||
|
if (GenomeLoc.isUnmapped(loc))
|
||||||
|
return null;
|
||||||
|
String contigName = loc.getContig();
|
||||||
|
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
|
||||||
|
int contigIndex = contig.getSequenceIndex();
|
||||||
|
int contigLength = contig.getSequenceLength();
|
||||||
|
|
||||||
|
int start = loc.getStop() + 1;
|
||||||
|
int stop = loc.getStop() + maxBasePairs;
|
||||||
|
|
||||||
|
if (start > contigLength)
|
||||||
|
return null;
|
||||||
|
if (stop > contigLength)
|
||||||
|
stop = contigLength;
|
||||||
|
|
||||||
|
return createGenomeLoc(contigName, contigIndex, start, stop, true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -233,8 +233,12 @@ public class IntervalUtils {
|
||||||
*
|
*
|
||||||
* Returns a null string if there are no differences, otherwise returns a string describing the difference
|
* Returns a null string if there are no differences, otherwise returns a string describing the difference
|
||||||
* (useful for UnitTests). Assumes both lists are sorted
|
* (useful for UnitTests). Assumes both lists are sorted
|
||||||
|
*
|
||||||
|
* @param masterArg sorted master genome locs
|
||||||
|
* @param testArg sorted test genome locs
|
||||||
|
* @return null string if there are no differences, otherwise a string describing the difference
|
||||||
*/
|
*/
|
||||||
public static final String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
public static String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
||||||
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
|
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
|
||||||
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
|
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
|
||||||
|
|
||||||
|
|
@ -317,23 +321,6 @@ public class IntervalUtils {
|
||||||
return lengths;
|
return lengths;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
|
|
||||||
* @param locs The genome locs.
|
|
||||||
* @return The maximum number of parts the intervals can be split into.
|
|
||||||
*/
|
|
||||||
public static int countContigIntervals(List<GenomeLoc> locs) {
|
|
||||||
int maxFiles = 0;
|
|
||||||
String contig = null;
|
|
||||||
for (GenomeLoc loc: locs) {
|
|
||||||
if (contig == null || !contig.equals(loc.getContig())) {
|
|
||||||
maxFiles++;
|
|
||||||
contig = loc.getContig();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return maxFiles;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits an interval list into multiple files.
|
* Splits an interval list into multiple files.
|
||||||
* @param fileHeader The sam file header.
|
* @param fileHeader The sam file header.
|
||||||
|
|
@ -373,7 +360,6 @@ public class IntervalUtils {
|
||||||
* @return A list of lists of genome locs, split according to splits
|
* @return A list of lists of genome locs, split according to splits
|
||||||
*/
|
*/
|
||||||
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||||
int locIndex = 1;
|
|
||||||
int start = 0;
|
int start = 0;
|
||||||
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||||
for (Integer stop: splits) {
|
for (Integer stop: splits) {
|
||||||
|
|
@ -465,7 +451,7 @@ public class IntervalUtils {
|
||||||
|
|
||||||
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
|
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
|
||||||
@Ensures({"result != null"})
|
@Ensures({"result != null"})
|
||||||
final static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
||||||
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
|
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
|
||||||
long size = 0;
|
long size = 0;
|
||||||
|
|
||||||
|
|
@ -579,10 +565,101 @@ public class IntervalUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final long intervalSize(final List<GenomeLoc> locs) {
|
public static long intervalSize(final List<GenomeLoc> locs) {
|
||||||
long size = 0;
|
long size = 0;
|
||||||
for ( final GenomeLoc loc : locs )
|
for ( final GenomeLoc loc : locs )
|
||||||
size += loc.size();
|
size += loc.size();
|
||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
|
||||||
|
ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
|
||||||
|
GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
|
||||||
|
List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());
|
||||||
|
|
||||||
|
if (originalList.isEmpty())
|
||||||
|
throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");
|
||||||
|
|
||||||
|
List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);
|
||||||
|
|
||||||
|
if (flankingList.isEmpty())
|
||||||
|
throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");
|
||||||
|
|
||||||
|
SAMFileHeader samFileHeader = new SAMFileHeader();
|
||||||
|
samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
|
||||||
|
IntervalList intervalList = new IntervalList(samFileHeader);
|
||||||
|
int i = 0;
|
||||||
|
for (GenomeLoc loc: flankingList)
|
||||||
|
intervalList.add(toInterval(loc, ++i));
|
||||||
|
intervalList.write(flankingIntervals);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a list of the flanking intervals around and between the passed in locs. Does not extend UNMAPPED locs.
|
||||||
|
* @param parser A genome loc parser for creating the new intervals
|
||||||
|
* @param locs Original genome locs
|
||||||
|
* @param basePairs Number of base pairs on each side of loc
|
||||||
|
* @return The list of intervals between the locs
|
||||||
|
*/
|
||||||
|
public static List<GenomeLoc> getFlankingIntervals(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
|
||||||
|
List<GenomeLoc> sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList();
|
||||||
|
|
||||||
|
if (sorted.size() == 0)
|
||||||
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
LinkedHashMap<String, List<GenomeLoc>> locsByContig = splitByContig(sorted);
|
||||||
|
List<GenomeLoc> expanded = new ArrayList<GenomeLoc>();
|
||||||
|
for (String contig: locsByContig.keySet()) {
|
||||||
|
List<GenomeLoc> contigLocs = locsByContig.get(contig);
|
||||||
|
int contigLocsSize = contigLocs.size();
|
||||||
|
|
||||||
|
GenomeLoc startLoc, stopLoc;
|
||||||
|
|
||||||
|
// Create loc at start of the list
|
||||||
|
startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs);
|
||||||
|
if (startLoc != null)
|
||||||
|
expanded.add(startLoc);
|
||||||
|
|
||||||
|
// Create locs between each loc[i] and loc[i+1]
|
||||||
|
for (int i = 0; i < contigLocsSize - 1; i++) {
|
||||||
|
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs);
|
||||||
|
startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs);
|
||||||
|
if (stopLoc.getStop() + 1 >= startLoc.getStart()) {
|
||||||
|
// NOTE: This is different than GenomeLoc.merge()
|
||||||
|
// merge() returns a loc which covers the entire range of stop and start,
|
||||||
|
// possibly returning positions inside loc(i) or loc(i+1)
|
||||||
|
// We want to make sure that the start of the stopLoc is used, and the stop of the startLoc
|
||||||
|
GenomeLoc merged = parser.createGenomeLoc(
|
||||||
|
stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop());
|
||||||
|
expanded.add(merged);
|
||||||
|
} else {
|
||||||
|
expanded.add(stopLoc);
|
||||||
|
expanded.add(startLoc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create loc at the end of the list
|
||||||
|
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs);
|
||||||
|
if (stopLoc != null)
|
||||||
|
expanded.add(stopLoc);
|
||||||
|
}
|
||||||
|
return expanded;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(List<GenomeLoc> sorted) {
|
||||||
|
LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<String, List<GenomeLoc>>();
|
||||||
|
GenomeLoc last = null;
|
||||||
|
List<GenomeLoc> contigLocs = null;
|
||||||
|
for (GenomeLoc loc: sorted) {
|
||||||
|
if (GenomeLoc.isUnmapped(loc))
|
||||||
|
continue;
|
||||||
|
if (last == null || !last.onSameContig(loc)) {
|
||||||
|
contigLocs = new ArrayList<GenomeLoc>();
|
||||||
|
splits.put(loc.getContig(), contigLocs);
|
||||||
|
}
|
||||||
|
contigLocs.add(loc);
|
||||||
|
last = loc;
|
||||||
|
}
|
||||||
|
return splits;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import net.sf.samtools.SAMSequenceDictionary;
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
import static org.testng.Assert.assertEquals;
|
import static org.testng.Assert.assertEquals;
|
||||||
import static org.testng.Assert.assertTrue;
|
import static org.testng.Assert.assertTrue;
|
||||||
import org.testng.annotations.BeforeClass;
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.DataProvider;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetContigIndexValid() {
|
public void testGetContigIndexValid() {
|
||||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
|
||||||
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
|
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetContigInfoKnownContig() {
|
public void testGetContigInfoKnownContig() {
|
||||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
|
||||||
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
|
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
||||||
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
|
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
|
||||||
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
|
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class FlankingGenomeLocTestData extends TestDataProvider {
|
||||||
|
final GenomeLocParser parser;
|
||||||
|
final int basePairs;
|
||||||
|
final GenomeLoc original, flankStart, flankStop;
|
||||||
|
|
||||||
|
private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
|
||||||
|
super(FlankingGenomeLocTestData.class, name);
|
||||||
|
this.parser = parser;
|
||||||
|
this.basePairs = basePairs;
|
||||||
|
this.original = parse(parser, original);
|
||||||
|
this.flankStart = flankStart == null ? null : parse(parser, flankStart);
|
||||||
|
this.flankStop = flankStop == null ? null : parse(parser, flankStop);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static GenomeLoc parse(GenomeLocParser parser, String str) {
|
||||||
|
return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@DataProvider(name = "flankingGenomeLocs")
|
||||||
|
public Object[][] getFlankingGenomeLocs() {
|
||||||
|
int contigLength = 10000;
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
|
||||||
|
GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary());
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("atStartBase1", parser, 1,
|
||||||
|
"chr1:1", null, "chr1:2");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("atStartBase50", parser, 50,
|
||||||
|
"chr1:1", null, "chr1:2-51");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("atStartRange50", parser, 50,
|
||||||
|
"chr1:1-10", null, "chr1:11-60");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("atEndBase1", parser, 1,
|
||||||
|
"chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("atEndBase50", parser, 50,
|
||||||
|
"chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("atEndRange50", parser, 50,
|
||||||
|
String.format("chr1:%d-%d", contigLength - 10, contigLength),
|
||||||
|
String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
|
||||||
|
null);
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
|
||||||
|
"chr1:2", "chr1:1", "chr1:3");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
|
||||||
|
"chr1:21-30", "chr1:1-20", "chr1:31-80");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
|
||||||
|
"chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
|
||||||
|
String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
|
||||||
|
String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
|
||||||
|
String.format("chr1:%d-%d", contigLength - 20, contigLength));
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
|
||||||
|
"chr1:3", "chr1:2", "chr1:4");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
|
||||||
|
"chr1:101-200", "chr1:51-100", "chr1:201-250");
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
|
||||||
|
"chr1:" + (contigLength - 3),
|
||||||
|
"chr1:" + (contigLength - 4),
|
||||||
|
"chr1:" + (contigLength - 2));
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
|
||||||
|
String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
|
||||||
|
String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
|
||||||
|
String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("unmapped", parser, 50,
|
||||||
|
"unmapped", null, null);
|
||||||
|
|
||||||
|
new FlankingGenomeLocTestData("fullContig", parser, 50,
|
||||||
|
"chr1", null, null);
|
||||||
|
|
||||||
|
return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "flankingGenomeLocs")
|
||||||
|
public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) {
|
||||||
|
GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
|
||||||
|
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||||
|
data.toString(), data.original, actual, data.flankStart);
|
||||||
|
assertEquals(actual, data.flankStart, description);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "flankingGenomeLocs")
|
||||||
|
public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) {
|
||||||
|
GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
|
||||||
|
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||||
|
data.toString(), data.original, actual, data.flankStop);
|
||||||
|
assertEquals(actual, data.flankStop, description);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
package org.broadinstitute.sting.utils.interval;
|
package org.broadinstitute.sting.utils.interval;
|
||||||
|
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
import net.sf.picard.util.IntervalUtil;
|
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
||||||
List<GenomeLoc> merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
|
List<GenomeLoc> merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
|
||||||
Assert.assertEquals(merged.size(), 1);
|
Assert.assertEquals(merged.size(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Split into tests that can be written to files and tested by writeFlankingIntervals,
|
||||||
|
and lists that cannot but are still handled by getFlankingIntervals.
|
||||||
|
*/
|
||||||
|
private static abstract class FlankingIntervalsTestData extends TestDataProvider {
|
||||||
|
final public File referenceFile;
|
||||||
|
final public GenomeLocParser parser;
|
||||||
|
final int basePairs;
|
||||||
|
final List<GenomeLoc> original;
|
||||||
|
final List<GenomeLoc> expected;
|
||||||
|
|
||||||
|
protected FlankingIntervalsTestData(Class<?> clazz, String name, File referenceFile, GenomeLocParser parser,
|
||||||
|
int basePairs, List<String> original, List<String> expected) {
|
||||||
|
super(clazz, name);
|
||||||
|
this.referenceFile = referenceFile;
|
||||||
|
this.parser = parser;
|
||||||
|
this.basePairs = basePairs;
|
||||||
|
this.original = parse(parser, original);
|
||||||
|
this.expected = parse(parser, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<GenomeLoc> parse(GenomeLocParser parser, List<String> locs) {
|
||||||
|
List<GenomeLoc> parsed = new ArrayList<GenomeLoc>();
|
||||||
|
for (String loc: locs)
|
||||||
|
parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc));
|
||||||
|
return parsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class FlankingIntervalsFile extends FlankingIntervalsTestData {
|
||||||
|
public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser,
|
||||||
|
int basePairs, List<String> original, List<String> expected) {
|
||||||
|
super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class FlankingIntervalsList extends FlankingIntervalsTestData {
|
||||||
|
public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser,
|
||||||
|
int basePairs, List<String> original, List<String> expected) {
|
||||||
|
super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Intervals where the original and the flanks can be written to files. */
|
||||||
|
@DataProvider(name = "flankingIntervalsFiles")
|
||||||
|
public Object[][] getFlankingIntervalsFiles() {
|
||||||
|
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
|
||||||
|
int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength();
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||||
|
Arrays.asList("1:1"),
|
||||||
|
Arrays.asList("1:2"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:1"),
|
||||||
|
Arrays.asList("1:2-51"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:1-10"),
|
||||||
|
Arrays.asList("1:11-60"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||||
|
Arrays.asList("1:" + hg19Length1),
|
||||||
|
Arrays.asList("1:" + (hg19Length1 - 1)));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:" + hg19Length1),
|
||||||
|
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1)));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)),
|
||||||
|
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11)));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||||
|
Arrays.asList("1:2"),
|
||||||
|
Arrays.asList("1:1", "1:3"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:21-30"),
|
||||||
|
Arrays.asList("1:1-20", "1:31-80"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||||
|
Arrays.asList("1:" + (hg19Length1 - 1)),
|
||||||
|
Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)),
|
||||||
|
Arrays.asList(
|
||||||
|
String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31),
|
||||||
|
String.format("1:%d-%d", hg19Length1 - 20, hg19Length1)));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||||
|
Arrays.asList("1:3"),
|
||||||
|
Arrays.asList("1:2", "1:4"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-250"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||||
|
Arrays.asList("1:" + (hg19Length1 - 3)),
|
||||||
|
Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2)));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)),
|
||||||
|
Arrays.asList(
|
||||||
|
String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201),
|
||||||
|
String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51)));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "1:401-500"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "1:301-400"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "1:271-400"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-270", "1:401-450"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "1:221-400"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-220", "1:401-450"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "1:121-400"),
|
||||||
|
Arrays.asList("1:51-100", "1:401-450"));
|
||||||
|
|
||||||
|
new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "2:301-400"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450"));
|
||||||
|
|
||||||
|
// Explicitly testing a problematic Agilent target pair
|
||||||
|
new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("2:74756257-74756411", "2:74756487-74756628"),
|
||||||
|
// wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679")
|
||||||
|
Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678"));
|
||||||
|
|
||||||
|
return TestDataProvider.getTests(FlankingIntervalsFile.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Intervals where the original and/or the flanks cannot be written to a file. */
|
||||||
|
@DataProvider(name = "flankingIntervalsLists")
|
||||||
|
public Object[][] getFlankingIntervalsLists() {
|
||||||
|
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
|
||||||
|
List<String> empty = Collections.emptyList();
|
||||||
|
|
||||||
|
new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
empty,
|
||||||
|
empty);
|
||||||
|
|
||||||
|
new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("unmapped"),
|
||||||
|
empty);
|
||||||
|
|
||||||
|
new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1"),
|
||||||
|
empty);
|
||||||
|
|
||||||
|
new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1", "2", "3"),
|
||||||
|
empty);
|
||||||
|
|
||||||
|
new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||||
|
Arrays.asList("1:101-200", "1:301-400", "unmapped"),
|
||||||
|
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
|
||||||
|
|
||||||
|
return TestDataProvider.getTests(FlankingIntervalsList.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "flankingIntervalsFiles")
|
||||||
|
public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
|
||||||
|
File originalFile = createTempFile("original.", ".intervals");
|
||||||
|
File flankingFile = createTempFile("flanking.", ".intervals");
|
||||||
|
try {
|
||||||
|
List<String> lines = new ArrayList<String>();
|
||||||
|
for (GenomeLoc loc: data.original)
|
||||||
|
lines.add(loc.toString());
|
||||||
|
FileUtils.writeLines(originalFile, lines);
|
||||||
|
|
||||||
|
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
|
||||||
|
|
||||||
|
List<GenomeLoc> actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath());
|
||||||
|
|
||||||
|
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||||
|
data.toString(), data.original, actual, data.expected);
|
||||||
|
Assert.assertEquals(actual, data.expected, description);
|
||||||
|
} finally {
|
||||||
|
FileUtils.deleteQuietly(originalFile);
|
||||||
|
FileUtils.deleteQuietly(flankingFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class)
|
||||||
|
public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
|
||||||
|
File originalFile = createTempFile("original.", ".intervals");
|
||||||
|
File flankingFile = createTempFile("flanking.", ".intervals");
|
||||||
|
try {
|
||||||
|
List<String> lines = new ArrayList<String>();
|
||||||
|
for (GenomeLoc loc: data.original)
|
||||||
|
lines.add(loc.toString());
|
||||||
|
FileUtils.writeLines(originalFile, lines);
|
||||||
|
|
||||||
|
// Should throw a user exception on bad input if either the original
|
||||||
|
// intervals are empty or if the flanking intervals are empty
|
||||||
|
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
|
||||||
|
} finally {
|
||||||
|
FileUtils.deleteQuietly(originalFile);
|
||||||
|
FileUtils.deleteQuietly(flankingFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(dataProvider = "flankingIntervalsLists")
|
||||||
|
public void testGetFlankingIntervals(FlankingIntervalsTestData data) {
|
||||||
|
List<GenomeLoc> actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs);
|
||||||
|
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||||
|
data.toString(), data.original, actual, data.expected);
|
||||||
|
Assert.assertEquals(actual, data.expected, description);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2011, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.queue.extensions.gatk
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||||
|
import org.broadinstitute.sting.commandline.{Output, Argument, Input}
|
||||||
|
import java.io.File
|
||||||
|
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||||
|
|
||||||
|
/**
 * In-process function that hands its reference, input interval list, output file and flank
 * size to IntervalUtils.writeFlankingIntervals.
 */
class WriteFlankingIntervalsFunction extends InProcessFunction {

  @Input(doc="The reference sequence")
  var reference: File = _

  @Input(doc="The interval list to flank")
  var inputIntervals: File = _

  @Output(doc="The output intervals file to write to")
  var outputIntervals: File = _

  @Argument(doc="Number of base pair to flank the input intervals")
  var flankSize: Int = _

  /** Delegates the whole job to IntervalUtils; nothing is computed in this class. */
  def run(): Unit = {
    IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize)
  }
}
|
||||||
|
|
@ -1,135 +0,0 @@
|
||||||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
|
||||||
import org.broadinstitute.sting.commandline._
|
|
||||||
import java.io.{PrintStream, File}
|
|
||||||
import collection.JavaConversions._
|
|
||||||
import org.broadinstitute.sting.utils.text.XReadLines
|
|
||||||
import net.sf.picard.reference.FastaSequenceFile
|
|
||||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
|
||||||
import collection.immutable.TreeSet
|
|
||||||
|
|
||||||
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
|
|
||||||
/**
 * In-process function that reads an interval list and, for every interval, emits one window
 * to its left (from start-startInt-sizeInt to start-startInt) and one to its right
 * (from stop+startInt to stop+startInt+sizeInt). A window that overlaps the neighbouring
 * input interval is trimmed or dropped by refine().
 *
 * NOTE(review): windows are not clipped against contig bounds in this class; whether the
 * parser rejects off-contig coordinates is not visible from here.
 *
 * @param in     interval list to expand
 * @param start  distance between an interval's edge and the window placed beside it
 * @param size   extent of each window
 * @param out    file the windows are written to
 * @param ref    reference sequence used to build the GenomeLocParser
 * @param ipType input format name ("INTERVALS", "BED", anything else is treated as TDF)
 * @param opType output format name (same convention as ipType)
 */
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
  @Input(doc="The interval list to expand") val inList : File = in
  @Input(doc="The reference sequence") val refDict : File = ref
  @Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
  @Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
  @Output(doc="The output intervals file to write to") val outList : File = out
  @Argument(doc="The output format for the intervals") val outTypeStr = opType
  @Argument(doc="The input format for the intervals") val inTypeStr = ipType

  var output : PrintStream = _
  var parser : GenomeLocParser = _
  var xrl : XReadLines = _
  val outType = IntervalFormatType.convert(outTypeStr)
  val inType = IntervalFormatType.convert(inTypeStr)

  // coordinate shifts applied when reading / writing BED (1 for BED, 0 otherwise); set in run
  var offsetIn : Int = 0
  var offsetOut : Int = 0

  // not used by any method of this class; kept so the public members stay unchanged
  var first : Boolean = true
  var lastTwo : (GenomeLoc,GenomeLoc) = _

  // windows waiting to be written, kept sorted by GenomeLoc.compareTo
  var intervalCache : TreeSet[GenomeLoc] = _
  val LINES_TO_CACHE : Int = 1000

  /**
   * Streams the input with a (previous, current, next) sliding triple so each window can be
   * refined against both neighbours, buffering windows in intervalCache and flushing the
   * ones far enough behind the read position every LINES_TO_CACHE lines.
   * The output stream is closed even when reading or parsing fails.
   */
  def run = {
    output = new PrintStream(outList)
    try {
      intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
        def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
      })
      parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
      xrl = new XReadLines(inList)
      offsetIn = if (isBed(inType)) 1 else 0
      offsetOut = if( isBed(outType)) 1 else 0

      // skip leading "@" header lines; an empty or header-only file simply yields no windows
      var line : String = if ( xrl.hasNext ) xrl.next else null
      while ( line != null && line.startsWith("@") ) {
        line = if ( xrl.hasNext ) xrl.next else null
      }

      var prevLoc: GenomeLoc = null
      var curLoc: GenomeLoc = null
      var nextLoc : GenomeLoc = if ( line != null ) parseGenomeInterval(line) else null
      var linesProcessed : Int = 1
      while ( prevLoc != null || curLoc != null || nextLoc != null ) {
        prevLoc = curLoc
        curLoc = nextLoc
        nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
        if ( curLoc != null ) {
          val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc)
          val right: GenomeLoc = refine(expandRight(curLoc),nextLoc)
          if ( left != null ) {
            intervalCache += left
          }
          if ( right != null ) {
            intervalCache += right
          }
        }
        linesProcessed += 1
        // prevLoc is null on the first and on the final pass of this loop; there is nothing to
        // measure against then, and whatever remains is written after the loop anyway
        if ( linesProcessed % LINES_TO_CACHE == 0 && prevLoc != null ) {
          val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
          intervalCache = intervalCache -- toPrint
          toPrint.foreach(u => output.print("%s%n".format(repr(u))))
        }
      }

      intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
    } finally {
      output.close()
    }
  }

  /** Window on the left of g: from start-startInt-sizeInt to start-startInt. */
  def expandLeft(g: GenomeLoc) : GenomeLoc = {
    parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
  }

  /** Window on the right of g: from stop+startInt to stop+startInt+sizeInt. */
  def expandRight(g: GenomeLoc) : GenomeLoc = {
    parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
  }

  /**
   * Trims newG so that it keeps a distance of startInt from borderG.
   * Returns newG unchanged when there is no border or no overlap, the trimmed window when a
   * part of it survives, and null when nothing is left.
   */
  def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
    if ( borderG == null || ! newG.overlapsP(borderG) ) {
      return newG
    } else {
      if ( newG.getStart < borderG.getStart ) {
        // window starts before the border: keep only the part ending startInt before it
        if ( borderG.getStart - startInt > newG.getStart ) {
          return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
        }
      } else {
        // window starts inside or after the border: keep only the part beginning startInt after it
        if ( borderG.getStop + startInt < newG.getStop ){
          return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
        }
      }
    }

    null
  }

  /** Formats loc for the output file: "contig:start-stop" for INTERVALS, tab-separated otherwise. */
  def repr(loc : GenomeLoc) : String = {
    if ( loc == null ) return "null"
    if ( outType == IntervalFormatType.INTERVALS ) {
      return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
    } else {
      return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
    }
  }

  def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
    t == IntervalFormatType.BED
  }

  /**
   * Parses one input line: anything containing ":" goes to the parser as a loc string,
   * otherwise the line is split on whitespace into contig, start and stop, with the
   * input offset applied to both coordinates.
   */
  def parseGenomeInterval( s : String ) : GenomeLoc = {
    val sp = s.split("\\s+")
    // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
    if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
  }

  /** Supported interval file formats; convert() maps any unrecognised name to TDF. */
  object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
    type IntervalFormatType = Value
    val INTERVALS,BED,TDF = Value

    def convert(s : String) : IntervalFormatType = {
      if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
    }
  }
}
|
|
||||||
|
|
@ -1,70 +0,0 @@
|
||||||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
|
||||||
import collection.JavaConversions._
|
|
||||||
import org.broadinstitute.sting.commandline._
|
|
||||||
import java.io.{PrintStream, File}
|
|
||||||
import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
|
|
||||||
import org.broadinstitute.sting.utils.text.XReadLines
|
|
||||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
|
||||||
|
|
||||||
/**
 * In-process function that reads several interval files, groups overlapping intervals and
 * writes the intersection of every group that contains at least one interval from each file.
 *
 * @param iVals   interval files to intersect
 * @param outFile file the intersections are written to
 * @param bed     when true, whitespace-separated lines are shifted by one on both coordinates
 *                while parsing ("contig:start-stop" lines are never shifted)
 */
class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
  @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
  @Output(doc="Output interval file to which to write") val output : File = outFile
  @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
  @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed


  var outStream : PrintStream = _
  var contigs : List[String] = Nil
  var dict : SAMSequenceDictionary = _
  var parser : GenomeLocParser = _

  /**
   * Loads every file fully into memory, builds a sequence dictionary from the contig names
   * that occur, sorts all intervals together, folds them into overlap groups and writes the
   * non-empty intersections. The output stream is closed even when a step fails.
   */
  def run = {
    outStream = new PrintStream(output)
    try {
      dict = new SAMSequenceDictionary
      // note: memory hog
      val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
      // contig lengths are unknown here, so every contig is registered with the maximum length
      sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
      parser = new GenomeLocParser(dict)
      // each loc is tagged with the index of the file it came from
      val tagged : List[(GenomeLoc,Int)] = sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten
      val ordered : List[(GenomeLoc,Int)] = tagged.sortWith( (a,b) => (a._1 compareTo b._1) < 0 )
      val groups : List[List[(GenomeLoc,Int)]] = ordered.foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b))
      groups.map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
    } finally {
      outStream.close()
    }
  }

  /** Writes one loc per line using its string form. */
  def writeOut(g : GenomeLoc) : Unit = {
    outStream.print("%s%n".format(g.toString))
  }

  /**
   * Splits a line into (contig, start, stop). Lines containing ":" are read as
   * "contig:start-stop"; all others as whitespace-separated fields, shifted by one on both
   * ends when isBed is set.
   */
  def parse(s : String) : (String,Int,Int) = {
    if ( s.contains(":") ) {
      val split1 = s.split(":")
      val split2 = split1(1).split("-")
      return (split1(0),split2(0).toInt,split2(1).toInt)
    } else {
      val split = s.split("\\s+")
      return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
    }
  }

  def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
    parser.createGenomeLoc(coords._1,coords._2,coords._3)
  }

  /**
   * Fold step that adds b to the list of overlap groups built so far.
   * If b overlaps every member of the most recent group it joins that group; otherwise a
   * new group is started from the tail of the most recent group (leading members that do
   * not overlap b are dropped) plus b.
   */
  def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
    if ( a.isEmpty ) {
      // first interval of the fold: there is no previous group to inspect yet
      List(List(b))
    } else if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
      a.init :+ (a.last :+ b)
    } else {
      a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
    }
  }

  /**
   * Intersects all locs of a group.
   * @return the common region, or null when some input file contributes no loc to the group
   */
  def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
    // source tags are the 0-based file indices, so a complete group has one distinct tag per file
    if ( u.map(h => h._2).distinct.size != intervals.size ) { // if all sources not accounted for
      null
    } else {
      u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
    }
  }

  /** Inclusive integer range a..b; no longer used inside this class, kept for existing callers. */
  def range(a: Int, b: Int) : Range = new Range(a,b+1,1)

}
|
|
||||||
Binary file not shown.
|
|
@ -1,3 +1,3 @@
|
||||||
<ivy-module version="1.0">
|
<ivy-module version="1.0">
|
||||||
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.2" status="release" />
|
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.4rc3" status="release" />
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
Loading…
Reference in New Issue