Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
8710673a97
2
ivy.xml
2
ivy.xml
|
|
@ -76,7 +76,7 @@
|
|||
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
|
||||
|
||||
<!-- snpEff annotator for pipelines -->
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.2" />
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.4rc3" />
|
||||
|
||||
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
||||
<exclude org="javax.servlet" />
|
||||
|
|
|
|||
|
|
@ -554,4 +554,54 @@ public class GenomeLocParser {
|
|||
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size.
|
||||
* @param loc The original loc
|
||||
* @param maxBasePairs The maximum number of basePairs
|
||||
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
|
||||
*/
|
||||
@Requires({"loc != null", "maxBasePairs > 0"})
|
||||
public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
|
||||
if (GenomeLoc.isUnmapped(loc))
|
||||
return null;
|
||||
String contigName = loc.getContig();
|
||||
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
|
||||
int contigIndex = contig.getSequenceIndex();
|
||||
|
||||
int start = loc.getStart() - maxBasePairs;
|
||||
int stop = loc.getStart() - 1;
|
||||
|
||||
if (start < 1)
|
||||
start = 1;
|
||||
if (stop < 1)
|
||||
return null;
|
||||
|
||||
return createGenomeLoc(contigName, contigIndex, start, stop, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
|
||||
* @param loc The original loc
|
||||
* @param maxBasePairs The maximum number of basePairs
|
||||
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
|
||||
*/
|
||||
@Requires({"loc != null", "maxBasePairs > 0"})
|
||||
public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
|
||||
if (GenomeLoc.isUnmapped(loc))
|
||||
return null;
|
||||
String contigName = loc.getContig();
|
||||
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
|
||||
int contigIndex = contig.getSequenceIndex();
|
||||
int contigLength = contig.getSequenceLength();
|
||||
|
||||
int start = loc.getStop() + 1;
|
||||
int stop = loc.getStop() + maxBasePairs;
|
||||
|
||||
if (start > contigLength)
|
||||
return null;
|
||||
if (stop > contigLength)
|
||||
stop = contigLength;
|
||||
|
||||
return createGenomeLoc(contigName, contigIndex, start, stop, true);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -233,8 +233,12 @@ public class IntervalUtils {
|
|||
*
|
||||
* Returns a null string if there are no differences, otherwise returns a string describing the difference
|
||||
* (useful for UnitTests). Assumes both lists are sorted
|
||||
*
|
||||
* @param masterArg sorted master genome locs
|
||||
* @param testArg sorted test genome locs
|
||||
* @return null string if there are no difference, otherwise a string describing the difference
|
||||
*/
|
||||
public static final String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
||||
public static String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
||||
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
|
||||
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
|
||||
|
||||
|
|
@ -317,23 +321,6 @@ public class IntervalUtils {
|
|||
return lengths;
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
|
||||
* @param locs The genome locs.
|
||||
* @return The maximum number of parts the intervals can be split into.
|
||||
*/
|
||||
public static int countContigIntervals(List<GenomeLoc> locs) {
|
||||
int maxFiles = 0;
|
||||
String contig = null;
|
||||
for (GenomeLoc loc: locs) {
|
||||
if (contig == null || !contig.equals(loc.getContig())) {
|
||||
maxFiles++;
|
||||
contig = loc.getContig();
|
||||
}
|
||||
}
|
||||
return maxFiles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits an interval list into multiple files.
|
||||
* @param fileHeader The sam file header.
|
||||
|
|
@ -373,7 +360,6 @@ public class IntervalUtils {
|
|||
* @return A list of lists of genome locs, split according to splits
|
||||
*/
|
||||
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||
int locIndex = 1;
|
||||
int start = 0;
|
||||
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||
for (Integer stop: splits) {
|
||||
|
|
@ -465,7 +451,7 @@ public class IntervalUtils {
|
|||
|
||||
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
|
||||
@Ensures({"result != null"})
|
||||
final static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
||||
static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
||||
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
|
||||
long size = 0;
|
||||
|
||||
|
|
@ -579,10 +565,101 @@ public class IntervalUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static final long intervalSize(final List<GenomeLoc> locs) {
|
||||
public static long intervalSize(final List<GenomeLoc> locs) {
|
||||
long size = 0;
|
||||
for ( final GenomeLoc loc : locs )
|
||||
size += loc.size();
|
||||
return size;
|
||||
}
|
||||
|
||||
public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
|
||||
ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
|
||||
GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
|
||||
List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());
|
||||
|
||||
if (originalList.isEmpty())
|
||||
throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");
|
||||
|
||||
List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);
|
||||
|
||||
if (flankingList.isEmpty())
|
||||
throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");
|
||||
|
||||
SAMFileHeader samFileHeader = new SAMFileHeader();
|
||||
samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
|
||||
IntervalList intervalList = new IntervalList(samFileHeader);
|
||||
int i = 0;
|
||||
for (GenomeLoc loc: flankingList)
|
||||
intervalList.add(toInterval(loc, ++i));
|
||||
intervalList.write(flankingIntervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs.
|
||||
* @param parser A genome loc parser for creating the new intervals
|
||||
* @param locs Original genome locs
|
||||
* @param basePairs Number of base pairs on each side of loc
|
||||
* @return The list of intervals between the locs
|
||||
*/
|
||||
public static List<GenomeLoc> getFlankingIntervals(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
|
||||
List<GenomeLoc> sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList();
|
||||
|
||||
if (sorted.size() == 0)
|
||||
return Collections.emptyList();
|
||||
|
||||
LinkedHashMap<String, List<GenomeLoc>> locsByContig = splitByContig(sorted);
|
||||
List<GenomeLoc> expanded = new ArrayList<GenomeLoc>();
|
||||
for (String contig: locsByContig.keySet()) {
|
||||
List<GenomeLoc> contigLocs = locsByContig.get(contig);
|
||||
int contigLocsSize = contigLocs.size();
|
||||
|
||||
GenomeLoc startLoc, stopLoc;
|
||||
|
||||
// Create loc at start of the list
|
||||
startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs);
|
||||
if (startLoc != null)
|
||||
expanded.add(startLoc);
|
||||
|
||||
// Create locs between each loc[i] and loc[i+1]
|
||||
for (int i = 0; i < contigLocsSize - 1; i++) {
|
||||
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs);
|
||||
startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs);
|
||||
if (stopLoc.getStop() + 1 >= startLoc.getStart()) {
|
||||
// NOTE: This is different than GenomeLoc.merge()
|
||||
// merge() returns a loc which covers the entire range of stop and start,
|
||||
// possibly returning positions inside loc(i) or loc(i+1)
|
||||
// We want to make sure that the start of the stopLoc is used, and the stop of the startLoc
|
||||
GenomeLoc merged = parser.createGenomeLoc(
|
||||
stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop());
|
||||
expanded.add(merged);
|
||||
} else {
|
||||
expanded.add(stopLoc);
|
||||
expanded.add(startLoc);
|
||||
}
|
||||
}
|
||||
|
||||
// Create loc at the end of the list
|
||||
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs);
|
||||
if (stopLoc != null)
|
||||
expanded.add(stopLoc);
|
||||
}
|
||||
return expanded;
|
||||
}
|
||||
|
||||
private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(List<GenomeLoc> sorted) {
|
||||
LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<String, List<GenomeLoc>>();
|
||||
GenomeLoc last = null;
|
||||
List<GenomeLoc> contigLocs = null;
|
||||
for (GenomeLoc loc: sorted) {
|
||||
if (GenomeLoc.isUnmapped(loc))
|
||||
continue;
|
||||
if (last == null || !last.onSameContig(loc)) {
|
||||
contigLocs = new ArrayList<GenomeLoc>();
|
||||
splits.put(loc.getContig(), contigLocs);
|
||||
}
|
||||
contigLocs.add(loc);
|
||||
last = loc;
|
||||
}
|
||||
return splits;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
|
|||
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
|||
import static org.testng.Assert.assertEquals;
|
||||
import static org.testng.Assert.assertTrue;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
|
|
@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testGetContigIndexValid() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
||||
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
|
||||
}
|
||||
|
||||
|
|
@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testGetContigInfoKnownContig() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
||||
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
|
||||
}
|
||||
|
||||
|
|
@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
|
||||
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
|
||||
}
|
||||
|
||||
private static class FlankingGenomeLocTestData extends TestDataProvider {
|
||||
final GenomeLocParser parser;
|
||||
final int basePairs;
|
||||
final GenomeLoc original, flankStart, flankStop;
|
||||
|
||||
private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
|
||||
super(FlankingGenomeLocTestData.class, name);
|
||||
this.parser = parser;
|
||||
this.basePairs = basePairs;
|
||||
this.original = parse(parser, original);
|
||||
this.flankStart = flankStart == null ? null : parse(parser, flankStart);
|
||||
this.flankStop = flankStop == null ? null : parse(parser, flankStop);
|
||||
}
|
||||
|
||||
private static GenomeLoc parse(GenomeLocParser parser, String str) {
|
||||
return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "flankingGenomeLocs")
|
||||
public Object[][] getFlankingGenomeLocs() {
|
||||
int contigLength = 10000;
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
|
||||
GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
|
||||
new FlankingGenomeLocTestData("atStartBase1", parser, 1,
|
||||
"chr1:1", null, "chr1:2");
|
||||
|
||||
new FlankingGenomeLocTestData("atStartBase50", parser, 50,
|
||||
"chr1:1", null, "chr1:2-51");
|
||||
|
||||
new FlankingGenomeLocTestData("atStartRange50", parser, 50,
|
||||
"chr1:1-10", null, "chr1:11-60");
|
||||
|
||||
new FlankingGenomeLocTestData("atEndBase1", parser, 1,
|
||||
"chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
|
||||
|
||||
new FlankingGenomeLocTestData("atEndBase50", parser, 50,
|
||||
"chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
|
||||
|
||||
new FlankingGenomeLocTestData("atEndRange50", parser, 50,
|
||||
String.format("chr1:%d-%d", contigLength - 10, contigLength),
|
||||
String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
|
||||
null);
|
||||
|
||||
new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
|
||||
"chr1:2", "chr1:1", "chr1:3");
|
||||
|
||||
new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
|
||||
"chr1:21-30", "chr1:1-20", "chr1:31-80");
|
||||
|
||||
new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
|
||||
"chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
|
||||
|
||||
new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
|
||||
String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
|
||||
String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
|
||||
String.format("chr1:%d-%d", contigLength - 20, contigLength));
|
||||
|
||||
new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
|
||||
"chr1:3", "chr1:2", "chr1:4");
|
||||
|
||||
new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
|
||||
"chr1:101-200", "chr1:51-100", "chr1:201-250");
|
||||
|
||||
new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
|
||||
"chr1:" + (contigLength - 3),
|
||||
"chr1:" + (contigLength - 4),
|
||||
"chr1:" + (contigLength - 2));
|
||||
|
||||
new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
|
||||
String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
|
||||
String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
|
||||
String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));
|
||||
|
||||
new FlankingGenomeLocTestData("unmapped", parser, 50,
|
||||
"unmapped", null, null);
|
||||
|
||||
new FlankingGenomeLocTestData("fullContig", parser, 50,
|
||||
"chr1", null, null);
|
||||
|
||||
return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingGenomeLocs")
|
||||
public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) {
|
||||
GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.flankStart);
|
||||
assertEquals(actual, data.flankStart, description);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingGenomeLocs")
|
||||
public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) {
|
||||
GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.flankStop);
|
||||
assertEquals(actual, data.flankStop, description);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
package org.broadinstitute.sting.utils.interval;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.util.IntervalUtil;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
|
@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
List<GenomeLoc> merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
|
||||
Assert.assertEquals(merged.size(), 1);
|
||||
}
|
||||
|
||||
/*
|
||||
Split into tests that can be written to files and tested by writeFlankingIntervals,
|
||||
and lists that cannot but are still handled by getFlankingIntervals.
|
||||
*/
|
||||
private static abstract class FlankingIntervalsTestData extends TestDataProvider {
|
||||
final public File referenceFile;
|
||||
final public GenomeLocParser parser;
|
||||
final int basePairs;
|
||||
final List<GenomeLoc> original;
|
||||
final List<GenomeLoc> expected;
|
||||
|
||||
protected FlankingIntervalsTestData(Class<?> clazz, String name, File referenceFile, GenomeLocParser parser,
|
||||
int basePairs, List<String> original, List<String> expected) {
|
||||
super(clazz, name);
|
||||
this.referenceFile = referenceFile;
|
||||
this.parser = parser;
|
||||
this.basePairs = basePairs;
|
||||
this.original = parse(parser, original);
|
||||
this.expected = parse(parser, expected);
|
||||
}
|
||||
|
||||
private static List<GenomeLoc> parse(GenomeLocParser parser, List<String> locs) {
|
||||
List<GenomeLoc> parsed = new ArrayList<GenomeLoc>();
|
||||
for (String loc: locs)
|
||||
parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc));
|
||||
return parsed;
|
||||
}
|
||||
}
|
||||
|
||||
private static class FlankingIntervalsFile extends FlankingIntervalsTestData {
|
||||
public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser,
|
||||
int basePairs, List<String> original, List<String> expected) {
|
||||
super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected);
|
||||
}
|
||||
}
|
||||
|
||||
private static class FlankingIntervalsList extends FlankingIntervalsTestData {
|
||||
public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser,
|
||||
int basePairs, List<String> original, List<String> expected) {
|
||||
super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected);
|
||||
}
|
||||
}
|
||||
|
||||
/* Intervals where the original and the flanks can be written to files. */
|
||||
@DataProvider(name = "flankingIntervalsFiles")
|
||||
public Object[][] getFlankingIntervalsFiles() {
|
||||
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
|
||||
int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength();
|
||||
|
||||
new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:1"),
|
||||
Arrays.asList("1:2"));
|
||||
|
||||
new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:1"),
|
||||
Arrays.asList("1:2-51"));
|
||||
|
||||
new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:1-10"),
|
||||
Arrays.asList("1:11-60"));
|
||||
|
||||
new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:" + hg19Length1),
|
||||
Arrays.asList("1:" + (hg19Length1 - 1)));
|
||||
|
||||
new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:" + hg19Length1),
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1)));
|
||||
|
||||
new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)),
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11)));
|
||||
|
||||
new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:2"),
|
||||
Arrays.asList("1:1", "1:3"));
|
||||
|
||||
new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:21-30"),
|
||||
Arrays.asList("1:1-20", "1:31-80"));
|
||||
|
||||
new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:" + (hg19Length1 - 1)),
|
||||
Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1));
|
||||
|
||||
new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)),
|
||||
Arrays.asList(
|
||||
String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31),
|
||||
String.format("1:%d-%d", hg19Length1 - 20, hg19Length1)));
|
||||
|
||||
new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:3"),
|
||||
Arrays.asList("1:2", "1:4"));
|
||||
|
||||
new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200"),
|
||||
Arrays.asList("1:51-100", "1:201-250"));
|
||||
|
||||
new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:" + (hg19Length1 - 3)),
|
||||
Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2)));
|
||||
|
||||
new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)),
|
||||
Arrays.asList(
|
||||
String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201),
|
||||
String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51)));
|
||||
|
||||
new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:401-500"),
|
||||
Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550"));
|
||||
|
||||
new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:301-400"),
|
||||
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:271-400"),
|
||||
Arrays.asList("1:51-100", "1:201-270", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:221-400"),
|
||||
Arrays.asList("1:51-100", "1:201-220", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:121-400"),
|
||||
Arrays.asList("1:51-100", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "2:301-400"),
|
||||
Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450"));
|
||||
|
||||
// Explicit testing a problematic agilent target pair
|
||||
new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("2:74756257-74756411", "2:74756487-74756628"),
|
||||
// wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679")
|
||||
Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678"));
|
||||
|
||||
return TestDataProvider.getTests(FlankingIntervalsFile.class);
|
||||
}
|
||||
|
||||
/* Intervals where either the original and/or the flanks cannot be written to a file. */
|
||||
@DataProvider(name = "flankingIntervalsLists")
|
||||
public Object[][] getFlankingIntervalsLists() {
|
||||
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
|
||||
List<String> empty = Collections.emptyList();
|
||||
|
||||
new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
empty,
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("unmapped"),
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1"),
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1", "2", "3"),
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:301-400", "unmapped"),
|
||||
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
|
||||
|
||||
return TestDataProvider.getTests(FlankingIntervalsList.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingIntervalsFiles")
|
||||
public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
|
||||
File originalFile = createTempFile("original.", ".intervals");
|
||||
File flankingFile = createTempFile("flanking.", ".intervals");
|
||||
try {
|
||||
List<String> lines = new ArrayList<String>();
|
||||
for (GenomeLoc loc: data.original)
|
||||
lines.add(loc.toString());
|
||||
FileUtils.writeLines(originalFile, lines);
|
||||
|
||||
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
|
||||
|
||||
List<GenomeLoc> actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath());
|
||||
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.expected);
|
||||
Assert.assertEquals(actual, data.expected, description);
|
||||
} finally {
|
||||
FileUtils.deleteQuietly(originalFile);
|
||||
FileUtils.deleteQuietly(flankingFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class)
|
||||
public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
|
||||
File originalFile = createTempFile("original.", ".intervals");
|
||||
File flankingFile = createTempFile("flanking.", ".intervals");
|
||||
try {
|
||||
List<String> lines = new ArrayList<String>();
|
||||
for (GenomeLoc loc: data.original)
|
||||
lines.add(loc.toString());
|
||||
FileUtils.writeLines(originalFile, lines);
|
||||
|
||||
// Should throw a user exception on bad input if either the original
|
||||
// intervals are empty or if the flanking intervals are empty
|
||||
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
|
||||
} finally {
|
||||
FileUtils.deleteQuietly(originalFile);
|
||||
FileUtils.deleteQuietly(flankingFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingIntervalsLists")
|
||||
public void testGetFlankingIntervals(FlankingIntervalsTestData data) {
|
||||
List<GenomeLoc> actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs);
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.expected);
|
||||
Assert.assertEquals(actual, data.expected, description);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.queue.extensions.gatk
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import org.broadinstitute.sting.commandline.{Output, Argument, Input}
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
|
||||
class WriteFlankingIntervalsFunction extends InProcessFunction {
|
||||
@Input(doc="The reference sequence")
|
||||
var reference : File = _
|
||||
|
||||
@Input(doc="The interval list to flank")
|
||||
var inputIntervals : File = _
|
||||
|
||||
@Output(doc="The output intervals file to write to")
|
||||
var outputIntervals: File = _
|
||||
|
||||
@Argument(doc="Number of base pair to flank the input intervals")
|
||||
var flankSize : Int = _
|
||||
|
||||
def run() {
|
||||
IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,135 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import java.io.{PrintStream, File}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
import collection.immutable.TreeSet
|
||||
|
||||
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
|
||||
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
|
||||
@Input(doc="The interval list to expand") val inList : File = in
|
||||
@Input(doc="The reference sequence") val refDict : File = ref
|
||||
@Argument(doc="Number of basepair to start the expanded interval") val startInt : Int = start
|
||||
@Argument(doc="Number of baispair to stop the expanded interval") val sizeInt : Int = size
|
||||
@Output(doc="The output intervals file to write to") val outList : File = out
|
||||
@Argument(doc="The output format for the intervals") val outTypeStr = opType
|
||||
@Argument(doc="The input format for the intervals") val inTypeStr = ipType
|
||||
|
||||
var output : PrintStream = _
|
||||
var parser : GenomeLocParser = _
|
||||
var xrl : XReadLines = _
|
||||
val outType = IntervalFormatType.convert(outTypeStr)
|
||||
val inType = IntervalFormatType.convert(inTypeStr)
|
||||
|
||||
var offsetIn : Int = 0
|
||||
var offsetOut : Int = 0
|
||||
|
||||
var first : Boolean = true
|
||||
var lastTwo : (GenomeLoc,GenomeLoc) = _
|
||||
|
||||
var intervalCache : TreeSet[GenomeLoc] = _
|
||||
val LINES_TO_CACHE : Int = 1000
|
||||
|
||||
def run = {
|
||||
output = new PrintStream(outList)
|
||||
intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
|
||||
def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
|
||||
})
|
||||
parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
|
||||
xrl = new XReadLines(inList)
|
||||
offsetIn = if (isBed(inType)) 1 else 0
|
||||
offsetOut = if( isBed(outType)) 1 else 0
|
||||
var line : String = xrl.next
|
||||
while ( line.startsWith("@") ) {
|
||||
line = xrl.next
|
||||
}
|
||||
var prevLoc: GenomeLoc = null
|
||||
var curLoc: GenomeLoc = null
|
||||
var nextLoc : GenomeLoc = parseGenomeInterval(line)
|
||||
var linesProcessed : Int = 1
|
||||
while ( prevLoc != null || curLoc != null || nextLoc != null ) {
|
||||
prevLoc = curLoc
|
||||
curLoc = nextLoc
|
||||
nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
|
||||
if ( curLoc != null ) {
|
||||
val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc)
|
||||
val right: GenomeLoc = refine(expandRight(curLoc),nextLoc)
|
||||
if ( left != null ) {
|
||||
intervalCache += left
|
||||
}
|
||||
if ( right != null ) {
|
||||
intervalCache += right
|
||||
}
|
||||
}
|
||||
linesProcessed += 1
|
||||
if ( linesProcessed % LINES_TO_CACHE == 0 ) {
|
||||
val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
|
||||
intervalCache = intervalCache -- toPrint
|
||||
toPrint.foreach(u => output.print("%s%n".format(repr(u))))
|
||||
}
|
||||
//System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
|
||||
}
|
||||
|
||||
intervalCache.foreach(u => output.print("%s%n".format(repr(u))))
|
||||
|
||||
output.close()
|
||||
}
|
||||
|
||||
def expandLeft(g: GenomeLoc) : GenomeLoc = {
|
||||
parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
|
||||
}
|
||||
|
||||
def expandRight(g: GenomeLoc) : GenomeLoc = {
|
||||
parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
|
||||
}
|
||||
|
||||
def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
|
||||
if ( borderG == null || ! newG.overlapsP(borderG) ) {
|
||||
return newG
|
||||
} else {
|
||||
if ( newG.getStart < borderG.getStart ) {
|
||||
if ( borderG.getStart - startInt > newG.getStart ) {
|
||||
return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
|
||||
}
|
||||
} else {
|
||||
if ( borderG.getStop + startInt < newG.getStop ){
|
||||
return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
null
|
||||
}
|
||||
|
||||
def repr(loc : GenomeLoc) : String = {
|
||||
if ( loc == null ) return "null"
|
||||
if ( outType == IntervalFormatType.INTERVALS ) {
|
||||
return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
|
||||
} else {
|
||||
return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
|
||||
}
|
||||
}
|
||||
|
||||
def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
|
||||
t == IntervalFormatType.BED
|
||||
}
|
||||
|
||||
def parseGenomeInterval( s : String ) : GenomeLoc = {
|
||||
val sp = s.split("\\s+")
|
||||
// todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
|
||||
if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
|
||||
}
|
||||
|
||||
object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
|
||||
type IntervalFormatType = Value
|
||||
val INTERVALS,BED,TDF = Value
|
||||
|
||||
def convert(s : String) : IntervalFormatType = {
|
||||
if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import java.io.{PrintStream, File}
|
||||
import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
|
||||
class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
|
||||
@Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
|
||||
@Output(doc="Output interval file to which to write") val output : File = outFile
|
||||
@Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
|
||||
@Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed
|
||||
|
||||
|
||||
var outStream : PrintStream = _
|
||||
var contigs : List[String] = Nil
|
||||
var dict : SAMSequenceDictionary = _
|
||||
var parser : GenomeLocParser = _
|
||||
|
||||
def run = {
|
||||
outStream = new PrintStream(output)
|
||||
dict = new SAMSequenceDictionary
|
||||
// note: memory hog
|
||||
val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
|
||||
sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
|
||||
parser = new GenomeLocParser(dict)
|
||||
sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
|
||||
outStream.close()
|
||||
}
|
||||
|
||||
def writeOut(g : GenomeLoc) : Unit = {
|
||||
outStream.print("%s%n".format(g.toString))
|
||||
}
|
||||
|
||||
def parse(s : String) : (String,Int,Int) = {
|
||||
if ( s.contains(":") ) {
|
||||
val split1 = s.split(":")
|
||||
val split2 = split1(1).split("-")
|
||||
return (split1(0),split2(0).toInt,split2(1).toInt)
|
||||
} else {
|
||||
val split = s.split("\\s+")
|
||||
return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
|
||||
}
|
||||
}
|
||||
|
||||
def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
|
||||
parser.createGenomeLoc(coords._1,coords._2,coords._3)
|
||||
}
|
||||
|
||||
def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
|
||||
if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
|
||||
a.init :+ (a.last :+ b)
|
||||
} else {
|
||||
a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
|
||||
}
|
||||
}
|
||||
|
||||
def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
|
||||
if ( u.map(h => h._2).distinct.sum != range(1,intervals.size).sum ) { // if all sources not accounted for
|
||||
null
|
||||
}
|
||||
u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
|
||||
}
|
||||
|
||||
def range(a: Int, b: Int) : Range = new Range(a,b+1,1)
|
||||
|
||||
}
|
||||
Binary file not shown.
|
|
@ -1,3 +1,3 @@
|
|||
<ivy-module version="1.0">
|
||||
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.2" status="release" />
|
||||
<info organisation="net.sf.snpeff" module="snpeff" revision="2.0.4rc3" status="release" />
|
||||
</ivy-module>
|
||||
Loading…
Reference in New Issue