During flanking interval creation merging overlapping flanks so that on scatter the list doesn't accidentally genotype the same site twice.
Moved flanking interval utilies to IntervalUtils with UnitTests.
This commit is contained in:
parent
bad19779b9
commit
c50274e02e
|
|
@ -554,4 +554,54 @@ public class GenomeLocParser {
|
|||
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a loc to the left (starting at the loc start + 1) of maxBasePairs size.
|
||||
* @param loc The original loc
|
||||
* @param maxBasePairs The maximum number of basePairs
|
||||
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
|
||||
*/
|
||||
@Requires({"loc != null", "maxBasePairs > 0"})
|
||||
public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
|
||||
if (GenomeLoc.isUnmapped(loc))
|
||||
return null;
|
||||
String contigName = loc.getContig();
|
||||
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
|
||||
int contigIndex = contig.getSequenceIndex();
|
||||
|
||||
int start = loc.getStart() - maxBasePairs;
|
||||
int stop = loc.getStart() - 1;
|
||||
|
||||
if (start < 1)
|
||||
start = 1;
|
||||
if (stop < 1)
|
||||
return null;
|
||||
|
||||
return createGenomeLoc(contigName, contigIndex, start, stop, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a loc to the right (starting at the loc stop + 1) of maxBasePairs size.
|
||||
* @param loc The original loc
|
||||
* @param maxBasePairs The maximum number of basePairs
|
||||
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
|
||||
*/
|
||||
@Requires({"loc != null", "maxBasePairs > 0"})
|
||||
public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
|
||||
if (GenomeLoc.isUnmapped(loc))
|
||||
return null;
|
||||
String contigName = loc.getContig();
|
||||
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
|
||||
int contigIndex = contig.getSequenceIndex();
|
||||
int contigLength = contig.getSequenceLength();
|
||||
|
||||
int start = loc.getStop() + 1;
|
||||
int stop = loc.getStop() + maxBasePairs;
|
||||
|
||||
if (start > contigLength)
|
||||
return null;
|
||||
if (stop > contigLength)
|
||||
stop = contigLength;
|
||||
|
||||
return createGenomeLoc(contigName, contigIndex, start, stop, true);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -233,8 +233,12 @@ public class IntervalUtils {
|
|||
*
|
||||
* Returns a null string if there are no differences, otherwise returns a string describing the difference
|
||||
* (useful for UnitTests). Assumes both lists are sorted
|
||||
*
|
||||
* @param masterArg sorted master genome locs
|
||||
* @param testArg sorted test genome locs
|
||||
* @return null string if there are no difference, otherwise a string describing the difference
|
||||
*/
|
||||
public static final String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
||||
public static String equateIntervals(List<GenomeLoc> masterArg, List<GenomeLoc> testArg) {
|
||||
LinkedList<GenomeLoc> master = new LinkedList<GenomeLoc>(masterArg);
|
||||
LinkedList<GenomeLoc> test = new LinkedList<GenomeLoc>(testArg);
|
||||
|
||||
|
|
@ -317,23 +321,6 @@ public class IntervalUtils {
|
|||
return lengths;
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
|
||||
* @param locs The genome locs.
|
||||
* @return The maximum number of parts the intervals can be split into.
|
||||
*/
|
||||
public static int countContigIntervals(List<GenomeLoc> locs) {
|
||||
int maxFiles = 0;
|
||||
String contig = null;
|
||||
for (GenomeLoc loc: locs) {
|
||||
if (contig == null || !contig.equals(loc.getContig())) {
|
||||
maxFiles++;
|
||||
contig = loc.getContig();
|
||||
}
|
||||
}
|
||||
return maxFiles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits an interval list into multiple files.
|
||||
* @param fileHeader The sam file header.
|
||||
|
|
@ -373,7 +360,6 @@ public class IntervalUtils {
|
|||
* @return A list of lists of genome locs, split according to splits
|
||||
*/
|
||||
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
|
||||
int locIndex = 1;
|
||||
int start = 0;
|
||||
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
|
||||
for (Integer stop: splits) {
|
||||
|
|
@ -465,7 +451,7 @@ public class IntervalUtils {
|
|||
|
||||
@Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"})
|
||||
@Ensures({"result != null"})
|
||||
final static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
||||
static SplitLocusRecursive splitLocusIntervals1(LinkedList<GenomeLoc> remaining, long idealSplitSize) {
|
||||
final List<GenomeLoc> split = new ArrayList<GenomeLoc>();
|
||||
long size = 0;
|
||||
|
||||
|
|
@ -579,10 +565,101 @@ public class IntervalUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static final long intervalSize(final List<GenomeLoc> locs) {
|
||||
public static long intervalSize(final List<GenomeLoc> locs) {
|
||||
long size = 0;
|
||||
for ( final GenomeLoc loc : locs )
|
||||
size += loc.size();
|
||||
return size;
|
||||
}
|
||||
|
||||
public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
|
||||
ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
|
||||
GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
|
||||
List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());
|
||||
|
||||
if (originalList.isEmpty())
|
||||
throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");
|
||||
|
||||
List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);
|
||||
|
||||
if (flankingList.isEmpty())
|
||||
throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");
|
||||
|
||||
SAMFileHeader samFileHeader = new SAMFileHeader();
|
||||
samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
|
||||
IntervalList intervalList = new IntervalList(samFileHeader);
|
||||
int i = 0;
|
||||
for (GenomeLoc loc: flankingList)
|
||||
intervalList.add(toInterval(loc, ++i));
|
||||
intervalList.write(flankingIntervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs.
|
||||
* @param parser A genome loc parser for creating the new intervals
|
||||
* @param locs Original genome locs
|
||||
* @param basePairs Number of base pairs on each side of loc
|
||||
* @return The list of intervals between the locs
|
||||
*/
|
||||
public static List<GenomeLoc> getFlankingIntervals(final GenomeLocParser parser, final List<GenomeLoc> locs, final int basePairs) {
|
||||
List<GenomeLoc> sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList();
|
||||
|
||||
if (sorted.size() == 0)
|
||||
return Collections.emptyList();
|
||||
|
||||
LinkedHashMap<String, List<GenomeLoc>> locsByContig = splitByContig(sorted);
|
||||
List<GenomeLoc> expanded = new ArrayList<GenomeLoc>();
|
||||
for (String contig: locsByContig.keySet()) {
|
||||
List<GenomeLoc> contigLocs = locsByContig.get(contig);
|
||||
int contigLocsSize = contigLocs.size();
|
||||
|
||||
GenomeLoc startLoc, stopLoc;
|
||||
|
||||
// Create loc at start of the list
|
||||
startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs);
|
||||
if (startLoc != null)
|
||||
expanded.add(startLoc);
|
||||
|
||||
// Create locs between each loc[i] and loc[i+1]
|
||||
for (int i = 0; i < contigLocsSize - 1; i++) {
|
||||
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs);
|
||||
startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs);
|
||||
if (stopLoc.getStop() + 1 >= startLoc.getStart()) {
|
||||
// NOTE: This is different than GenomeLoc.merge()
|
||||
// merge() returns a loc which covers the entire range of stop and start,
|
||||
// possibly returning positions inside loc(i) or loc(i+1)
|
||||
// We want to make sure that the start of the stopLoc is used, and the stop of the startLoc
|
||||
GenomeLoc merged = parser.createGenomeLoc(
|
||||
stopLoc.getContig(), stopLoc.getStart(), startLoc.getStop());
|
||||
expanded.add(merged);
|
||||
} else {
|
||||
expanded.add(stopLoc);
|
||||
expanded.add(startLoc);
|
||||
}
|
||||
}
|
||||
|
||||
// Create loc at the end of the list
|
||||
stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs);
|
||||
if (stopLoc != null)
|
||||
expanded.add(stopLoc);
|
||||
}
|
||||
return expanded;
|
||||
}
|
||||
|
||||
private static LinkedHashMap<String, List<GenomeLoc>> splitByContig(List<GenomeLoc> sorted) {
|
||||
LinkedHashMap<String, List<GenomeLoc>> splits = new LinkedHashMap<String, List<GenomeLoc>>();
|
||||
GenomeLoc last = null;
|
||||
List<GenomeLoc> contigLocs = null;
|
||||
for (GenomeLoc loc: sorted) {
|
||||
if (GenomeLoc.isUnmapped(loc))
|
||||
continue;
|
||||
if (last == null || !last.onSameContig(loc)) {
|
||||
contigLocs = new ArrayList<GenomeLoc>();
|
||||
splits.put(loc.getContig(), contigLocs);
|
||||
}
|
||||
contigLocs.add(loc);
|
||||
last = loc;
|
||||
}
|
||||
return splits;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils;
|
|||
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -11,6 +10,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
|||
import static org.testng.Assert.assertEquals;
|
||||
import static org.testng.Assert.assertTrue;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
|
|
@ -36,7 +36,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testGetContigIndexValid() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
||||
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
|
||||
}
|
||||
|
||||
|
|
@ -67,7 +66,6 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testGetContigInfoKnownContig() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
|
||||
assertEquals(0, "chr1".compareTo(genomeLocParser.getContigInfo("chr1").getSequenceName())); // should be in the reference
|
||||
}
|
||||
|
||||
|
|
@ -191,4 +189,104 @@ public class GenomeLocParserUnitTest extends BaseTest {
|
|||
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
|
||||
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
|
||||
}
|
||||
|
||||
private static class FlankingGenomeLocTestData extends TestDataProvider {
|
||||
final GenomeLocParser parser;
|
||||
final int basePairs;
|
||||
final GenomeLoc original, flankStart, flankStop;
|
||||
|
||||
private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
|
||||
super(FlankingGenomeLocTestData.class, name);
|
||||
this.parser = parser;
|
||||
this.basePairs = basePairs;
|
||||
this.original = parse(parser, original);
|
||||
this.flankStart = flankStart == null ? null : parse(parser, flankStart);
|
||||
this.flankStop = flankStop == null ? null : parse(parser, flankStop);
|
||||
}
|
||||
|
||||
private static GenomeLoc parse(GenomeLocParser parser, String str) {
|
||||
return "unmapped".equals(str) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(str);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "flankingGenomeLocs")
|
||||
public Object[][] getFlankingGenomeLocs() {
|
||||
int contigLength = 10000;
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
|
||||
GenomeLocParser parser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
|
||||
new FlankingGenomeLocTestData("atStartBase1", parser, 1,
|
||||
"chr1:1", null, "chr1:2");
|
||||
|
||||
new FlankingGenomeLocTestData("atStartBase50", parser, 50,
|
||||
"chr1:1", null, "chr1:2-51");
|
||||
|
||||
new FlankingGenomeLocTestData("atStartRange50", parser, 50,
|
||||
"chr1:1-10", null, "chr1:11-60");
|
||||
|
||||
new FlankingGenomeLocTestData("atEndBase1", parser, 1,
|
||||
"chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
|
||||
|
||||
new FlankingGenomeLocTestData("atEndBase50", parser, 50,
|
||||
"chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
|
||||
|
||||
new FlankingGenomeLocTestData("atEndRange50", parser, 50,
|
||||
String.format("chr1:%d-%d", contigLength - 10, contigLength),
|
||||
String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
|
||||
null);
|
||||
|
||||
new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
|
||||
"chr1:2", "chr1:1", "chr1:3");
|
||||
|
||||
new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
|
||||
"chr1:21-30", "chr1:1-20", "chr1:31-80");
|
||||
|
||||
new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
|
||||
"chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
|
||||
|
||||
new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
|
||||
String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
|
||||
String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
|
||||
String.format("chr1:%d-%d", contigLength - 20, contigLength));
|
||||
|
||||
new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
|
||||
"chr1:3", "chr1:2", "chr1:4");
|
||||
|
||||
new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
|
||||
"chr1:101-200", "chr1:51-100", "chr1:201-250");
|
||||
|
||||
new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
|
||||
"chr1:" + (contigLength - 3),
|
||||
"chr1:" + (contigLength - 4),
|
||||
"chr1:" + (contigLength - 2));
|
||||
|
||||
new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
|
||||
String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
|
||||
String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
|
||||
String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));
|
||||
|
||||
new FlankingGenomeLocTestData("unmapped", parser, 50,
|
||||
"unmapped", null, null);
|
||||
|
||||
new FlankingGenomeLocTestData("fullContig", parser, 50,
|
||||
"chr1", null, null);
|
||||
|
||||
return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingGenomeLocs")
|
||||
public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) {
|
||||
GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.flankStart);
|
||||
assertEquals(actual, data.flankStart, description);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingGenomeLocs")
|
||||
public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) {
|
||||
GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.flankStop);
|
||||
assertEquals(actual, data.flankStop, description);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
package org.broadinstitute.sting.utils.interval;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.util.IntervalUtil;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
|
@ -762,4 +762,225 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
List<GenomeLoc> merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
|
||||
Assert.assertEquals(merged.size(), 1);
|
||||
}
|
||||
|
||||
/*
|
||||
Split into tests that can be written to files and tested by writeFlankingIntervals,
|
||||
and lists that cannot but are still handled by getFlankingIntervals.
|
||||
*/
|
||||
private static abstract class FlankingIntervalsTestData extends TestDataProvider {
|
||||
final public File referenceFile;
|
||||
final public GenomeLocParser parser;
|
||||
final int basePairs;
|
||||
final List<GenomeLoc> original;
|
||||
final List<GenomeLoc> expected;
|
||||
|
||||
protected FlankingIntervalsTestData(Class<?> clazz, String name, File referenceFile, GenomeLocParser parser,
|
||||
int basePairs, List<String> original, List<String> expected) {
|
||||
super(clazz, name);
|
||||
this.referenceFile = referenceFile;
|
||||
this.parser = parser;
|
||||
this.basePairs = basePairs;
|
||||
this.original = parse(parser, original);
|
||||
this.expected = parse(parser, expected);
|
||||
}
|
||||
|
||||
private static List<GenomeLoc> parse(GenomeLocParser parser, List<String> locs) {
|
||||
List<GenomeLoc> parsed = new ArrayList<GenomeLoc>();
|
||||
for (String loc: locs)
|
||||
parsed.add("unmapped".equals(loc) ? GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc));
|
||||
return parsed;
|
||||
}
|
||||
}
|
||||
|
||||
private static class FlankingIntervalsFile extends FlankingIntervalsTestData {
|
||||
public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser,
|
||||
int basePairs, List<String> original, List<String> expected) {
|
||||
super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected);
|
||||
}
|
||||
}
|
||||
|
||||
private static class FlankingIntervalsList extends FlankingIntervalsTestData {
|
||||
public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser,
|
||||
int basePairs, List<String> original, List<String> expected) {
|
||||
super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected);
|
||||
}
|
||||
}
|
||||
|
||||
/* Intervals where the original and the flanks can be written to files. */
|
||||
@DataProvider(name = "flankingIntervalsFiles")
|
||||
public Object[][] getFlankingIntervalsFiles() {
|
||||
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
|
||||
int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength();
|
||||
|
||||
new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:1"),
|
||||
Arrays.asList("1:2"));
|
||||
|
||||
new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:1"),
|
||||
Arrays.asList("1:2-51"));
|
||||
|
||||
new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:1-10"),
|
||||
Arrays.asList("1:11-60"));
|
||||
|
||||
new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:" + hg19Length1),
|
||||
Arrays.asList("1:" + (hg19Length1 - 1)));
|
||||
|
||||
new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:" + hg19Length1),
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1)));
|
||||
|
||||
new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, hg19Length1)),
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11)));
|
||||
|
||||
new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:2"),
|
||||
Arrays.asList("1:1", "1:3"));
|
||||
|
||||
new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:21-30"),
|
||||
Arrays.asList("1:1-20", "1:31-80"));
|
||||
|
||||
new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:" + (hg19Length1 - 1)),
|
||||
Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1));
|
||||
|
||||
new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)),
|
||||
Arrays.asList(
|
||||
String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31),
|
||||
String.format("1:%d-%d", hg19Length1 - 20, hg19Length1)));
|
||||
|
||||
new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:3"),
|
||||
Arrays.asList("1:2", "1:4"));
|
||||
|
||||
new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200"),
|
||||
Arrays.asList("1:51-100", "1:201-250"));
|
||||
|
||||
new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1,
|
||||
Arrays.asList("1:" + (hg19Length1 - 3)),
|
||||
Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2)));
|
||||
|
||||
new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)),
|
||||
Arrays.asList(
|
||||
String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201),
|
||||
String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51)));
|
||||
|
||||
new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:401-500"),
|
||||
Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550"));
|
||||
|
||||
new FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:301-400"),
|
||||
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:271-400"),
|
||||
Arrays.asList("1:51-100", "1:201-270", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:221-400"),
|
||||
Arrays.asList("1:51-100", "1:201-220", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:121-400"),
|
||||
Arrays.asList("1:51-100", "1:401-450"));
|
||||
|
||||
new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "2:301-400"),
|
||||
Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450"));
|
||||
|
||||
// Explicit testing a problematic agilent target pair
|
||||
new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("2:74756257-74756411", "2:74756487-74756628"),
|
||||
// wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679")
|
||||
Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678"));
|
||||
|
||||
return TestDataProvider.getTests(FlankingIntervalsFile.class);
|
||||
}
|
||||
|
||||
/* Intervals where either the original and/or the flanks cannot be written to a file. */
|
||||
@DataProvider(name = "flankingIntervalsLists")
|
||||
public Object[][] getFlankingIntervalsLists() {
|
||||
File hg19ReferenceFile = new File(BaseTest.hg19Reference);
|
||||
List<String> empty = Collections.emptyList();
|
||||
|
||||
new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
empty,
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("unmapped"),
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1"),
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1", "2", "3"),
|
||||
empty);
|
||||
|
||||
new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50,
|
||||
Arrays.asList("1:101-200", "1:301-400", "unmapped"),
|
||||
Arrays.asList("1:51-100", "1:201-300", "1:401-450"));
|
||||
|
||||
return TestDataProvider.getTests(FlankingIntervalsList.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingIntervalsFiles")
|
||||
public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
|
||||
File originalFile = createTempFile("original.", ".intervals");
|
||||
File flankingFile = createTempFile("flanking.", ".intervals");
|
||||
try {
|
||||
List<String> lines = new ArrayList<String>();
|
||||
for (GenomeLoc loc: data.original)
|
||||
lines.add(loc.toString());
|
||||
FileUtils.writeLines(originalFile, lines);
|
||||
|
||||
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
|
||||
|
||||
List<GenomeLoc> actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath());
|
||||
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.expected);
|
||||
Assert.assertEquals(actual, data.expected, description);
|
||||
} finally {
|
||||
FileUtils.deleteQuietly(originalFile);
|
||||
FileUtils.deleteQuietly(flankingFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class)
|
||||
public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception {
|
||||
File originalFile = createTempFile("original.", ".intervals");
|
||||
File flankingFile = createTempFile("flanking.", ".intervals");
|
||||
try {
|
||||
List<String> lines = new ArrayList<String>();
|
||||
for (GenomeLoc loc: data.original)
|
||||
lines.add(loc.toString());
|
||||
FileUtils.writeLines(originalFile, lines);
|
||||
|
||||
// Should throw a user exception on bad input if either the original
|
||||
// intervals are empty or if the flanking intervals are empty
|
||||
IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs);
|
||||
} finally {
|
||||
FileUtils.deleteQuietly(originalFile);
|
||||
FileUtils.deleteQuietly(flankingFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "flankingIntervalsLists")
|
||||
public void testGetFlankingIntervals(FlankingIntervalsTestData data) {
|
||||
List<GenomeLoc> actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs);
|
||||
String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
|
||||
data.toString(), data.original, actual, data.expected);
|
||||
Assert.assertEquals(actual, data.expected, description);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.queue.extensions.gatk
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import org.broadinstitute.sting.commandline.{Output, Argument, Input}
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
|
||||
class WriteFlankingIntervalsFunction extends InProcessFunction {
|
||||
@Input(doc="The reference sequence")
|
||||
var reference : File = _
|
||||
|
||||
@Input(doc="The interval list to flank")
|
||||
var inputIntervals : File = _
|
||||
|
||||
@Output(doc="The output intervals file to write to")
|
||||
var outputIntervals: File = _
|
||||
|
||||
@Argument(doc="Number of base pair to flank the input intervals")
|
||||
var flankSize : Int = _
|
||||
|
||||
def run() {
|
||||
IntervalUtils.writeFlankingIntervals(reference, inputIntervals, outputIntervals, flankSize)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,135 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import java.io.{PrintStream, File}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
import collection.immutable.TreeSet
|
||||
|
||||
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
|
||||
// todo -- this is unsafe. Need to use a reference dictionary to ensure no off-contig targets are created
/**
 * In-process function that reads an interval list and emits, for every input
 * interval, a left flank and a right flank each offset by `start` bp and of
 * length `size` bp. Flanks are trimmed against the neighboring input
 * intervals (see refine) and collected in a sorted set so that overlapping
 * flanks merge-by-dedup and the same site is never written twice. Output is
 * streamed: intervals that can no longer be affected by upcoming input are
 * flushed every LINES_TO_CACHE lines.
 */
class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ipType: String, opType: String) extends InProcessFunction {
  @Input(doc="The interval list to expand") val inList : File = in
  @Input(doc="The reference sequence") val refDict : File = ref
  // NOTE: typos fixed in the two help strings below ("basepair"/"baispair")
  @Argument(doc="Number of basepairs from the interval at which the expanded interval starts") val startInt : Int = start
  @Argument(doc="Number of basepairs in the expanded interval") val sizeInt : Int = size
  @Output(doc="The output intervals file to write to") val outList : File = out
  @Argument(doc="The output format for the intervals") val outTypeStr = opType
  @Argument(doc="The input format for the intervals") val inTypeStr = ipType

  var output : PrintStream = _
  var parser : GenomeLocParser = _
  var xrl : XReadLines = _
  val outType = IntervalFormatType.convert(outTypeStr)
  val inType = IntervalFormatType.convert(inTypeStr)

  // BED is 0-based half-open; these offsets convert to/from 1-based closed coords.
  var offsetIn : Int = 0
  var offsetOut : Int = 0

  var first : Boolean = true
  var lastTwo : (GenomeLoc,GenomeLoc) = _

  // Sorted buffer of flanks not yet safe to print (an upcoming interval
  // could still produce a duplicate flank at the same site).
  var intervalCache : TreeSet[GenomeLoc] = _
  val LINES_TO_CACHE : Int = 1000

  def run = {
    output = new PrintStream(outList)
    intervalCache = new TreeSet[GenomeLoc]()(new Ordering[GenomeLoc]{
      def compare(o1: GenomeLoc, o2: GenomeLoc) : Int = { o1.compareTo(o2) }
    })
    parser = new GenomeLocParser(new FastaSequenceFile(ref,true))
    xrl = new XReadLines(inList)
    offsetIn = if (isBed(inType)) 1 else 0
    offsetOut = if( isBed(outType)) 1 else 0
    // Skip any SAM-style "@" header lines at the top of the interval list.
    var line : String = xrl.next
    while ( line.startsWith("@") ) {
      line = xrl.next
    }
    // Sliding three-interval window: flanks of curLoc are trimmed against
    // its neighbors prevLoc and nextLoc.
    var prevLoc: GenomeLoc = null
    var curLoc: GenomeLoc = null
    var nextLoc : GenomeLoc = parseGenomeInterval(line)
    var linesProcessed : Int = 1
    while ( prevLoc != null || curLoc != null || nextLoc != null ) {
      prevLoc = curLoc
      curLoc = nextLoc
      nextLoc = if ( xrl.hasNext ) parseGenomeInterval(xrl.next) else null
      if ( curLoc != null ) {
        val left: GenomeLoc = refine(expandLeft(curLoc),prevLoc)
        val right: GenomeLoc = refine(expandRight(curLoc),nextLoc)
        if ( left != null ) {
          intervalCache += left
        }
        if ( right != null ) {
          intervalCache += right
        }
      }
      linesProcessed += 1
      // Periodically flush flanks that lie far enough behind prevLoc that no
      // later flank can coincide with them. Guard against prevLoc == null
      // (first iteration and while draining the tail), which would NPE in
      // the filter predicate; skipping the flush only defers printing to
      // the final drain below, so output is unchanged.
      if ( prevLoc != null && linesProcessed % LINES_TO_CACHE == 0 ) {
        val toPrint = intervalCache.filter( u => (u.isBefore(prevLoc) && u.distance(prevLoc) > startInt+sizeInt))
        intervalCache = intervalCache -- toPrint
        toPrint.foreach(u => output.print("%s%n".format(repr(u))))
      }
      //System.out.printf("%s".format(if ( curLoc == null ) "null" else repr(curLoc)))
    }

    // Drain whatever is still buffered.
    intervalCache.foreach(u => output.print("%s%n".format(repr(u))))

    output.close()
  }

  /** Flank to the left of g: [start-startInt-sizeInt, start-startInt]. */
  def expandLeft(g: GenomeLoc) : GenomeLoc = {
    parser.createGenomeLoc(g.getContig,g.getStart-startInt-sizeInt,g.getStart-startInt)
  }

  /** Flank to the right of g: [stop+startInt, stop+startInt+sizeInt]. */
  def expandRight(g: GenomeLoc) : GenomeLoc = {
    parser.createGenomeLoc(g.getContig,g.getStop+startInt,g.getStop+startInt+sizeInt)
  }

  /**
   * Trims newG so it keeps the required startInt clearance from the
   * neighboring input interval borderG. Returns newG untouched when there is
   * no overlap, a clipped loc when clipping leaves a non-empty interval,
   * or null when the flank is swallowed entirely.
   */
  def refine(newG: GenomeLoc, borderG: GenomeLoc) : GenomeLoc = {
    if ( borderG == null || ! newG.overlapsP(borderG) ) {
      return newG
    } else {
      if ( newG.getStart < borderG.getStart ) {
        // newG hangs off the left of borderG: clip its right edge.
        if ( borderG.getStart - startInt > newG.getStart ) {
          return parser.createGenomeLoc(newG.getContig,newG.getStart,borderG.getStart-startInt)
        }
      } else {
        // newG hangs off the right of borderG: clip its left edge.
        if ( borderG.getStop + startInt < newG.getStop ){
          return parser.createGenomeLoc(newG.getContig,borderG.getStop+startInt,newG.getStop)
        }
      }
    }

    null
  }

  /** Formats a loc in the requested output format (1-based for INTERVALS, offset for BED/TDF). */
  def repr(loc : GenomeLoc) : String = {
    if ( loc == null ) return "null"
    if ( outType == IntervalFormatType.INTERVALS ) {
      return "%s:%d-%d".format(loc.getContig,loc.getStart,loc.getStop)
    } else {
      return "%s\t%d\t%d".format(loc.getContig,loc.getStart-offsetOut,loc.getStop+offsetOut)
    }
  }

  def isBed(t: IntervalFormatType.IntervalFormatType) : Boolean = {
    t == IntervalFormatType.BED
  }

  /** Parses either "chr:start-stop" or whitespace-delimited "chr start stop" (BED offset applied). */
  def parseGenomeInterval( s : String ) : GenomeLoc = {
    val sp = s.split("\\s+")
    // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
    if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
  }

  /** Supported interval file formats; anything unrecognized is treated as TDF. */
  object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {
    type IntervalFormatType = Value
    val INTERVALS,BED,TDF = Value

    def convert(s : String) : IntervalFormatType = {
      if ( s.equals("INTERVALS") ) INTERVALS else { if (s.equals("BED") ) BED else TDF}
    }
  }
}
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.library.ipf.intervals
|
||||
|
||||
import org.broadinstitute.sting.queue.function.InProcessFunction
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.commandline._
|
||||
import java.io.{PrintStream, File}
|
||||
import net.sf.samtools.{SAMSequenceRecord, SAMFileHeader, SAMSequenceDictionary}
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
|
||||
|
||||
/**
 * In-process function that computes the intersection of several interval
 * lists: loads every list into memory, sorts all intervals, clusters
 * mutually-overlapping runs via a fold, and writes the intersection of each
 * cluster that has a contribution from every input source.
 */
class IntersectIntervals(iVals: List[File], outFile: File, bed: Boolean) extends InProcessFunction {
  @Input(doc="List of interval files to find the intersection of") val intervals : List[File] = iVals
  @Output(doc="Output interval file to which to write") val output : File = outFile
  @Argument(doc="Assume the input interval lists are sorted in the proper order") var assumeSorted = false
  @Argument(doc="Is the tdf in bed file (0-based clopen: 0 5 for {1,2,3,4}?") var isBed = bed

  var outStream : PrintStream = _
  var contigs : List[String] = Nil
  var dict : SAMSequenceDictionary = _
  var parser : GenomeLocParser = _

  def run = {
    outStream = new PrintStream(output)
    dict = new SAMSequenceDictionary
    // note: memory hog -- every list is fully materialized, tagged with its source index
    val sources : List[(List[(String,Int,Int)],Int)] = intervals.map(g => asScalaIterator(new XReadLines(g)).map(u => parse(u)).toList).zipWithIndex
    // Build a sequence dictionary from the contig names actually seen.
    sources.map(u => u._1).flatten.map(u => u._1).distinct.foreach(u => dict.addSequence(new SAMSequenceRecord(u,Integer.MAX_VALUE)))
    parser = new GenomeLocParser(dict)
    // Sort all (loc, sourceIndex) pairs, cluster overlapping runs, intersect
    // each cluster, and emit the non-empty intersections.
    sources.map( (u: (List[(String,Int,Int)],Int)) => u._1.map(g => (newGenomeLoc(g),u._2))).flatten.sortWith( (a,b) => (a._1 compareTo b._1) < 0 ).foldLeft[List[List[(GenomeLoc,Int)]]](Nil)( (a,b) => overlapFold(a,b)).map(u => mapIntersect(u)).filter(h => h != null && h.size > 0).foreach(h => writeOut(h))
    outStream.close()
  }

  def writeOut(g : GenomeLoc) : Unit = {
    outStream.print("%s%n".format(g.toString))
  }

  /** Parses "chr:start-stop" or whitespace-delimited "chr start stop" (BED offsets applied) into a coordinate triple. */
  def parse(s : String) : (String,Int,Int) = {
    if ( s.contains(":") ) {
      val split1 = s.split(":")
      val split2 = split1(1).split("-")
      return (split1(0),split2(0).toInt,split2(1).toInt)
    } else {
      val split = s.split("\\s+")
      return (split(0),split(1).toInt + (if(isBed) 1 else 0) ,split(2).toInt - (if(isBed) 1 else 0) )
    }
  }

  def newGenomeLoc(coords : (String,Int,Int) ) : GenomeLoc = {
    parser.createGenomeLoc(coords._1,coords._2,coords._3)
  }

  /**
   * Fold step: appends b to the current cluster when it overlaps every loc
   * in it, otherwise starts a new cluster seeded with the suffix of the last
   * cluster that b still overlaps.
   * Fix: the fold is seeded with Nil, so the original unconditional a.last
   * threw on the very first element -- an empty accumulator now starts the
   * first cluster.
   */
  def overlapFold( a: List[List[(GenomeLoc,Int)]], b: (GenomeLoc,Int) ) : List[List[(GenomeLoc,Int)]] = {
    if ( a.isEmpty ) {
      List(List(b))
    } else if ( a.last.forall(u => u._1.overlapsP(b._1)) ) {
      a.init :+ (a.last :+ b)
    } else {
      a :+ ( a.last.dropWhile(u => ! u._1.overlapsP(b._1)) :+ b)
    }
  }

  /**
   * Intersects all locs in a cluster, or returns null (filtered by run)
   * when some input source contributed nothing to the cluster.
   * Fix: the original guard compared a sum of 0-based zipWithIndex indices
   * against a 1-based range sum (never equal) and, lacking a `return`,
   * discarded its result and fell through anyway -- it never filtered.
   * Counting the distinct source indices is both correct and unambiguous.
   */
  def mapIntersect( u: List[(GenomeLoc,Int)]) : GenomeLoc = {
    if ( u.map(h => h._2).distinct.size != intervals.size ) { // some source not accounted for
      return null
    }
    u.map(h => h._1).reduceLeft[GenomeLoc]( (a,b) => a.intersect(b) )
  }

  /** Inclusive range [a, b]; retained for interface compatibility. */
  def range(a: Int, b: Int) : Range = new Range(a,b+1,1)

}
|
||||
Loading…
Reference in New Issue