Refactored IntervalUtils used to parse and scatter intervals for Queue.

Scattering non-contig interval lists by number of loci in the intervals instead of just number of intervals.
Queue caches the list of locs and how to split them up instead of reloading them from disk repeatedly.
TODO: general purpose function to divide data evenly.
Skip over comments when parsing picard analysis files.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5687 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2011-04-26 00:06:00 +00:00
parent 6ca4e3cebf
commit f619dd3ca7
8 changed files with 409 additions and 292 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.broad;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
@ -56,6 +57,8 @@ public class PicardAnalysisFiles {
this.path = path;
HashMap<String,Integer> headerIndexes = null;
for (String line: new XReadLines(new File(path))) {
if (line.startsWith("#"))
continue;
String[] values = line.split("\t");
if (headerIndexes == null) {
headerIndexes = new HashMap<String,Integer>();
@ -65,7 +68,10 @@ public class PicardAnalysisFiles {
}
} else {
for (String header: ANALYSIS_HEADERS) {
String value = values[headerIndexes.get(header)];
int index = headerIndexes.get(header);
if (values.length <= index)
throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line));
String value = values[index];
headerValues.get(header).add(value);
}
}

View File

@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
@ -189,162 +189,136 @@ public class IntervalUtils {
}
/**
* Returns the list of GenomeLocs from the list of intervals.
* @param referenceSource The reference for the intervals.
* @param intervals The interval as strings or file paths.
* @return The list of GenomeLocs.
*/
private static List<GenomeLoc> parseIntervalArguments(ReferenceDataSource referenceSource, List<String> intervals) {
GenomeLocParser parser = new GenomeLocParser(referenceSource.getReference());
GenomeLocSortedSet locs;
// TODO: Abstract genome analysis engine has richer logic for parsing. We need to use it!
if (intervals.size() == 0) {
locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary());
} else {
locs = new GenomeLocSortedSet(parser, IntervalUtils.parseIntervalArguments(parser, intervals, false));
}
if (locs == null || locs.size() == 0)
throw new UserException.MalformedFile("Intervals are empty: " + Utils.join(", ", intervals));
return locs.toList();
}
/**
* Returns the list of contigs from the list of intervals.
* Returns a map of contig names with their sizes.
* @param reference The reference for the intervals.
* @return The list of contig names.
* @return A map of contig names with their sizes.
*/
public static List<String> distinctContigs(File reference) {
return distinctContigs(reference, Collections.<String>emptyList());
}
/**
* Returns the list of contigs from the list of intervals.
* @param reference The reference for the intervals.
* @param intervals The interval as strings or file paths.
* @return The list of contig names.
*/
public static List<String> distinctContigs(File reference, List<String> intervals) {
public static Map<String, Long> getContigSizes(File reference) {
ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
List<GenomeLoc> locs = parseIntervalArguments(referenceSource, intervals);
String contig = null;
List<String> contigs = new ArrayList<String>();
for (GenomeLoc loc: locs) {
if (contig == null || !contig.equals(loc.getContig())) {
contig = loc.getContig();
contigs.add(contig);
}
}
return contigs;
}
/**
* Returns a map of contig names with their lengths from the reference.
* @param reference The reference for the intervals.
* @return A map of contig names with their lengths.
*/
public static Map<String, Integer> getContigLengths(File reference) {
ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
List<GenomeLoc> locs = parseIntervalArguments(referenceSource, Collections.<String>emptyList());
Map<String, Integer> lengths = new LinkedHashMap<String, Integer>();
List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList();
Map<String, Long> lengths = new LinkedHashMap<String, Long>();
for (GenomeLoc loc: locs)
lengths.put(loc.getContig(), loc.getStop());
lengths.put(loc.getContig(), loc.size());
return lengths;
}
/**
* Counts the number of interval files an interval list can be split into using scatterIntervalArguments.
* @param reference The reference for the intervals.
* @param intervals The interval as strings or file paths.
* @param splitByContig If true then one contig will not be written to multiple files.
* @param locs The genome locs.
* @return The maximum number of parts the intervals can be split into.
*/
public static int countIntervalArguments(File reference, List<String> intervals, boolean splitByContig) {
ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
List<GenomeLoc> locs = parseIntervalArguments(referenceSource, intervals);
public static int countContigIntervals(List<GenomeLoc> locs) {
int maxFiles = 0;
if (splitByContig) {
String contig = null;
for (GenomeLoc loc: locs) {
if (contig == null || !contig.equals(loc.getContig())) {
maxFiles++;
contig = loc.getContig();
}
String contig = null;
for (GenomeLoc loc: locs) {
if (contig == null || !contig.equals(loc.getContig())) {
maxFiles++;
contig = loc.getContig();
}
} else {
maxFiles = locs.size();
}
return maxFiles;
}
/**
* Splits an interval list into multiple files.
* @param reference The reference for the intervals.
* @param intervals The interval as strings or file paths.
* @param fileHeader The sam file header.
* @param locs The genome locs to split.
* @param scatterParts The output interval lists to write to.
* @param splitByContig If true then one contig will not be written to multiple files.
*/
public static void scatterIntervalArguments(File reference, List<String> intervals, List<File> scatterParts, boolean splitByContig) {
ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
List<GenomeLoc> locs = parseIntervalArguments(referenceSource, intervals);
SAMFileHeader fileHeader = new SAMFileHeader();
fileHeader.setSequenceDictionary(referenceSource.getReference().getSequenceDictionary());
public static void scatterContigIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<File> scatterParts) {
IntervalList intervalList = null;
int fileIndex = -1;
int locIndex = 0;
if (splitByContig) {
String contig = null;
for (GenomeLoc loc: locs) {
// If there are still more files to write and the contig doesn't match...
if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) {
// Then close the current file and start a new one.
if (intervalList != null) {
intervalList.write(scatterParts.get(fileIndex));
intervalList = null;
}
fileIndex++;
contig = loc.getContig();
String contig = null;
for (GenomeLoc loc: locs) {
// If there are still more files to write and the contig doesn't match...
if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) {
// Then close the current file and start a new one.
if (intervalList != null) {
intervalList.write(scatterParts.get(fileIndex));
intervalList = null;
}
if (intervalList == null)
intervalList = new IntervalList(fileHeader);
intervalList.add(toInterval(loc, ++locIndex));
fileIndex++;
contig = loc.getContig();
}
if (intervalList != null)
intervalList.write(scatterParts.get(fileIndex));
} else {
int locsPerFile = locs.size() / scatterParts.size();
int locRemainder = locs.size() % scatterParts.size();
// At the start, put an extra loc per file
locsPerFile++;
int locsLeftFile = 0;
for (GenomeLoc loc: locs) {
if (locsLeftFile == 0) {
if (intervalList != null)
intervalList.write(scatterParts.get(fileIndex));
fileIndex++;
intervalList = new IntervalList(fileHeader);
// When we have put enough locs into each file,
// reduce the number of locs per file back
// to the original calculated value.
if (fileIndex == locRemainder)
locsPerFile -= 1;
locsLeftFile = locsPerFile;
}
locsLeftFile -= 1;
intervalList.add(toInterval(loc, ++locIndex));
}
if (intervalList != null)
intervalList.write(scatterParts.get(fileIndex));
if (intervalList == null)
intervalList = new IntervalList(fileHeader);
intervalList.add(toInterval(loc, ++locIndex));
}
if (intervalList != null)
intervalList.write(scatterParts.get(fileIndex));
if ((fileIndex + 1) != scatterParts.size())
throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size()));
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* @param locs The genome locs to split.
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
*/
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
if (splits.size() != scatterParts.size())
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
int fileIndex = 0;
int locIndex = 1;
int start = 0;
for (Integer stop: splits) {
IntervalList intervalList = new IntervalList(fileHeader);
for (int i = start; i < stop; i++)
intervalList.add(toInterval(locs.get(i), locIndex++));
intervalList.write(scatterParts.get(fileIndex++));
start = stop;
}
}
/**
* Splits the genome locs up by size.
* @param locs Genome locs to split.
* @param numParts Number of parts to split the locs into.
* @return The stop points to split the genome locs.
*/
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
if (locs.size() < numParts)
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
long locsSize = 0;
for (GenomeLoc loc: locs)
locsSize += loc.size();
List<Integer> splitPoints = new ArrayList<Integer>();
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
Collections.sort(splitPoints);
splitPoints.add(locs.size());
return splitPoints;
}
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
if (numParts < 2)
return;
int halfParts = (numParts + 1) / 2;
Pair<Integer, Long> splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts);
int splitIndex = splitPoint.first;
long splitSize = splitPoint.second;
splitPoints.add(splitIndex);
addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts);
addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts);
}
private static Pair<Integer, Long> getFixedSplit(List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int minLocs) {
int splitIndex = startIndex;
long splitSize = 0;
for (int i = 0; i < minLocs; i++) {
splitSize += locs.get(splitIndex).size();
splitIndex++;
}
long halfSize = locsSize / 2;
while (splitIndex < stopIndex && splitSize < halfSize) {
splitSize += locs.get(splitIndex).size();
splitIndex++;
}
return new Pair<Integer, Long>(splitIndex, splitSize);
}
/**
* Converts a GenomeLoc to a picard interval.
* @param loc The GenomeLoc.

View File

@ -25,6 +25,14 @@ public class PicardAnalysisFilesUnitTest extends BaseTest {
Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
}
@Test
public void testParseValidWithComments() throws Exception {
PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt");
Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta");
Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list");
Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
}
@Test(expectedExceptions = FileNotFoundException.class)
public void testParseBadPath() throws Exception {
new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt");

View File

@ -22,6 +22,12 @@ public class PicardPipelineUnitTest {
validatePipeline(pipeline, FilenameUtils.getBaseName(tsv.getPath()));
}
@Test
public void testParseTsvWithPicardComments() throws Exception {
File tsv = writeTsv("C460", "HG01359");
PicardPipeline.parse(tsv);
}
@Test
public void testParseYaml() throws IOException {
File yaml = writeYaml("project_name", PROJECT, SAMPLE);

View File

@ -1,7 +1,10 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.testng.Assert;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -21,14 +24,20 @@ import java.util.*;
*/
public class IntervalUtilsUnitTest extends BaseTest {
// used to seed the genome loc parser with a sequence dictionary
private static File reference = new File(BaseTest.hg18Reference);
private SAMFileHeader header;
private GenomeLocParser genomeLocParser;
private List<GenomeLoc> referenceLocs;
@BeforeClass
public void init() {
File reference = new File(BaseTest.hg18Reference);
try {
ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
header = new SAMFileHeader();
header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference);
genomeLocParser = new GenomeLocParser(seq);
referenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
}
catch(FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(reference,ex);
@ -94,46 +103,41 @@ public class IntervalUtilsUnitTest extends BaseTest {
Assert.assertEquals(ret.size(), 20);
}
@Test
public void testCountContigs() {
List<String> chrs = new ArrayList<String>();
for (int i = 1; i <= 22; i++)
chrs.add("chr" + i);
chrs.add("chrX");
chrs.add("chrY");
List<String> chrsNoRandom = Arrays.asList("chr12", "chr14", "chr20", "chrY");
List<String> chrsWithRandom = new ArrayList<String>();
chrsWithRandom.add("chrM");
chrsWithRandom.addAll(chrs);
for (String chr: chrs)
if(!chrsNoRandom.contains(chr))
chrsWithRandom.add(chr + "_random");
Assert.assertEquals(IntervalUtils.distinctContigs(reference), chrsWithRandom);
Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList(BaseTest.validationDataLocation + "TCGA-06-0188.interval_list")), chrs);
Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr1:1-1", "chr2:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3"));
Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3"));
}
@Test
public void testGetContigLengths() {
Map<String, Integer> lengths = IntervalUtils.getContigLengths(reference);
Assert.assertEquals((int)lengths.get("chr1"), 247249719);
Assert.assertEquals((int)lengths.get("chr2"), 242951149);
Assert.assertEquals((int)lengths.get("chr3"), 199501827);
Assert.assertEquals((int)lengths.get("chr20"), 62435964);
Assert.assertEquals((int)lengths.get("chrX"), 154913754);
Map<String, Long> lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference));
Assert.assertEquals((long)lengths.get("chr1"), 247249719);
Assert.assertEquals((long)lengths.get("chr2"), 242951149);
Assert.assertEquals((long)lengths.get("chr3"), 199501827);
Assert.assertEquals((long)lengths.get("chr20"), 62435964);
Assert.assertEquals((long)lengths.get("chrX"), 154913754);
}
private List<GenomeLoc> getLocs(String... intervals) {
return getLocs(Arrays.asList(intervals));
}
private List<GenomeLoc> getLocs(List<String> intervals) {
if (intervals.size() == 0)
return referenceLocs;
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
for (String interval: intervals)
locs.add(genomeLocParser.parseGenomeInterval(interval));
return locs;
}
@Test
public void testCountIntervals() {
Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Collections.<String>emptyList(), false), 45);
Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Collections.<String>emptyList(), true), 45);
Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), false), 3);
Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), true), 3);
Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"), false), 4);
Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"), true), 3);
public void testParseIntervalArguments() {
Assert.assertEquals(getLocs().size(), 45);
Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3);
Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4);
}
@Test(dependsOnMethods = "testParseIntervalArguments")
public void testCountIntervalsByContig() {
Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs()), 45);
Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1", "chr2", "chr3")), 3);
Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")), 3);
}
@Test
@ -153,14 +157,16 @@ public class IntervalUtilsUnitTest extends BaseTest {
}
@Test
public void testBasicScatter() {
public void testFixedScatterIntervalsBasic() {
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3");
List<File> files = testFiles("basic.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, false);
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -176,7 +182,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
}
@Test
public void testScatterLessFiles() {
public void testScatterFixedIntervalsLessFiles() {
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3");
@ -184,117 +190,9 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("less.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, false);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 2);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs1.get(1), chr2);
Assert.assertEquals(locs2.get(0), chr3);
Assert.assertEquals(locs3.get(0), chr4);
}
@Test(expectedExceptions=UserException.BadArgumentValue.class)
public void testScatterMoreFiles() {
List<File> files = testFiles("more.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, false);
}
@Test
public void testScatterIntervals() {
List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2");
GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, intervals, files, true);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 2);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr1a);
Assert.assertEquals(locs1.get(1), chr1b);
Assert.assertEquals(locs2.get(0), chr2);
Assert.assertEquals(locs3.get(0), chr3);
}
@Test(enabled=false) // disabled, GenomeLoc.compareTo() returns 0 for two locs with the same start, causing an exception in GLSS.add().
public void testScatterIntervalsWithTheSameStart() {
List<File> files = testFiles("sg.", 20, ".intervals");
IntervalUtils.scatterIntervalArguments(new File(hg18Reference), Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), files, false);
}
@Test
public void testScatterOrder() {
List<String> intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2");
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, intervals, files, true);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs2.get(0), chr2);
Assert.assertEquals(locs3.get(0), chr3);
}
@Test
public void testBasicScatterByContig() {
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3");
List<File> files = testFiles("contig_basic.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, true);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs2.get(0), chr2);
Assert.assertEquals(locs3.get(0), chr3);
}
@Test
public void testScatterByContigLessFiles() {
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3");
GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4");
List<File> files = testFiles("contig_less.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, true);
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -311,13 +209,219 @@ public class IntervalUtilsUnitTest extends BaseTest {
}
@Test(expectedExceptions=UserException.BadArgumentValue.class)
public void testScatterByContigMoreFiles() {
List<File> files = testFiles("contig_more.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, true);
public void testSplitFixedIntervalsMoreFiles() {
List<File> files = testFiles("more.", 3, ".intervals");
List<GenomeLoc> locs = getLocs("chr1", "chr2");
IntervalUtils.splitFixedIntervals(locs, files.size());
}
@Test(expectedExceptions=UserException.BadArgumentValue.class)
public void testScatterFixedIntervalsMoreFiles() {
List<File> files = testFiles("more.", 3, ".intervals");
List<GenomeLoc> locs = getLocs("chr1", "chr2");
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
}
@Test
public void testScatterFixedIntervalsStart() {
List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2");
GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals");
List<GenomeLoc> locs = getLocs(intervals);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 2);
Assert.assertEquals(locs1.get(0), chr1a);
Assert.assertEquals(locs2.get(0), chr1b);
Assert.assertEquals(locs3.get(0), chr2);
Assert.assertEquals(locs3.get(1), chr3);
}
@Test
public void testScatterByContigIntervalsStart() {
public void testScatterFixedIntervalsMiddle() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1");
GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2");
GenomeLoc chr2b = genomeLocParser.parseGenomeInterval("chr2:4-5");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals");
List<GenomeLoc> locs = getLocs(intervals);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 2);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs2.get(0), chr2a);
Assert.assertEquals(locs3.get(0), chr2b);
Assert.assertEquals(locs3.get(1), chr3);
}
@Test
public void testScatterFixedIntervalsEnd() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2");
GenomeLoc chr3a = genomeLocParser.parseGenomeInterval("chr3:1-2");
GenomeLoc chr3b = genomeLocParser.parseGenomeInterval("chr3:4-5");
List<File> files = testFiles("split.", 3, ".intervals");
List<GenomeLoc> locs = getLocs(intervals);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 2);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs1.get(1), chr2);
Assert.assertEquals(locs2.get(0), chr3a);
Assert.assertEquals(locs3.get(0), chr3b);
}
@Test
public void testScatterFixedIntervalsFile() {
List<File> files = testFiles("sg.", 20, ".intervals");
List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
int[] counts = {
5169, 5573, 10017, 10567, 10551,
5087, 4908, 10120, 10435, 10399,
5391, 4735, 10621, 10352, 10654,
5227, 5256, 10151, 9649, 9825
};
//String splitCounts = "";
for (int lastIndex = 0, i = 0; i < splits.size(); i++) {
int splitIndex = splits.get(i);
int splitCount = (splitIndex - lastIndex);
//splitCounts += ", " + splitCount;
lastIndex = splitIndex;
Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
}
//System.out.println(splitCounts.substring(2));
IntervalUtils.scatterFixedIntervals(header, locs, splits, files);
int locIndex = 0;
for (int i = 0; i < files.size(); i++) {
String file = files.get(i).toString();
List<GenomeLoc> parsedLocs = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(file), false);
Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file);
for (GenomeLoc parsedLoc: parsedLocs)
Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i));
}
Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs");
}
@Test
public void testScatterContigIntervalsOrder() {
List<String> intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2");
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals");
IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr2);
Assert.assertEquals(locs2.get(0), chr1);
Assert.assertEquals(locs3.get(0), chr3);
}
@Test
public void testScatterContigIntervalsBasic() {
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3");
List<File> files = testFiles("contig_basic.", 3, ".intervals");
IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2", "chr3"), files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 1);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs2.get(0), chr2);
Assert.assertEquals(locs3.get(0), chr3);
}
@Test
public void testScatterContigIntervalsLessFiles() {
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2");
GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3");
GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4");
List<File> files = testFiles("contig_less.", 3, ".intervals");
IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2", "chr3", "chr4"), files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
List<GenomeLoc> locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false);
Assert.assertEquals(locs1.size(), 1);
Assert.assertEquals(locs2.size(), 1);
Assert.assertEquals(locs3.size(), 2);
Assert.assertEquals(locs1.get(0), chr1);
Assert.assertEquals(locs2.get(0), chr2);
Assert.assertEquals(locs3.get(0), chr3);
Assert.assertEquals(locs3.get(1), chr4);
}
@Test(expectedExceptions=UserException.BadArgumentValue.class)
public void testScatterContigIntervalsMoreFiles() {
List<File> files = testFiles("contig_more.", 3, ".intervals");
IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2"), files);
}
@Test
public void testScatterContigIntervalsStart() {
List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2");
GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5");
@ -326,7 +430,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("contig_split_start.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, intervals, files, true);
IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -343,7 +447,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
}
@Test
public void testScatterByContigIntervalsMiddle() {
public void testScatterContigIntervalsMiddle() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1");
GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2");
@ -352,7 +456,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("contig_split_middle.", 3, ".intervals");
IntervalUtils.scatterIntervalArguments(reference, intervals, files, true);
IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -369,7 +473,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
}
@Test
public void testScatterByContigIntervalsEnd() {
public void testScatterContigIntervalsEnd() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1");
GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2");
@ -378,7 +482,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("contig_split_end.", 3 ,".intervals");
IntervalUtils.scatterIntervalArguments(reference, intervals, files, true);
IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false);

View File

@ -35,10 +35,13 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction {
// Include unmapped reads by default.
this.includeUnmapped = true
protected override def maxIntervals =
IntervalUtils.countIntervalArguments(this.referenceSequence, this.intervals, true)
protected override def maxIntervals = {
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
IntervalUtils.countContigIntervals(gi.locs)
}
def run() {
IntervalUtils.scatterIntervalArguments(this.referenceSequence, this.intervals, this.scatterOutputFiles, true)
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles)
}
}

View File

@ -60,7 +60,7 @@ trait GATKScatterFunction extends ScatterFunction {
this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]
this.referenceSequence = this.originalGATK.reference_sequence
if (this.originalGATK.intervals.isEmpty && this.originalGATK.intervalsString.isEmpty) {
this.intervals ++= IntervalUtils.distinctContigs(this.referenceSequence).toList
this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, List.empty[String]).contigs
} else {
this.intervals ++= this.originalGATK.intervals.map(_.toString)
this.intervals ++= this.originalGATK.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval))
@ -103,3 +103,17 @@ trait GATKScatterFunction extends ScatterFunction {
*/
protected def maxIntervals: Int
}
object GATKScatterFunction {
var gatkIntervals = List.empty[GATKIntervals]
def getGATKIntervals(reference: File, intervals: List[String]) = {
gatkIntervals.find(gi => gi.reference == reference && gi.intervals == intervals) match {
case Some(gi) => gi
case None =>
val gi = new GATKIntervals(reference, intervals)
gatkIntervals :+= gi
gi
}
}
}

View File

@ -33,9 +33,11 @@ import org.broadinstitute.sting.queue.function.InProcessFunction
*/
class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction {
protected override def maxIntervals =
IntervalUtils.countIntervalArguments(this.referenceSequence, this.intervals, false)
GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size
def run() {
IntervalUtils.scatterIntervalArguments(this.referenceSequence, this.intervals, this.scatterOutputFiles, false)
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs,
gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles)
}
}