diff --git a/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java b/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java index ef01e7e63..02381916e 100755 --- a/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java +++ b/java/src/org/broadinstitute/sting/utils/broad/PicardAnalysisFiles.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.broad; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; @@ -56,6 +57,8 @@ public class PicardAnalysisFiles { this.path = path; HashMap headerIndexes = null; for (String line: new XReadLines(new File(path))) { + if (line.startsWith("#")) + continue; String[] values = line.split("\t"); if (headerIndexes == null) { headerIndexes = new HashMap(); @@ -65,7 +68,10 @@ public class PicardAnalysisFiles { } } else { for (String header: ANALYSIS_HEADERS) { - String value = values[headerIndexes.get(header)]; + int index = headerIndexes.get(header); + if (values.length <= index) + throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line)); + String value = values[index]; headerValues.get(header).add(value); } } diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 41f8b5006..68243d066 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.*; @@ -189,162 +189,136 @@ public class IntervalUtils { } /** - * Returns the list of GenomeLocs from the list of intervals. - * @param referenceSource The reference for the intervals. - * @param intervals The interval as strings or file paths. - * @return The list of GenomeLocs. - */ - private static List parseIntervalArguments(ReferenceDataSource referenceSource, List intervals) { - GenomeLocParser parser = new GenomeLocParser(referenceSource.getReference()); - GenomeLocSortedSet locs; - // TODO: Abstract genome analysis engine has richer logic for parsing. We need to use it! - if (intervals.size() == 0) { - locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()); - } else { - locs = new GenomeLocSortedSet(parser, IntervalUtils.parseIntervalArguments(parser, intervals, false)); - } - if (locs == null || locs.size() == 0) - throw new UserException.MalformedFile("Intervals are empty: " + Utils.join(", ", intervals)); - return locs.toList(); - } - - /** - * Returns the list of contigs from the list of intervals. + * Returns a map of contig names with their sizes. * @param reference The reference for the intervals. - * @return The list of contig names. + * @return A map of contig names with their sizes. */ - public static List distinctContigs(File reference) { - return distinctContigs(reference, Collections.emptyList()); - } - - /** - * Returns the list of contigs from the list of intervals. - * @param reference The reference for the intervals. - * @param intervals The interval as strings or file paths. - * @return The list of contig names. - */ - public static List distinctContigs(File reference, List intervals) { + public static Map getContigSizes(File reference) { ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, intervals); - String contig = null; - List contigs = new ArrayList(); - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - contig = loc.getContig(); - contigs.add(contig); - } - } - return contigs; - } - - /** - * Returns a map of contig names with their lengths from the reference. - * @param reference The reference for the intervals. - * @return A map of contig names with their lengths. - */ - public static Map getContigLengths(File reference) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, Collections.emptyList()); - Map lengths = new LinkedHashMap(); + List locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList(); + Map lengths = new LinkedHashMap(); for (GenomeLoc loc: locs) - lengths.put(loc.getContig(), loc.getStop()); + lengths.put(loc.getContig(), loc.size()); return lengths; } /** * Counts the number of interval files an interval list can be split into using scatterIntervalArguments. - * @param reference The reference for the intervals. - * @param intervals The interval as strings or file paths. - * @param splitByContig If true then one contig will not be written to multiple files. + * @param locs The genome locs. * @return The maximum number of parts the intervals can be split into. */ - public static int countIntervalArguments(File reference, List intervals, boolean splitByContig) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, intervals); + public static int countContigIntervals(List locs) { int maxFiles = 0; - if (splitByContig) { - String contig = null; - for (GenomeLoc loc: locs) { - if (contig == null || !contig.equals(loc.getContig())) { - maxFiles++; - contig = loc.getContig(); - } + String contig = null; + for (GenomeLoc loc: locs) { + if (contig == null || !contig.equals(loc.getContig())) { + maxFiles++; + contig = loc.getContig(); } - } else { - maxFiles = locs.size(); } return maxFiles; } /** * Splits an interval list into multiple files. - * @param reference The reference for the intervals. - * @param intervals The interval as strings or file paths. + * @param fileHeader The sam file header. + * @param locs The genome locs to split. * @param scatterParts The output interval lists to write to. - * @param splitByContig If true then one contig will not be written to multiple files. */ - public static void scatterIntervalArguments(File reference, List intervals, List scatterParts, boolean splitByContig) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = parseIntervalArguments(referenceSource, intervals); - SAMFileHeader fileHeader = new SAMFileHeader(); - fileHeader.setSequenceDictionary(referenceSource.getReference().getSequenceDictionary()); - + public static void scatterContigIntervals(SAMFileHeader fileHeader, List locs, List scatterParts) { IntervalList intervalList = null; int fileIndex = -1; int locIndex = 0; - - if (splitByContig) { - String contig = null; - for (GenomeLoc loc: locs) { - // If there are still more files to write and the contig doesn't match... - if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) { - // Then close the current file and start a new one. - if (intervalList != null) { - intervalList.write(scatterParts.get(fileIndex)); - intervalList = null; - } - fileIndex++; - contig = loc.getContig(); + String contig = null; + for (GenomeLoc loc: locs) { + // If there are still more files to write and the contig doesn't match... + if ((fileIndex+1 < scatterParts.size()) && (contig == null || !contig.equals(loc.getContig()))) { + // Then close the current file and start a new one. + if (intervalList != null) { + intervalList.write(scatterParts.get(fileIndex)); + intervalList = null; } - if (intervalList == null) - intervalList = new IntervalList(fileHeader); - intervalList.add(toInterval(loc, ++locIndex)); + fileIndex++; + contig = loc.getContig(); } - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); - } else { - int locsPerFile = locs.size() / scatterParts.size(); - int locRemainder = locs.size() % scatterParts.size(); - - // At the start, put an extra loc per file - locsPerFile++; - int locsLeftFile = 0; - - for (GenomeLoc loc: locs) { - if (locsLeftFile == 0) { - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); - - fileIndex++; - intervalList = new IntervalList(fileHeader); - - // When we have put enough locs into each file, - // reduce the number of locs per file back - // to the original calculated value. - if (fileIndex == locRemainder) - locsPerFile -= 1; - locsLeftFile = locsPerFile; - } - locsLeftFile -= 1; - intervalList.add(toInterval(loc, ++locIndex)); - } - if (intervalList != null) - intervalList.write(scatterParts.get(fileIndex)); + if (intervalList == null) + intervalList = new IntervalList(fileHeader); + intervalList.add(toInterval(loc, ++locIndex)); } + if (intervalList != null) + intervalList.write(scatterParts.get(fileIndex)); if ((fileIndex + 1) != scatterParts.size()) throw new UserException.BadArgumentValue("scatterParts", String.format("Only able to write contigs into %d of %d files.", fileIndex + 1, scatterParts.size())); } + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param locs The genome locs to split. + * @param splits The stop points for the genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + int fileIndex = 0; + int locIndex = 1; + int start = 0; + for (Integer stop: splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (int i = start; i < stop; i++) + intervalList.add(toInterval(locs.get(i), locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); + start = stop; + } + } + + /** + * Splits the genome locs up by size. + * @param locs Genome locs to split. + * @param numParts Number of parts to split the locs into. + * @return The stop points to split the genome locs. + */ + public static List splitFixedIntervals(List locs, int numParts) { + if (locs.size() < numParts) + throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); + long locsSize = 0; + for (GenomeLoc loc: locs) + locsSize += loc.size(); + List splitPoints = new ArrayList(); + addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); + Collections.sort(splitPoints); + splitPoints.add(locs.size()); + return splitPoints; + } + + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { + if (numParts < 2) + return; + int halfParts = (numParts + 1) / 2; + Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts); + int splitIndex = splitPoint.first; + long splitSize = splitPoint.second; + splitPoints.add(splitIndex); + addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); + addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); + } + + private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs) { + int splitIndex = startIndex; + long splitSize = 0; + for (int i = 0; i < minLocs; i++) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + long halfSize = locsSize / 2; + while (splitIndex < stopIndex && splitSize < halfSize) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + return new Pair(splitIndex, splitSize); + } + /** * Converts a GenomeLoc to a picard interval. * @param loc The GenomeLoc. diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java b/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java index f1e44000c..de58bf983 100755 --- a/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/broad/PicardAnalysisFilesUnitTest.java @@ -25,6 +25,14 @@ public class PicardAnalysisFilesUnitTest extends BaseTest { Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list"); } + @Test + public void testParseValidWithComments() throws Exception { + PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt"); + Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"); + Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"); + Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list"); + } + @Test(expectedExceptions = FileNotFoundException.class) public void testParseBadPath() throws Exception { new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt"); diff --git a/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java b/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java index a17074b01..a5701d671 100755 --- a/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/broad/PicardPipelineUnitTest.java @@ -22,6 +22,12 @@ public class PicardPipelineUnitTest { validatePipeline(pipeline, FilenameUtils.getBaseName(tsv.getPath())); } + @Test + public void testParseTsvWithPicardComments() throws Exception { + File tsv = writeTsv("C460", "HG01359"); + PicardPipeline.parse(tsv); + } + @Test public void testParseYaml() throws IOException { File yaml = writeYaml("project_name", PROJECT, SAMPLE); diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index 6c27e4f0b..e79580e21 100644 --- a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,7 +1,10 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.testng.Assert; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.GenomeLoc; @@ -21,14 +24,20 @@ import java.util.*; */ public class IntervalUtilsUnitTest extends BaseTest { // used to seed the genome loc parser with a sequence dictionary - private static File reference = new File(BaseTest.hg18Reference); + private SAMFileHeader header; private GenomeLocParser genomeLocParser; + private List referenceLocs; @BeforeClass public void init() { + File reference = new File(BaseTest.hg18Reference); try { + ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); + header = new SAMFileHeader(); + header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference); genomeLocParser = new GenomeLocParser(seq); + referenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(reference,ex); @@ -94,46 +103,41 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(ret.size(), 20); } - @Test - public void testCountContigs() { - List chrs = new ArrayList(); - for (int i = 1; i <= 22; i++) - chrs.add("chr" + i); - chrs.add("chrX"); - chrs.add("chrY"); - - List chrsNoRandom = Arrays.asList("chr12", "chr14", "chr20", "chrY"); - List chrsWithRandom = new ArrayList(); - chrsWithRandom.add("chrM"); - chrsWithRandom.addAll(chrs); - for (String chr: chrs) - if(!chrsNoRandom.contains(chr)) - chrsWithRandom.add(chr + "_random"); - - Assert.assertEquals(IntervalUtils.distinctContigs(reference), chrsWithRandom); - Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList(BaseTest.validationDataLocation + "TCGA-06-0188.interval_list")), chrs); - Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr1:1-1", "chr2:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3")); - Assert.assertEquals(IntervalUtils.distinctContigs(reference, Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2")), Arrays.asList("chr1","chr2","chr3")); - } - @Test public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigLengths(reference); - Assert.assertEquals((int)lengths.get("chr1"), 247249719); - Assert.assertEquals((int)lengths.get("chr2"), 242951149); - Assert.assertEquals((int)lengths.get("chr3"), 199501827); - Assert.assertEquals((int)lengths.get("chr20"), 62435964); - Assert.assertEquals((int)lengths.get("chrX"), 154913754); + Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); + Assert.assertEquals((long)lengths.get("chr1"), 247249719); + Assert.assertEquals((long)lengths.get("chr2"), 242951149); + Assert.assertEquals((long)lengths.get("chr3"), 199501827); + Assert.assertEquals((long)lengths.get("chr20"), 62435964); + Assert.assertEquals((long)lengths.get("chrX"), 154913754); + } + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return referenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(genomeLocParser.parseGenomeInterval(interval)); + return locs; } @Test - public void testCountIntervals() { - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Collections.emptyList(), false), 45); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Collections.emptyList(), true), 45); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), false), 3); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), true), 3); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"), false), 4); - Assert.assertEquals(IntervalUtils.countIntervalArguments(reference, Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"), true), 3); + public void testParseIntervalArguments() { + Assert.assertEquals(getLocs().size(), 45); + Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); + Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); + } + + @Test(dependsOnMethods = "testParseIntervalArguments") + public void testCountIntervalsByContig() { + Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs()), 45); + Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1", "chr2", "chr3")), 3); + Assert.assertEquals(IntervalUtils.countContigIntervals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")), 3); } @Test @@ -153,14 +157,16 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testBasicScatter() { + public void testFixedScatterIntervalsBasic() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); List files = testFiles("basic.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, false); + List locs = getLocs("chr1", "chr2", "chr3"); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -176,7 +182,7 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testScatterLessFiles() { + public void testScatterFixedIntervalsLessFiles() { GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); @@ -184,117 +190,9 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, false); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3); - Assert.assertEquals(locs3.get(0), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, false); - } - - @Test - public void testScatterIntervals() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); - GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs1.get(1), chr1b); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test(enabled=false) // disabled, GenomeLoc.compareTo() returns 0 for two locs with the same start, causing an exception in GLSS.add(). - public void testScatterIntervalsWithTheSameStart() { - List files = testFiles("sg.", 20, ".intervals"); - IntervalUtils.scatterIntervalArguments(new File(hg18Reference), Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), files, false); - } - - @Test - public void testScatterOrder() { - List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testBasicScatterByContig() { - GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); - - List files = testFiles("contig_basic.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3"), files, true); - - List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterByContigLessFiles() { - GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); - GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4"); - - List files = testFiles("contig_less.", 3, ".intervals"); - - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2", "chr3", "chr4"), files, true); + List locs = getLocs("chr1", "chr2", "chr3", "chr4"); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -311,13 +209,219 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterByContigMoreFiles() { - List files = testFiles("contig_more.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, Arrays.asList("chr1", "chr2"), files, true); + public void testSplitFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.splitFixedIntervals(locs, files.size()); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + } + @Test + public void testScatterFixedIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); + GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs2.get(0), chr1b); + Assert.assertEquals(locs3.get(0), chr2); + Assert.assertEquals(locs3.get(1), chr3); } @Test - public void testScatterByContigIntervalsStart() { + public void testScatterFixedIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); + GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2"); + GenomeLoc chr2b = genomeLocParser.parseGenomeInterval("chr2:4-5"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs3.get(0), chr2b); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2"); + GenomeLoc chr3a = genomeLocParser.parseGenomeInterval("chr3:1-2"); + GenomeLoc chr3b = genomeLocParser.parseGenomeInterval("chr3:4-5"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3a); + Assert.assertEquals(locs3.get(0), chr3b); + } + + @Test + public void testScatterFixedIntervalsFile() { + List files = testFiles("sg.", 20, ".intervals"); + List locs = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); + List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + + int[] counts = { + 5169, 5573, 10017, 10567, 10551, + 5087, 4908, 10120, 10435, 10399, + 5391, 4735, 10621, 10352, 10654, + 5227, 5256, 10151, 9649, 9825 + }; + + //String splitCounts = ""; + for (int lastIndex = 0, i = 0; i < splits.size(); i++) { + int splitIndex = splits.get(i); + int splitCount = (splitIndex - lastIndex); + //splitCounts += ", " + splitCount; + lastIndex = splitIndex; + Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + } + //System.out.println(splitCounts.substring(2)); + + IntervalUtils.scatterFixedIntervals(header, locs, splits, files); + + int locIndex = 0; + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(file), false); + Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); + for (GenomeLoc parsedLoc: parsedLocs) + Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); + } + Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + } + + @Test + public void testScatterContigIntervalsOrder() { + List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:1-1"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr2); + Assert.assertEquals(locs2.get(0), chr1); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsBasic() { + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); + + List files = testFiles("contig_basic.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2", "chr3"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsLessFiles() { + GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1"); + GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2"); + GenomeLoc chr3 = genomeLocParser.parseGenomeInterval("chr3"); + GenomeLoc chr4 = genomeLocParser.parseGenomeInterval("chr4"); + + List files = testFiles("contig_less.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2", "chr3", "chr4"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); + List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); + List locs3 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(2).toString()), false); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterContigIntervalsMoreFiles() { + List files = testFiles("contig_more.", 3, ".intervals"); + IntervalUtils.scatterContigIntervals(header, getLocs("chr1", "chr2"), files); + } + + @Test + public void testScatterContigIntervalsStart() { List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); GenomeLoc chr1a = genomeLocParser.parseGenomeInterval("chr1:1-2"); GenomeLoc chr1b = genomeLocParser.parseGenomeInterval("chr1:4-5"); @@ -326,7 +430,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("contig_split_start.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -343,7 +447,7 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testScatterByContigIntervalsMiddle() { + public void testScatterContigIntervalsMiddle() { List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2a = genomeLocParser.parseGenomeInterval("chr2:1-2"); @@ -352,7 +456,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("contig_split_middle.", 3, ".intervals"); - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); @@ -369,7 +473,7 @@ public class IntervalUtilsUnitTest extends BaseTest { } @Test - public void testScatterByContigIntervalsEnd() { + public void testScatterContigIntervalsEnd() { List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); GenomeLoc chr1 = genomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr2 = genomeLocParser.parseGenomeInterval("chr2:2-2"); @@ -378,7 +482,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("contig_split_end.", 3 ,".intervals"); - IntervalUtils.scatterIntervalArguments(reference, intervals, files, true); + IntervalUtils.scatterContigIntervals(header, getLocs(intervals), files); List locs1 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(0).toString()), false); List locs2 = IntervalUtils.parseIntervalArguments(genomeLocParser, Arrays.asList(files.get(1).toString()), false); diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala index f1fd4057b..1aa88187e 100755 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/ContigScatterFunction.scala @@ -35,10 +35,13 @@ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { // Include unmapped reads by default. this.includeUnmapped = true - protected override def maxIntervals = - IntervalUtils.countIntervalArguments(this.referenceSequence, this.intervals, true) + protected override def maxIntervals = { + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + IntervalUtils.countContigIntervals(gi.locs) + } def run() { - IntervalUtils.scatterIntervalArguments(this.referenceSequence, this.intervals, this.scatterOutputFiles, true) + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + IntervalUtils.scatterContigIntervals(gi.samFileHeader, gi.locs, this.scatterOutputFiles) } } diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala index 4dbd7c880..32e58ca6a 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala @@ -60,7 +60,7 @@ trait GATKScatterFunction extends ScatterFunction { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] this.referenceSequence = this.originalGATK.reference_sequence if (this.originalGATK.intervals.isEmpty && this.originalGATK.intervalsString.isEmpty) { - this.intervals ++= IntervalUtils.distinctContigs(this.referenceSequence).toList + this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, List.empty[String]).contigs } else { this.intervals ++= this.originalGATK.intervals.map(_.toString) this.intervals ++= this.originalGATK.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval)) @@ -103,3 +103,17 @@ trait GATKScatterFunction extends ScatterFunction { */ protected def maxIntervals: Int } + +object GATKScatterFunction { + var gatkIntervals = List.empty[GATKIntervals] + + def getGATKIntervals(reference: File, intervals: List[String]) = { + gatkIntervals.find(gi => gi.reference == reference && gi.intervals == intervals) match { + case Some(gi) => gi + case None => + val gi = new GATKIntervals(reference, intervals) + gatkIntervals :+= gi + gi + } + } +} diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index a651b3c12..d88d272b9 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -33,9 +33,11 @@ import org.broadinstitute.sting.queue.function.InProcessFunction */ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction { protected override def maxIntervals = - IntervalUtils.countIntervalArguments(this.referenceSequence, this.intervals, false) + GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals).locs.size def run() { - IntervalUtils.scatterIntervalArguments(this.referenceSequence, this.intervals, this.scatterOutputFiles, false) + val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals) + IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs, + gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles) } }