From b134e0052fdbc33001660de7318c945817d24698 Mon Sep 17 00:00:00 2001 From: aaron Date: Wed, 23 Dec 2009 21:59:14 +0000 Subject: [PATCH] added changes to the code to allow different types of interval merging, 1: all overlapping and abutting intervals merged (ALL), 2: just overlapping, not abutting intervals (OVERLAPPING_ONLY), 3: no merging (NONE). This option is not currently allowed, it will throw an exception. Once we're more certain that unmerged lists are going to work in all cases in the GATK, we'll enable that. The command line option is --interval_merging or -im git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2437 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GATKArgumentCollection.java | 52 ++++++++++++++----- .../sting/gatk/GenomeAnalysisEngine.java | 14 ++--- .../walkers/fasta/PickSequenomProbes.java | 3 +- .../walkers/indels/IntervalMergerWalker.java | 3 +- .../sting/utils/GenomeLocParser.java | 25 +++++---- .../sting/utils/bed/BedParser.java | 10 +++- .../sting/utils/GenomeLocParserTest.java | 52 +++++++++++++------ .../sting/utils/bed/BedParserTest.java | 3 +- 8 files changed, 112 insertions(+), 50 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index 945a0a69a..2bc802872 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -57,7 +57,7 @@ public class GATKArgumentCollection { public List samFiles = new ArrayList(); @ElementList(required = false) - @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required=false) + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required = false) public List readFilters = new ArrayList(); @ElementList(required = false) @@ -136,13 +136,18 @@ public class GATKArgumentCollection { @Argument(fullName = "numthreads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) public int numberOfThreads = 1; + /** What rule should we use when merging intervals */ + @Element(required = false) + @Argument(fullName = "interval_merging", shortName = "im", doc = "What interval merging rule should we use {ALL [DEFAULT],OVERLAPPING_ONLY,NONE}.", required = false) + public INTERVAL_MERGING_RULE intervalMerging = INTERVAL_MERGING_RULE.ALL; + /** * marshal the data out to a object * * @param collection the GATKArgumentCollection to load into * @param outputFile the file to write to */ - public static void marshal( GATKArgumentCollection collection, String outputFile ) { + public static void marshal(GATKArgumentCollection collection, String outputFile) { Serializer serializer = new Persister(new Format(new HyphenStyle())); File result = new File(outputFile); try { @@ -158,7 +163,7 @@ public class GATKArgumentCollection { * @param collection the GATKArgumentCollection to load into * @param outputFile the stream to write to */ - public static void marshal( GATKArgumentCollection collection, PrintStream outputFile ) { + public static void marshal(GATKArgumentCollection collection, PrintStream outputFile) { Serializer serializer = new Persister(new Format(new HyphenStyle())); try { serializer.write(collection, outputFile); @@ -172,7 +177,7 @@ public class GATKArgumentCollection { * * @param filename the filename to marshal from */ - public static GATKArgumentCollection unmarshal( String filename ) { + public static GATKArgumentCollection unmarshal(String filename) { Serializer serializer = new Persister(new Format(new HyphenStyle())); File source = new File(filename); try { @@ -188,7 +193,7 @@ public class GATKArgumentCollection { * * @param file the inputstream to marshal from */ - public static GATKArgumentCollection unmarshal( InputStream file ) { + public static GATKArgumentCollection unmarshal(InputStream file) { Serializer serializer = new Persister(new Format(new HyphenStyle())); try { GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, file); @@ -207,7 +212,7 @@ public class GATKArgumentCollection { * * @return true if they're equal */ - public boolean equals( GATKArgumentCollection other ) { + public boolean equals(GATKArgumentCollection other) { if (other.samFiles.size() != samFiles.size()) { return false; } @@ -260,18 +265,18 @@ public class GATKArgumentCollection { return false; } if (other.readMaxPileup != this.readMaxPileup) { - return false; - } - if (( other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null ) || - ( other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads) )) { return false; } - if (( other.downsampleFraction == null && this.downsampleFraction != null ) || - ( other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction) )) { + if ((other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null) || + (other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads))) { return false; } - if (( other.downsampleCoverage == null && this.downsampleCoverage != null ) || - ( other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage) )) { + if ((other.downsampleFraction == null && this.downsampleFraction != null) || + (other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction))) { + return false; + } + if ((other.downsampleCoverage == null && this.downsampleCoverage != null) || + (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) { return false; } if (!other.outFileName.equals(this.outFileName)) { @@ -286,7 +291,26 @@ public class GATKArgumentCollection { if (other.numberOfThreads != this.numberOfThreads) { return false; } + if (other.intervalMerging != this.intervalMerging) { + return false; + } return true; } + + /** + * a class we use to determine the merging rules for intervals passed to the GATK + */ + public enum INTERVAL_MERGING_RULE { + ALL, // we merge both overlapping intervals and abutting intervals + OVERLAPPING_ONLY, // We merge intervals that are overlapping, but NOT ones that only abut each other + NONE; // we merge neither overlapping or abutting intervals, the list of intervals is sorted, but not merged + + public boolean check() { + if (this.compareTo(NONE) == 0) + throw new UnsupportedOperationException("We Currently do not support INTERVAL_MERGING_RULE.NONE"); + return true; + } + } } + diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a86e49edd..dce280eb8 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -153,7 +153,7 @@ public class GenomeAnalysisEngine { initializeOutputStreams(my_walker, microScheduler.getOutputTracker()); GenomeLocSortedSet locs = null; - if (argCollection.intervals != null) { + if (argCollection.intervals != null && argCollection.intervalMerging.check()) { locs = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals)); } @@ -296,16 +296,16 @@ public class GenomeAnalysisEngine { for (String interval : intervals) { if (new File(interval).exists()) { // support for the bed style interval format - if (interval.endsWith(".bed") || interval.endsWith(".BED")) { - Utils.warnUser("Bed files are zero based half open intervals, which are converted to one based closed intervals in the GATK. " + - "Be aware that all output information and intervals are one based closed intervals."); + if (interval.toUpperCase().endsWith(".BED")) { + Utils.warnUser("Bed files are 0 based half-open intervals, which are converted to 1-based closed intervals in the GATK. " + + "Be aware that all output information and intervals are 1-based closed intervals."); BedParser parser = new BedParser(new File(interval)); - locs.addAll(parser.getSortedAndMergedLocations()); + locs.addAll(parser.getSortedAndMergedLocations(GenomeAnalysisEngine.instance.getArguments().intervalMerging)); } else { - locs.addAll(GenomeLocParser.intervalFileToList(interval)); + locs.addAll(GenomeLocParser.intervalFileToList(interval,GenomeAnalysisEngine.instance.getArguments().intervalMerging)); } } else { - locs.addAll(GenomeLocParser.parseGenomeLocs(interval)); + locs.addAll(GenomeLocParser.parseGenomeLocs(interval,GenomeAnalysisEngine.instance.getArguments().intervalMerging)); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java index 8021ce317..8544393d0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.fasta; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; @@ -27,7 +28,7 @@ public class PickSequenomProbes extends RefWalker { public void initialize() { if ( SNP_MASK != null ) { logger.info("Loading SNP mask... "); - mask_array = GenomeLocParser.parseIntervals(Arrays.asList(SNP_MASK)).toArray(); + mask_array = GenomeLocParser.parseIntervals(Arrays.asList(SNP_MASK), GenomeAnalysisEngine.instance.getArguments().intervalMerging).toArray(); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java index fc1a017be..0818be1c9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.walkers.*; @@ -56,7 +57,7 @@ public class IntervalMergerWalker extends ReadWalker { @Override public void initialize() { - intervals = new LinkedList(GenomeLocParser.parseIntervals(intervalsSource)); + intervals = new LinkedList(GenomeLocParser.parseIntervals(intervalsSource, GenomeAnalysisEngine.instance.getArguments().intervalMerging)); currentInterval = (intervals.size() > 0 ? intervals.removeFirst() : null); } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 0ab26c2b5..c17b596ba 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.io.File; @@ -133,12 +134,13 @@ public class GenomeLocParser { /** * Load one or more intervals sources, sorting and merging overlapping intervals. * @param intervalsSource Source of intervals. + * @param rule the merging rule we're using * @return a list of sorted, merged intervals. */ - public static List parseIntervals(List intervalsSource) { + public static List parseIntervals(List intervalsSource, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource); Collections.sort(parsedIntervals); - return GenomeLocParser.mergeOverlappingLocations(parsedIntervals); + return GenomeLocParser.mergeIntervalLocations(parsedIntervals, rule); } /** @@ -196,10 +198,11 @@ public class GenomeLocParser { * array of GenomeLoc objects * * @param str String representation of genome locs. Null string corresponds to no filter. + * @param rule the merging rule we're using * * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order */ - public static List parseGenomeLocs(final String str) { + public static List parseGenomeLocs(final String str, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { // Null string means no filter. if (str == null) return null; @@ -212,7 +215,7 @@ public class GenomeLocParser { locs.add(parseGenomeLoc(loc.trim())); Collections.sort(locs); //logger.info(String.format("Going to process %d locations", locs.length)); - locs = mergeOverlappingLocations(locs); + locs = mergeIntervalLocations(locs, rule); logger.debug("Locations are:" + Utils.join(", ", locs)); return locs; } catch (Exception e) { // TODO: fix this so that it passes the message from the exception, and doesn't print it out @@ -235,12 +238,13 @@ public class GenomeLocParser { * merge a list of genome locs that may be overlapping, returning the list of unique genomic locations * * @param raw the unchecked genome loc list + * @param rule the merging rule we're using * * @return the list of merged locations */ - public static List mergeOverlappingLocations(final List raw) { + public static List mergeIntervalLocations(final List raw, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { logger.debug(" Raw locations are: " + Utils.join(", ", raw)); - if (raw.size() <= 1) + if (raw.size() <= 1 || rule == GATKArgumentCollection.INTERVAL_MERGING_RULE.NONE) return raw; else { ArrayList merged = new ArrayList(); @@ -248,7 +252,9 @@ public class GenomeLocParser { GenomeLoc prev = it.next(); while (it.hasNext()) { GenomeLoc curr = it.next(); - if (prev.contiguousP(curr)) { + if (prev.overlapsP(curr)) { + prev = prev.merge(curr); + } else if (prev.contiguousP(curr) && rule == GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL) { prev = prev.merge(curr); } else { merged.add(prev); @@ -307,8 +313,9 @@ public class GenomeLocParser { * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param file_name + * @param rule also merge abutting intervals */ - public static List intervalFileToList(final String file_name) { + public static List intervalFileToList(final String file_name, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { /** * first try to read it as an interval file since that's well structured * we'll fail quickly if it's not a valid file. Then try to parse it as @@ -343,7 +350,7 @@ public class GenomeLocParser { List lines = reader.readLines(); reader.close(); String locStr = Utils.join(";", lines); - ret = parseGenomeLocs(locStr); + ret = parseGenomeLocs(locStr, rule); for(GenomeLoc locus: ret) verifyGenomeLocBounds(locus); return ret; diff --git a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java b/java/src/org/broadinstitute/sting/utils/bed/BedParser.java index d8aad4a8b..a8a8dcce8 100644 --- a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java +++ b/java/src/org/broadinstitute/sting/utils/bed/BedParser.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.bed; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -98,10 +99,15 @@ public class BedParser { return mLocations; } - public List getSortedAndMergedLocations() { + /** + * sort and merge the intervals, using the interval rule supplied + * @param rule the rule to merge intervals with + * @return a list of genome locs, sorted and merged + */ + public List getSortedAndMergedLocations(GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { List locs = new ArrayList(); locs.addAll(mLocations); Collections.sort(locs); - return GenomeLocParser.mergeOverlappingLocations(locs); + return GenomeLocParser.mergeIntervalLocations(locs, rule); } } diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java index abda2d95a..c8220ee38 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java @@ -1,13 +1,18 @@ package org.broadinstitute.sting.utils; import static junit.framework.Assert.assertTrue; + import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; + import static org.junit.Assert.assertEquals; + import org.junit.BeforeClass; import org.junit.Test; -import java.util.Arrays; + +import java.util.List; /** * @author aaron @@ -78,17 +83,34 @@ public class GenomeLocParserTest extends BaseTest { @Test(expected = RuntimeException.class) public void testParseBadLocations() { - GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0"); + GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0", GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); } @Test public void testParseGoodLocations() { - GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9"); + GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); } @Test(expected = RuntimeException.class) public void testParseGoodLocationsTooManySemiColons() { - GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;"); + GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;", GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); + } + + @Test + public void testOverlappingGoodLocationsWithAbuttingFlag() { + List locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.OVERLAPPING_ONLY); + assertEquals(1, locs.size()); + } + + @Test + public void testAbuttingGoodLocationsWithAbuttingOffFlag() { + List locs = GenomeLocParser.parseGenomeLocs("chr1:1-4;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.OVERLAPPING_ONLY); + assertEquals(2, locs.size()); + } + @Test + public void testAbuttingGoodLocationsWithNoneFlag() { + List locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.NONE); + assertEquals(2, locs.size()); } @Test @@ -145,7 +167,7 @@ public class GenomeLocParserTest extends BaseTest { long start = System.currentTimeMillis(); List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"})); Collections.sort(parsedIntervals); - LinkedList loc = new LinkedList(GenomeLocParser.mergeOverlappingLocations(parsedIntervals)); + LinkedList loc = new LinkedList(GenomeLocParser.mergeIntervalLocations(parsedIntervals)); long stop = System.currentTimeMillis(); logger.warn("Elapsed time = " + (stop - start)); }*/ @@ -158,22 +180,22 @@ public class GenomeLocParserTest extends BaseTest { assertEquals(1, loc.getStart()); } - @Test + @Test public void testGenomeLocParseOnlyChrome() { - GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1"); assertEquals(0, loc.getContigIndex()); assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - - @Test(expected = StingException.class) + + @Test(expected = StingException.class) public void testGenomeLocParseOnlyBadChrome() { - GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr12"); + GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr12"); assertEquals(0, loc.getContigIndex()); assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - + @Test(expected = StingException.class) public void testGenomeLocBad() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1-"); @@ -181,16 +203,16 @@ public class GenomeLocParserTest extends BaseTest { assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - - @Test(expected = StingException.class) + + @Test(expected = StingException.class) public void testGenomeLocBad2() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1-500-0"); assertEquals(0, loc.getContigIndex()); assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - - @Test(expected = StingException.class) + + @Test(expected = StingException.class) public void testGenomeLocBad3() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1--0"); assertEquals(0, loc.getContigIndex()); diff --git a/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java b/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java index cd98cc77a..b95657f91 100644 --- a/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java +++ b/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.bed; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -69,7 +70,7 @@ public class BedParserTest extends BaseTest { @Test public void testLoadBedFileOverlapping() { BedParser parser = new BedParser(bedFile); - List location = parser.getSortedAndMergedLocations(); + List location = parser.getSortedAndMergedLocations(GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); Assert.assertEquals(3, location.size()); } }