diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index 945a0a69a..2bc802872 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -57,7 +57,7 @@ public class GATKArgumentCollection { public List samFiles = new ArrayList(); @ElementList(required = false) - @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required=false) + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required = false) public List readFilters = new ArrayList(); @ElementList(required = false) @@ -136,13 +136,18 @@ public class GATKArgumentCollection { @Argument(fullName = "numthreads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) public int numberOfThreads = 1; + /** What rule should we use when merging intervals */ + @Element(required = false) + @Argument(fullName = "interval_merging", shortName = "im", doc = "What interval merging rule should we use {ALL [DEFAULT],OVERLAPPING_ONLY,NONE}.", required = false) + public INTERVAL_MERGING_RULE intervalMerging = INTERVAL_MERGING_RULE.ALL; + /** * marshal the data out to a object * * @param collection the GATKArgumentCollection to load into * @param outputFile the file to write to */ - public static void marshal( GATKArgumentCollection collection, String outputFile ) { + public static void marshal(GATKArgumentCollection collection, String outputFile) { Serializer serializer = new Persister(new Format(new HyphenStyle())); File result = new File(outputFile); try { @@ -158,7 +163,7 @@ public class GATKArgumentCollection { * @param collection the GATKArgumentCollection to load into * @param outputFile the stream to write to */ - public static void marshal( GATKArgumentCollection collection, PrintStream outputFile ) { + public static void marshal(GATKArgumentCollection collection, PrintStream outputFile) { Serializer serializer = new Persister(new Format(new HyphenStyle())); try { serializer.write(collection, outputFile); @@ -172,7 +177,7 @@ public class GATKArgumentCollection { * * @param filename the filename to marshal from */ - public static GATKArgumentCollection unmarshal( String filename ) { + public static GATKArgumentCollection unmarshal(String filename) { Serializer serializer = new Persister(new Format(new HyphenStyle())); File source = new File(filename); try { @@ -188,7 +193,7 @@ public class GATKArgumentCollection { * * @param file the inputstream to marshal from */ - public static GATKArgumentCollection unmarshal( InputStream file ) { + public static GATKArgumentCollection unmarshal(InputStream file) { Serializer serializer = new Persister(new Format(new HyphenStyle())); try { GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, file); @@ -207,7 +212,7 @@ public class GATKArgumentCollection { * * @return true if they're equal */ - public boolean equals( GATKArgumentCollection other ) { + public boolean equals(GATKArgumentCollection other) { if (other.samFiles.size() != samFiles.size()) { return false; } @@ -260,18 +265,18 @@ public class GATKArgumentCollection { return false; } if (other.readMaxPileup != this.readMaxPileup) { - return false; - } - if (( other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null ) || - ( other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads) )) { return false; } - if (( other.downsampleFraction == null && this.downsampleFraction != null ) || - ( other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction) )) { + if ((other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null) || + (other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads))) { return false; } - if (( other.downsampleCoverage == null && this.downsampleCoverage != null ) || - ( other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage) )) { + if ((other.downsampleFraction == null && this.downsampleFraction != null) || + (other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction))) { + return false; + } + if ((other.downsampleCoverage == null && this.downsampleCoverage != null) || + (other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) { return false; } if (!other.outFileName.equals(this.outFileName)) { @@ -286,7 +291,26 @@ public class GATKArgumentCollection { if (other.numberOfThreads != this.numberOfThreads) { return false; } + if (other.intervalMerging != this.intervalMerging) { + return false; + } return true; } + + /** + * a class we use to determine the merging rules for intervals passed to the GATK + */ + public enum INTERVAL_MERGING_RULE { + ALL, // we merge both overlapping intervals and abutting intervals + OVERLAPPING_ONLY, // We merge intervals that are overlapping, but NOT ones that only abut each other + NONE; // we merge neither overlapping or abutting intervals, the list of intervals is sorted, but not merged + + public boolean check() { + if (this.compareTo(NONE) == 0) + throw new UnsupportedOperationException("We Currently do not support INTERVAL_MERGING_RULE.NONE"); + return true; + } + } } + diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a86e49edd..dce280eb8 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -153,7 +153,7 @@ public class GenomeAnalysisEngine { initializeOutputStreams(my_walker, microScheduler.getOutputTracker()); GenomeLocSortedSet locs = null; - if (argCollection.intervals != null) { + if (argCollection.intervals != null && argCollection.intervalMerging.check()) { locs = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals)); } @@ -296,16 +296,16 @@ public class GenomeAnalysisEngine { for (String interval : intervals) { if (new File(interval).exists()) { // support for the bed style interval format - if (interval.endsWith(".bed") || interval.endsWith(".BED")) { - Utils.warnUser("Bed files are zero based half open intervals, which are converted to one based closed intervals in the GATK. " + - "Be aware that all output information and intervals are one based closed intervals."); + if (interval.toUpperCase().endsWith(".BED")) { + Utils.warnUser("Bed files are 0 based half-open intervals, which are converted to 1-based closed intervals in the GATK. " + + "Be aware that all output information and intervals are 1-based closed intervals."); BedParser parser = new BedParser(new File(interval)); - locs.addAll(parser.getSortedAndMergedLocations()); + locs.addAll(parser.getSortedAndMergedLocations(GenomeAnalysisEngine.instance.getArguments().intervalMerging)); } else { - locs.addAll(GenomeLocParser.intervalFileToList(interval)); + locs.addAll(GenomeLocParser.intervalFileToList(interval,GenomeAnalysisEngine.instance.getArguments().intervalMerging)); } } else { - locs.addAll(GenomeLocParser.parseGenomeLocs(interval)); + locs.addAll(GenomeLocParser.parseGenomeLocs(interval,GenomeAnalysisEngine.instance.getArguments().intervalMerging)); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java index 8021ce317..8544393d0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/fasta/PickSequenomProbes.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.fasta; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; @@ -27,7 +28,7 @@ public class PickSequenomProbes extends RefWalker { public void initialize() { if ( SNP_MASK != null ) { logger.info("Loading SNP mask... "); - mask_array = GenomeLocParser.parseIntervals(Arrays.asList(SNP_MASK)).toArray(); + mask_array = GenomeLocParser.parseIntervals(Arrays.asList(SNP_MASK), GenomeAnalysisEngine.instance.getArguments().intervalMerging).toArray(); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java index fc1a017be..0818be1c9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IntervalMergerWalker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.walkers.*; @@ -56,7 +57,7 @@ public class IntervalMergerWalker extends ReadWalker { @Override public void initialize() { - intervals = new LinkedList(GenomeLocParser.parseIntervals(intervalsSource)); + intervals = new LinkedList(GenomeLocParser.parseIntervals(intervalsSource, GenomeAnalysisEngine.instance.getArguments().intervalMerging)); currentInterval = (intervals.size() > 0 ? intervals.removeFirst() : null); } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 0ab26c2b5..c17b596ba 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.io.File; @@ -133,12 +134,13 @@ public class GenomeLocParser { /** * Load one or more intervals sources, sorting and merging overlapping intervals. * @param intervalsSource Source of intervals. + * @param rule the merging rule we're using * @return a list of sorted, merged intervals. */ - public static List parseIntervals(List intervalsSource) { + public static List parseIntervals(List intervalsSource, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource); Collections.sort(parsedIntervals); - return GenomeLocParser.mergeOverlappingLocations(parsedIntervals); + return GenomeLocParser.mergeIntervalLocations(parsedIntervals, rule); } /** @@ -196,10 +198,11 @@ public class GenomeLocParser { * array of GenomeLoc objects * * @param str String representation of genome locs. Null string corresponds to no filter. + * @param rule the merging rule we're using * * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order */ - public static List parseGenomeLocs(final String str) { + public static List parseGenomeLocs(final String str, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { // Null string means no filter. if (str == null) return null; @@ -212,7 +215,7 @@ public class GenomeLocParser { locs.add(parseGenomeLoc(loc.trim())); Collections.sort(locs); //logger.info(String.format("Going to process %d locations", locs.length)); - locs = mergeOverlappingLocations(locs); + locs = mergeIntervalLocations(locs, rule); logger.debug("Locations are:" + Utils.join(", ", locs)); return locs; } catch (Exception e) { // TODO: fix this so that it passes the message from the exception, and doesn't print it out @@ -235,12 +238,13 @@ public class GenomeLocParser { * merge a list of genome locs that may be overlapping, returning the list of unique genomic locations * * @param raw the unchecked genome loc list + * @param rule the merging rule we're using * * @return the list of merged locations */ - public static List mergeOverlappingLocations(final List raw) { + public static List mergeIntervalLocations(final List raw, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { logger.debug(" Raw locations are: " + Utils.join(", ", raw)); - if (raw.size() <= 1) + if (raw.size() <= 1 || rule == GATKArgumentCollection.INTERVAL_MERGING_RULE.NONE) return raw; else { ArrayList merged = new ArrayList(); @@ -248,7 +252,9 @@ public class GenomeLocParser { GenomeLoc prev = it.next(); while (it.hasNext()) { GenomeLoc curr = it.next(); - if (prev.contiguousP(curr)) { + if (prev.overlapsP(curr)) { + prev = prev.merge(curr); + } else if (prev.contiguousP(curr) && rule == GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL) { prev = prev.merge(curr); } else { merged.add(prev); @@ -307,8 +313,9 @@ public class GenomeLocParser { * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param file_name + * @param rule also merge abutting intervals */ - public static List intervalFileToList(final String file_name) { + public static List intervalFileToList(final String file_name, GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { /** * first try to read it as an interval file since that's well structured * we'll fail quickly if it's not a valid file. Then try to parse it as @@ -343,7 +350,7 @@ public class GenomeLocParser { List lines = reader.readLines(); reader.close(); String locStr = Utils.join(";", lines); - ret = parseGenomeLocs(locStr); + ret = parseGenomeLocs(locStr, rule); for(GenomeLoc locus: ret) verifyGenomeLocBounds(locus); return ret; diff --git a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java b/java/src/org/broadinstitute/sting/utils/bed/BedParser.java index d8aad4a8b..a8a8dcce8 100644 --- a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java +++ b/java/src/org/broadinstitute/sting/utils/bed/BedParser.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.bed; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -98,10 +99,15 @@ public class BedParser { return mLocations; } - public List getSortedAndMergedLocations() { + /** + * sort and merge the intervals, using the interval rule supplied + * @param rule the rule to merge intervals with + * @return a list of genome locs, sorted and merged + */ + public List getSortedAndMergedLocations(GATKArgumentCollection.INTERVAL_MERGING_RULE rule) { List locs = new ArrayList(); locs.addAll(mLocations); Collections.sort(locs); - return GenomeLocParser.mergeOverlappingLocations(locs); + return GenomeLocParser.mergeIntervalLocations(locs, rule); } } diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java index abda2d95a..c8220ee38 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java @@ -1,13 +1,18 @@ package org.broadinstitute.sting.utils; import static junit.framework.Assert.assertTrue; + import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; + import static org.junit.Assert.assertEquals; + import org.junit.BeforeClass; import org.junit.Test; -import java.util.Arrays; + +import java.util.List; /** * @author aaron @@ -78,17 +83,34 @@ public class GenomeLocParserTest extends BaseTest { @Test(expected = RuntimeException.class) public void testParseBadLocations() { - GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0"); + GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0", GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); } @Test public void testParseGoodLocations() { - GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9"); + GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); } @Test(expected = RuntimeException.class) public void testParseGoodLocationsTooManySemiColons() { - GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;"); + GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;", GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); + } + + @Test + public void testOverlappingGoodLocationsWithAbuttingFlag() { + List locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.OVERLAPPING_ONLY); + assertEquals(1, locs.size()); + } + + @Test + public void testAbuttingGoodLocationsWithAbuttingOffFlag() { + List locs = GenomeLocParser.parseGenomeLocs("chr1:1-4;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.OVERLAPPING_ONLY); + assertEquals(2, locs.size()); + } + @Test + public void testAbuttingGoodLocationsWithNoneFlag() { + List locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", GATKArgumentCollection.INTERVAL_MERGING_RULE.NONE); + assertEquals(2, locs.size()); } @Test @@ -145,7 +167,7 @@ public class GenomeLocParserTest extends BaseTest { long start = System.currentTimeMillis(); List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"})); Collections.sort(parsedIntervals); - LinkedList loc = new LinkedList(GenomeLocParser.mergeOverlappingLocations(parsedIntervals)); + LinkedList loc = new LinkedList(GenomeLocParser.mergeIntervalLocations(parsedIntervals)); long stop = System.currentTimeMillis(); logger.warn("Elapsed time = " + (stop - start)); }*/ @@ -158,22 +180,22 @@ public class GenomeLocParserTest extends BaseTest { assertEquals(1, loc.getStart()); } - @Test + @Test public void testGenomeLocParseOnlyChrome() { - GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1"); assertEquals(0, loc.getContigIndex()); assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - - @Test(expected = StingException.class) + + @Test(expected = StingException.class) public void testGenomeLocParseOnlyBadChrome() { - GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr12"); + GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr12"); assertEquals(0, loc.getContigIndex()); assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - + @Test(expected = StingException.class) public void testGenomeLocBad() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1-"); @@ -181,16 +203,16 @@ public class GenomeLocParserTest extends BaseTest { assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - - @Test(expected = StingException.class) + + @Test(expected = StingException.class) public void testGenomeLocBad2() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1-500-0"); assertEquals(0, loc.getContigIndex()); assertEquals(10, loc.getStop()); // the size assertEquals(1, loc.getStart()); } - - @Test(expected = StingException.class) + + @Test(expected = StingException.class) public void testGenomeLocBad3() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1--0"); assertEquals(0, loc.getContigIndex()); diff --git a/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java b/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java index cd98cc77a..b95657f91 100644 --- a/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java +++ b/java/test/org/broadinstitute/sting/utils/bed/BedParserTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.bed; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -69,7 +70,7 @@ public class BedParserTest extends BaseTest { @Test public void testLoadBedFileOverlapping() { BedParser parser = new BedParser(bedFile); - List location = parser.getSortedAndMergedLocations(); + List location = parser.getSortedAndMergedLocations(GATKArgumentCollection.INTERVAL_MERGING_RULE.ALL); Assert.assertEquals(3, location.size()); } }