diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a89725fcb..89c7d25cc 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -141,7 +141,7 @@ public class GenomeAnalysisEngine { throw new StingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); } - // validate our parameters + // validate our parameters if (my_walker == null) throw new StingException("The walker passed to GenomeAnalysisEngine can not be null."); @@ -174,35 +174,105 @@ public class GenomeAnalysisEngine { * Setup the intervals to be processed */ private void initializeIntervals() { - GenomeLocSortedSet excludeIntervals = null; - if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) { - List rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL); - excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals); - } - if (argCollection.intervals != null && argCollection.intervalMerging.check()) { - List parsedIntervals = parseIntervalRegion(argCollection.intervals); - intervals = (parsedIntervals == null) ? null: GenomeLocSortedSet.createSetFromList(parsedIntervals); - } - /* - if (argCollection.intervals != null && argCollection.intervalMerging.check()) { - intervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals)); - } - */ - if ( excludeIntervals != null ) { - GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals; - long toPruneSize = toPrune.coveredSize(); - long toExcludeSize = excludeIntervals.coveredSize(); - logger.info(String.format("Initial include intervals cover %d bases", toPruneSize)); - logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize)); - intervals = toPrune.substractRegions( excludeIntervals ); - long intervalSize = intervals.coveredSize(); - logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); - } + // return null if no interval arguments at all + if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null)) + return; - if ( intervals != null ) - logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize())); + else { + // if include argument isn't given, create new set of all possible intervals + GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ? + GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : + parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging)); + + // if no exclude arguments, can return parseIntervalArguments directly + if (argCollection.excludeIntervals == null) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + GenomeLocSortedSet excludeSortedSet = parseIntervalArguments(argCollection.excludeIntervals, argCollection.intervalMerging); + intervals = includeSortedSet.substractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + long toPruneSize = includeSortedSet.coveredSize(); + long toExcludeSize = excludeSortedSet.coveredSize(); + long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals cover %d bases", toPruneSize)); + logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)", + toExcludeSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + } + } + + /** + * Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL + * Set is sorted and merged + */ + + public static GenomeLocSortedSet parseIntervalArguments(final List intervals) { + return parseIntervalArguments(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging); + } + + /** + * Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL + * Set is sorted and merged + */ + public static GenomeLocSortedSet parseIntervalArguments(List argList, IntervalMergingRule mergingRule) { + + List rawIntervals = new ArrayList(); // running list of raw GenomeLocs + + for (String argument : argList) { + + // if any interval argument is '-L all', consider all loci by returning no intervals + if (argument.equals("all")) { + if (argList.size() != 1) { + // throw error if '-L all' is not only interval - potentially conflicting commands + throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); + } + return null; + } + + // separate argument on semicolon first + for (String fileOrInterval : argument.split(";")) { + + // if it's a file, add items to raw interval list + if (isFile(fileOrInterval)) + rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule)); + + // otherwise treat as an interval -> parse and add to raw interval list + else { + rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval)); + } + } + } + // redundant check => default no arguments is null, not empty list + if (rawIntervals.size() == 0) + return null; + + // sort raw interval list + Collections.sort(rawIntervals); + + // now merge raw interval list + rawIntervals = GenomeLocParser.mergeIntervalLocations(rawIntervals, mergingRule); + + return GenomeLocSortedSet.createSetFromList(rawIntervals); + + } + + /** + * Check if string argument was intented as a file + * Accepted file extensions: .list, .interval_list, .bed, .picard + */ + private static boolean isFile(String str) { + // should we define list of file extensions as a public array somewhere? + // is regex or endsiwth better? + if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") || + str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST") + || str.toUpperCase().endsWith(".INTERVALS")) + return true; + else return false; } /** @@ -319,53 +389,6 @@ public class GenomeAnalysisEngine { return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads); } - /** - * setup the interval regions, from either the interval file of the genome region string - * - * @param intervals the list of intervals to parse - * @return a list of genomeLoc representing the interval file - */ - public static List parseIntervalRegion(final List intervals) { - return parseIntervalRegion(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging); - } - - /** - * setup the interval regions, from either the interval file of the genome region string - * - * @param intervals the list of intervals to parse - * @param mergingRule the rule for merging intervals - * @return a list of genomeLoc representing the interval file - */ - public static List parseIntervalRegion(final List intervals, IntervalMergingRule mergingRule) { - List locs = new ArrayList(); - for (String interval : intervals) { - // if any interval argument is '-L all', consider all loci by returning no intervals - if (interval.equals("all")) { - if (intervals.size() != 1) { - // throw error if '-L all' is not only interval - potentially conflicting commands - throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); - } - return new ArrayList(); - } - - if (new File(interval).exists()) { - // support for the bed style interval format - if (interval.toUpperCase().endsWith(".BED")) { - Utils.warnUser("Bed files are 0 based half-open intervals, which are converted to 1-based closed intervals in the GATK. " + - "Be aware that all output information and intervals are 1-based closed intervals."); - BedParser parser = new BedParser(new File(interval)); - locs.addAll(parser.getSortedAndMergedLocations(mergingRule)); - } else { - locs.addAll(GenomeLocParser.intervalFileToList(interval,mergingRule)); - } - } else { - locs.addAll(GenomeLocParser.parseGenomeLocs(interval,mergingRule)); - } - - } - return locs; - } - /** * Gets a unique identifier for the reader sourcing this read. * @param read Read to examine. diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java index 3aafcb69b..99a0032ff 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java @@ -5,9 +5,10 @@ import java.util.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; public class IntervalRodIterator implements Iterator { - private List locations = null; + //private List locations = null; private Iterator iter; private String trackName = null; @@ -18,15 +19,15 @@ public class IntervalRodIterator implements Iterator { public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) { //System.out.printf("Parsing %s for intervals %s%n", file, trackName); - List locs = GenomeAnalysisEngine.parseIntervalRegion(Collections.singletonList(file.getPath())); + GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Collections.singletonList(file.getPath())); //System.out.printf(" => got %d entries %n", locs.size()); return new IntervalRodIterator(trackName, locs); } - public IntervalRodIterator(String trackName, List locs) { + public IntervalRodIterator(String trackName, GenomeLocSortedSet locs) { this.trackName = trackName; - locations = locs; - iter = locations.iterator(); + //locations = locs; + iter = locs.iterator(); } @Override diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 1fd808f5d..a8734617c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -126,8 +126,8 @@ public class IndelRealigner extends ReadWalker { throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); // read in the intervals for cleaning - List locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY); - intervals = GenomeLocSortedSet.createSetFromList(locs).iterator(); + GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY); + intervals = locs.iterator(); currentInterval = intervals.hasNext() ? intervals.next() : null; // set up the output writer(s) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java index d5fc80e8e..60e2a30c8 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java @@ -118,13 +118,11 @@ public class VariantConcordanceROCCurveWalker extends RodWalker parseIntervals(List intervalsSource, IntervalMergingRule rule) { - List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource); - Collections.sort(parsedIntervals); - return GenomeLocParser.mergeIntervalLocations(parsedIntervals, rule); - } + + public static GenomeLoc parseGenomeInterval(final String str) { + GenomeLoc ret = parseGenomeLoc(str); + exceptionOnInvalidGenomeLocBounds(ret); + return ret; + } /** * parse a genome location, from a location string + * + * Performs read-style validation: + * checks that start and stop are positive, start < stop, and the contig is valid + * does not check that genomeLoc is actually on the contig * * @param str the string to parse * * @return a GenomeLoc representing the String + * */ public static GenomeLoc parseGenomeLoc(final String str) { // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' @@ -183,53 +195,18 @@ public class GenomeLocParser { if (bad) throw new StingException("Failed to parse Genome Location string: " + str); - if (start < 0) - throw new StingException("Invalid Genome Location start < 0: " + str + ' ' + start); - if (stop < 0) - throw new StingException("Invalid Genome Location stop < 0: " + str + ' ' + stop); - if (contig == null) - throw new StingException("Invalid Genome Location contig == null : " + str); - if (start > stop) - throw new StingException("Invalid Genome Location string; start position comes after end position: " + str ); - - if (!isContigValid(contig)) + // is the contig valid? + if (!isContigValid(contig)) throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference."); if (stop == Integer.MAX_VALUE && hasKnownContigOrdering()) // lookup the actually stop position! stop = getContigInfo(contig).getSequenceLength(); - - GenomeLoc loc = parseGenomeLoc(contig, start, stop); - return loc; - } - /** - * Useful utility function that parses a location string into a coordinate-order sorted - * array of GenomeLoc objects - * - * @param str String representation of genome locs. Null string corresponds to no filter. - * @param rule the merging rule we're using - * - * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order - */ - public static List parseGenomeLocs(final String str, IntervalMergingRule rule) { - // Null string means no filter. - if (str == null) return null; - - // Of the form: loc1;loc2;... - // Where each locN can be: - // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' - try { - List locs = new ArrayList(); - for (String loc : str.split(";")) - locs.add(parseGenomeLoc(loc.trim())); - Collections.sort(locs); - locs = mergeIntervalLocations(locs, rule); - return locs; - } catch (Exception e) { // TODO: fix this so that it passes the message from the exception, and doesn't print it out - throw new StingException(String.format("Invalid locations string: %s, format is loc1;loc2; where loc1 < loc2. Each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str), e); - } + GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig,true), start, stop); + exceptionOnInvalidGenomeLoc(locus); + return locus; } // -------------------------------------------------------------------------------------------------------------- @@ -252,7 +229,7 @@ public class GenomeLocParser { * @return the list of merged locations */ public static List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { - if (raw.size() <= 1 || rule == IntervalMergingRule.NONE) + if (raw.size() <= 1) return raw; else { ArrayList merged = new ArrayList(); @@ -282,19 +259,7 @@ public class GenomeLocParser { */ private static boolean isContigValid(String contig) { int contigIndex = contigInfo.getSequenceIndex(contig); - return isSequenceIndexValid(contigIndex); - } - - /** - * Determines whether the given sequence index is valid with respect to the sequence dictionary. - * - * @param sequenceIndex sequence index - * - * @return True if the sequence index is valid, false otherwise. - */ - private static boolean isSequenceIndexValid(int sequenceIndex) { - return sequenceIndex >= 0 && sequenceIndex < contigInfo.size(); - + return contigIndex >= 0 && contigIndex < contigInfo.size(); } /** @@ -305,6 +270,9 @@ public class GenomeLocParser { * @param stop Stop point. * * @return The genome location, or a MalformedGenomeLocException if unparseable. + * + * Validation: only checks that contig is valid + * start/stop could be anything */ public static GenomeLoc parseGenomeLoc(final String contig, long start, long stop) { if (!isContigValid(contig)) @@ -324,52 +292,70 @@ public class GenomeLocParser { * @param rule also merge abutting intervals */ public static List intervalFileToList(final String file_name, IntervalMergingRule rule) { + // try to open file + File inputFile = null; + try { + inputFile = new File(file_name); + } + catch (Exception e) { + throw new StingException("Could not open file", e); + } + + // check if file is empty + if (inputFile.exists() && inputFile.length() < 1) { + if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST) + return new ArrayList(); + else { + Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " + + "may want to fix (or exclude) this file."); + return null; + } + } + + // case: BED file + if (file_name.toUpperCase().endsWith(".BED")) { + BedParser parser = new BedParser(inputFile); + return parser.getSortedAndMergedLocations(rule); + } + /** + * IF not a BED file: * first try to read it as an interval file since that's well structured * we'll fail quickly if it's not a valid file. Then try to parse it as * a location string file */ - List ret = null; try { - File inputFile = new File(file_name); - - // sometimes we see an empty file passed as a parameter, if so return an empty list - if (inputFile.exists() && inputFile.length() < 1) { - if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST) - return new ArrayList(); - else { - Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " + - "may want to fix (or exclude) this file."); - return new ArrayList(); - } - } IntervalList il = IntervalList.fromFile(inputFile); // iterate through the list of merged intervals and add then as GenomeLocs - ret = new ArrayList(); + List ret = new ArrayList(); for (Interval interval : il.getUniqueIntervals()) { ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd())); } - return ret; + // always return null instead of empty list + return ret.isEmpty() ? null : ret; - } catch (Exception e) { + } + + // if that didn't work, try parsing file as an old fashioned string file + catch (Exception e) { try { - ret = new ArrayList(); + List ret = new ArrayList(); xReadLines reader = new xReadLines(new File(file_name)); for(String line: reader) { - List loci = parseGenomeLocs(line, rule); - if(loci != null) - ret.addAll(loci); + try { + ret.add(parseGenomeInterval(line)); + } + catch (Exception e2) { + throw new StingException(String.format("Unable to parse interval: %s in file: %s", line, file_name)); + } } reader.close(); - if(ret.isEmpty()) - return null; - - for(GenomeLoc locus: ret) - exceptionOnInvalidGenomeLocBounds(locus); - return ret; - } catch (Exception e2) { + // always return null instead of empty list + return ret.isEmpty() ? null : ret; + } + catch (Exception e2) { logger.error("Attempt to parse interval file in GATK format failed: " + e2.getMessage()); e2.printStackTrace(); throw new StingException("Unable to parse out interval file in either format", e); @@ -471,10 +457,16 @@ public class GenomeLocParser { /** * verify the specified genome loc is valid, if it's not, throw an exception * Will not verify the location against contig bounds. + * + * + * Validation: + * checks that start and stop are positive, start < stop, and the contig is valid + * does not check that genomeLoc is actually on the contig, so start could be > end of contig * * @param toReturn the genome loc we're about to return * * @return the genome loc if it's valid, otherwise we throw an exception + * */ private static GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) { if (toReturn.getStart() < 0) { @@ -496,16 +488,24 @@ public class GenomeLocParser { /** * Verify the locus against the bounds of the contig. + * + * performs boundary validation for genome loc INTERVALS: + * start and stop are on contig and start <= stop + * does NOT check that start and stop > 0, or that contig is valid + * for that reason, this function should only be called AFTER exceptionOnInvalidGenomeLoc() + * exceptionOnInvalidGenomeLoc isn't included in this function to save time + * * @param locus Locus to verify. */ private static void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) { - exceptionOnInvalidGenomeLoc(locus); - int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength(); if(locus.getStart() > contigSize) throw new StingException(String.format("GenomeLoc is invalid: locus start %d is after the end of contig %s",locus.getStart(),locus.getContig())); if(locus.getStop() > contigSize) throw new StingException(String.format("GenomeLoc is invalid: locus stop %d is after the end of contig %s",locus.getStop(),locus.getContig())); + if (locus.getStart() > locus.getStop()) { + throw new StingException("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position"); + } } /** @@ -514,6 +514,8 @@ public class GenomeLocParser { * @param loc the location to validate * * @return true if the passed in GenomeLoc represents a valid location + * + * performs interval-style validation: contig is valid and atart and stop less than the end */ public static boolean validGenomeLoc(GenomeLoc loc) { checkSetup(); @@ -541,6 +543,8 @@ public class GenomeLocParser { * @param stop the stop position * * @return true if it's valid, false otherwise + * + * performs interval-style validation: contig is valid and atart and stop less than the end */ public static boolean validGenomeLoc(String contig, long start, long stop) { checkSetup(); @@ -556,6 +560,8 @@ public class GenomeLocParser { * @param stop the stop position * * @return true if it's valid, false otherwise + * + * performs interval-style validation: contig is valid and atart and stop less than the end */ public static boolean validGenomeLoc(int contigIndex, long start, long stop) { checkSetup(); diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 6369f5d81..571d8beb0 100755 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -373,4 +373,31 @@ public class GenomeLocSortedSet extends AbstractSet { return s.toString(); } + + /** + * Check to see whether two genomeLocSortedSets are equal. + * Note that this implementation ignores the contigInfo object. + * + */ /* + @Override + public boolean equals(Object other) { + if(other == null) + return false; + if(other instanceof GenomeLocSortedSet) { + // send to a list, so we can ensure order correct + List otherList = ((GenomeLocSortedSet)other).toList(); + List thisList = this.toList(); + if (otherList.size() != this.size()) + return false; + + for (Integer i=0;i locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY); - assertEquals(1, locs.size()); - } - - @Test - public void testAbuttingGoodLocationsWithAbuttingOffFlag() { - List locs = GenomeLocParser.parseGenomeLocs("chr1:1-4;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY); - assertEquals(2, locs.size()); - } - @Test - public void testAbuttingGoodLocationsWithNoneFlag() { - List locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.NONE); - assertEquals(2, locs.size()); - } - + @Test public void testCreateGenomeLoc1() { GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1", 1, 100); @@ -162,16 +130,6 @@ public class GenomeLocParserTest extends BaseTest { assertEquals(1, copy.getStart()); } - /*@Test // - uncomment if you want to test speed - public void testGenomeLocParserList() { - long start = System.currentTimeMillis(); - List parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"})); - Collections.sort(parsedIntervals); - LinkedList loc = new LinkedList(GenomeLocParser.mergeIntervalLocations(parsedIntervals)); - long stop = System.currentTimeMillis(); - logger.warn("Elapsed time = " + (stop - start)); - }*/ - @Test public void testGenomeLocPlusSign() { GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1+");