Reorganizing the way interval arguments are processed

Most of the changes occur in GenomeAnalysisEngine.java and GenomeLocParser.java: -- parseIntervalRegion and parseGenomeLocs combined into parseIntervalArguments -- initializeIntervals modified -- some helper functions deprecated for cleanliness Includes new set of unit tests, GenomeAnalysisEngineTest.java New restrictions: -- all interval arguments are now checked to be on the reference contig -- all interval files must have one of the following extensions: .picard, .bed, .list, .intervals, .interval_list git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3106 348d0f76-0448-11de-a6fe-93d51630548a
2010-04-01 12:47:48 +00:00 · 2010-04-01 12:47:48 +00:00 · b4f6f54502
parent c3c6e632d1
commit b4f6f54502
7 changed files with 239 additions and 226 deletions
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@ -141,7 +141,7 @@ public class GenomeAnalysisEngine {
            throw new StingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
        }

-        // validate our parameters
+        // validate our parameters              
        if (my_walker == null)
            throw new StingException("The walker passed to GenomeAnalysisEngine can not be null.");

@ -174,35 +174,105 @@ public class GenomeAnalysisEngine {
     * Setup the intervals to be processed
     */
    private void initializeIntervals() {
-        GenomeLocSortedSet excludeIntervals = null;
-        if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
-            List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
-            excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals);
-        }

-        if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
-            List <GenomeLoc> parsedIntervals = parseIntervalRegion(argCollection.intervals);
-            intervals = (parsedIntervals == null) ? null: GenomeLocSortedSet.createSetFromList(parsedIntervals);
-        }
-        /*
-        if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
-            intervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals));
-        }
-          */
-        if ( excludeIntervals != null ) {
-            GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
-            long toPruneSize = toPrune.coveredSize();
-            long toExcludeSize = excludeIntervals.coveredSize();
-            logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
-            logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
-            intervals = toPrune.substractRegions( excludeIntervals );
-            long intervalSize = intervals.coveredSize();
-            logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
-                    toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
-        }
+        // return null if no interval arguments at all
+        if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null))
+            return;

-        if ( intervals != null )
-            logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
+        else {
+            // if include argument isn't given, create new set of all possible intervals
+            GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
+                    GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
+                    parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
+
+            // if no exclude arguments, can return parseIntervalArguments directly
+            if (argCollection.excludeIntervals == null)
+                intervals = includeSortedSet;
+
+            // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
+            else {
+                GenomeLocSortedSet excludeSortedSet = parseIntervalArguments(argCollection.excludeIntervals, argCollection.intervalMerging);
+                intervals = includeSortedSet.substractRegions(excludeSortedSet);
+
+                // logging messages only printed when exclude (-XL) arguments are given
+                long toPruneSize = includeSortedSet.coveredSize();
+                long toExcludeSize = excludeSortedSet.coveredSize();
+                long intervalSize = intervals.coveredSize();
+                logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
+                logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
+                        toExcludeSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
+            }
+
+        }
+    }
+
+    /**
+     * Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
+     * Set is sorted and merged
+     */
+
+    public static GenomeLocSortedSet parseIntervalArguments(final List<String> intervals) {
+        return parseIntervalArguments(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
+    }
+
+    /**
+     * Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
+     * Set is sorted and merged
+     */
+    public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
+
+        List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>();    // running list of raw GenomeLocs
+
+        for (String argument : argList) {
+
+            // if any interval argument is '-L all', consider all loci by returning no intervals
+            if (argument.equals("all")) {
+                if (argList.size() != 1) {
+                    // throw error if '-L all' is not only interval - potentially conflicting commands
+                    throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
+                }
+                return null;
+            }                         
+
+            // separate argument on semicolon first
+            for (String fileOrInterval : argument.split(";")) {
+
+                // if it's a file, add items to raw interval list
+                if (isFile(fileOrInterval))
+                    rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
+
+                    // otherwise treat as an interval -> parse and add to raw interval list
+                else {
+                    rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
+                }
+            }
+        }
+        // redundant check => default no arguments is null, not empty list
+        if (rawIntervals.size() == 0)
+            return null;
+
+        // sort raw interval list
+        Collections.sort(rawIntervals);
+
+        // now merge raw interval list
+        rawIntervals = GenomeLocParser.mergeIntervalLocations(rawIntervals, mergingRule);
+
+        return GenomeLocSortedSet.createSetFromList(rawIntervals);
+
+    }
+
+    /**
+     * Check if string argument was intented as a file
+     * Accepted file extensions: .list, .interval_list, .bed, .picard
+     */
+    private static boolean isFile(String str) {
+        // should we define list of file extensions as a public array somewhere?
+        // is regex or endsiwth better?
+        if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") ||
+                str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST")
+                || str.toUpperCase().endsWith(".INTERVALS"))
+            return true;
+        else return false;
    }

    /**
@ -319,53 +389,6 @@ public class GenomeAnalysisEngine {
        return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads);
    }

-    /**
-     * setup the interval regions, from either the interval file of the genome region string
-     *
-     * @param intervals the list of intervals to parse
-     * @return a list of genomeLoc representing the interval file
-     */
-    public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals) {
-        return parseIntervalRegion(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
-    }
-
-    /**
-     * setup the interval regions, from either the interval file of the genome region string
-     *
-     * @param intervals the list of intervals to parse
-     * @param mergingRule the rule for merging intervals
-     * @return a list of genomeLoc representing the interval file
-     */
-    public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals, IntervalMergingRule mergingRule) {
-        List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
-        for (String interval : intervals) {
-            // if any interval argument is '-L all', consider all loci by returning no intervals
-            if (interval.equals("all")) {
-                if (intervals.size() != 1) {
-                    // throw error if '-L all' is not only interval - potentially conflicting commands
-                    throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
-                }
-                return new ArrayList<GenomeLoc>();
-            }
-
-            if (new File(interval).exists()) {
-                // support for the bed style interval format
-                if (interval.toUpperCase().endsWith(".BED")) {
-                    Utils.warnUser("Bed files are 0 based half-open intervals, which are converted to 1-based closed intervals in the GATK.  " +
-                            "Be aware that all output information and intervals are 1-based closed intervals.");
-                    BedParser parser = new BedParser(new File(interval));
-                    locs.addAll(parser.getSortedAndMergedLocations(mergingRule));
-                } else {
-                    locs.addAll(GenomeLocParser.intervalFileToList(interval,mergingRule));
-                }
-            } else {
-                locs.addAll(GenomeLocParser.parseGenomeLocs(interval,mergingRule));
-            }
-
-        }
-        return locs;
-    }
-
    /**
     * Gets a unique identifier for the reader sourcing this read.
     * @param read Read to examine.
--- a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java
@ -5,9 +5,10 @@ import java.util.*;

 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocSortedSet;

 public class IntervalRodIterator implements Iterator<IntervalRod> {
-    private List<GenomeLoc> locations = null;
+    //private List<GenomeLoc> locations = null;
    private Iterator<GenomeLoc> iter;
    private String trackName = null;

@ -18,15 +19,15 @@ public class IntervalRodIterator implements Iterator<IntervalRod> {

    public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
        //System.out.printf("Parsing %s for intervals %s%n", file, trackName);
-        List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Collections.singletonList(file.getPath()));
+        GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Collections.singletonList(file.getPath()));
        //System.out.printf(" => got %d entries %n", locs.size());
        return new IntervalRodIterator(trackName, locs);
    }

-	public IntervalRodIterator(String trackName, List<GenomeLoc> locs) {
+	public IntervalRodIterator(String trackName, GenomeLocSortedSet locs) {
        this.trackName = trackName;
-        locations = locs;
-        iter = locations.iterator();
+        //locations = locs;
+        iter = locs.iterator();
 	}

 	@Override
--- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@ -126,8 +126,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
            throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");

        // read in the intervals for cleaning
-        List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
-        intervals = GenomeLocSortedSet.createSetFromList(locs).iterator();
+        GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
+        intervals = locs.iterator();
        currentInterval = intervals.hasNext() ? intervals.next() : null;

        // set up the output writer(s)
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java
@ -118,13 +118,11 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi

        for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
            if( vc != null && vc.getName().toUpperCase().startsWith("TRUTH") ) {
-                if( vc.isSNP() && !vc.isFiltered() ) {
-                    if( !vc.getGenotype(sampleName).isNoCall() ) {
-                        isInTruthSet = true;
+                if( !vc.getGenotype(sampleName).isNoCall() ) {
+                    isInTruthSet = true;

-                        if( !vc.getGenotype(sampleName).isHomRef() ) {
-                            isTrueVariant = true;
-                        }
+                    if( !vc.getGenotype(sampleName).isHomRef() ) {
+                        isTrueVariant = true;
                    }
                }
                //if( vc.isPolymorphic() ) { //BUGBUG: I don't think this is the right thing to do here, there are many polymorphic sites in the truth data because there are many samples
@ -259,7 +257,7 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
                final double sensitivity = ((double) truePos[curveIndex]) / ((double) truePos[curveIndex] + falseNegGlobal[curveIndex] + falseNeg[curveIndex]);
                final double specificity = ((double) trueNegGlobal[curveIndex] + trueNeg[curveIndex]) /
                                                ((double) falsePos[curveIndex] + trueNegGlobal[curveIndex] + trueNeg[curveIndex]);
-                outputFile.print( String.format("%.8f,%.8f,%.8f,", qualCut[curveIndex], sensitivity, 1.0 - specificity) );
+                outputFile.print( String.format("%.4f,%.4f,%.4f,", qualCut[curveIndex], sensitivity, 1.0 - specificity) );
                qualCut[curveIndex] += incrementQual[curveIndex];
                curveIndex++;
            }
--- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
+++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java
@ -10,6 +10,7 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
 import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
+import org.broadinstitute.sting.utils.bed.BedParser;

 import java.io.File;
 import java.util.ArrayList;
@ -133,24 +134,35 @@ public class GenomeLocParser {
        return true;
    }

-    /**
-     * Load one or more intervals sources, sorting and merging overlapping intervals.
-     * @param intervalsSource Source of intervals.
-     * @param rule the merging rule we're using
-     * @return a list of sorted, merged intervals.
+     /**
+     * parse a genome interval, from a location string
+     *
+     * Performs interval-style validation:                                                                                                                                     
+     *
+     * contig is valid; start and stop less than the end; start <= sto
+     * @param str the string to parse
+     *
+     * @return a GenomeLoc representing the String
+     *
     */
-    public static List<GenomeLoc> parseIntervals(List<String> intervalsSource, IntervalMergingRule rule) {
-        List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
-	Collections.sort(parsedIntervals);
-	return GenomeLocParser.mergeIntervalLocations(parsedIntervals, rule);
-    }
+
+    public static GenomeLoc parseGenomeInterval(final String str) {
+         GenomeLoc ret = parseGenomeLoc(str);
+         exceptionOnInvalidGenomeLocBounds(ret);
+         return ret;
+     }

    /**
     * parse a genome location, from a location string
+     *               
+     * Performs read-style validation:    
+     * checks that start and stop are positive, start < stop, and the contig is valid
+     * does not check that genomeLoc is actually on the contig
     *
     * @param str the string to parse
     *
     * @return a GenomeLoc representing the String
+     *
     */
    public static GenomeLoc parseGenomeLoc(final String str) {
        // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
@ -183,53 +195,18 @@ public class GenomeLocParser {

        if (bad)
 		    throw new StingException("Failed to parse Genome Location string: " + str);
-        if (start < 0)
-		    throw new StingException("Invalid Genome Location start < 0: " + str + ' ' + start);
-        if (stop < 0)
-            throw new StingException("Invalid Genome Location stop < 0: " + str + ' ' + stop);
-        if (contig == null)
-            throw new StingException("Invalid Genome Location contig == null : " + str);

-        if (start > stop)
-		    throw new StingException("Invalid Genome Location string; start position comes after end position: " + str );
-		        
-		if (!isContigValid(contig))
+        // is the contig valid?
+        if (!isContigValid(contig))
            throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference.");

 		if (stop == Integer.MAX_VALUE && hasKnownContigOrdering())
            // lookup the actually stop position!
            stop = getContigInfo(contig).getSequenceLength();
-        
-        GenomeLoc loc = parseGenomeLoc(contig, start, stop);
-        return loc;
-    }

-    /**
-     * Useful utility function that parses a location string into a coordinate-order sorted
-     * array of GenomeLoc objects
-     *
-     * @param str String representation of genome locs.  Null string corresponds to no filter.
-     * @param rule the merging rule we're using
-     *
-     * @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
-     */
-    public static List<GenomeLoc> parseGenomeLocs(final String str, IntervalMergingRule rule) {
-        // Null string means no filter.
-        if (str == null) return null;
-
-        // Of the form: loc1;loc2;...
-        // Where each locN can be:
-        // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
-        try {
-            List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
-            for (String loc : str.split(";"))
-                locs.add(parseGenomeLoc(loc.trim()));
-            Collections.sort(locs);
-            locs = mergeIntervalLocations(locs, rule);            
-            return locs;
-        } catch (Exception e) { // TODO: fix this so that it passes the message from the exception, and doesn't print it out
-            throw new StingException(String.format("Invalid locations string: %s, format is loc1;loc2; where loc1 < loc2. Each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str), e);
-        }                                                                                                                        
+        GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig,true), start, stop);
+        exceptionOnInvalidGenomeLoc(locus);
+        return locus;
    }

    // --------------------------------------------------------------------------------------------------------------
@ -252,7 +229,7 @@ public class GenomeLocParser {
     * @return the list of merged locations
     */
    public static List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, IntervalMergingRule rule) {
-        if (raw.size() <= 1 || rule == IntervalMergingRule.NONE)
+        if (raw.size() <= 1)
            return raw;
        else {
            ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
@ -282,19 +259,7 @@ public class GenomeLocParser {
     */
    private static boolean isContigValid(String contig) {
        int contigIndex = contigInfo.getSequenceIndex(contig);
-        return isSequenceIndexValid(contigIndex);
-    }
-
-    /**
-     * Determines whether the given sequence index is valid with respect to the sequence dictionary.
-     *
-     * @param sequenceIndex sequence index
-     *
-     * @return True if the sequence index is valid, false otherwise.
-     */
-    private static boolean isSequenceIndexValid(int sequenceIndex) {
-        return sequenceIndex >= 0 && sequenceIndex < contigInfo.size();
-
+        return contigIndex >= 0 && contigIndex < contigInfo.size();
    }

    /**
@ -305,6 +270,9 @@ public class GenomeLocParser {
     * @param stop   Stop point.
     *
     * @return The genome location, or a MalformedGenomeLocException if unparseable.
+     *
+     * Validation: only checks that contig is valid
+     * start/stop could be anything
     */
    public static GenomeLoc parseGenomeLoc(final String contig, long start, long stop) {
        if (!isContigValid(contig))
@ -324,52 +292,70 @@ public class GenomeLocParser {
     * @param rule also merge abutting intervals
     */
    public static List<GenomeLoc> intervalFileToList(final String file_name, IntervalMergingRule rule) {
+        // try to open file
+        File inputFile = null;
+        try {
+            inputFile = new File(file_name);
+        }
+        catch (Exception e) {
+            throw new StingException("Could not open file", e);
+        }
+
+        // check if file is empty
+        if (inputFile.exists() && inputFile.length() < 1) {
+            if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)
+                return new ArrayList<GenomeLoc>();
+            else {
+                Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
+                        "may want to fix (or exclude) this file.");
+                return null;
+            }
+        }
+
+        // case: BED file
+        if (file_name.toUpperCase().endsWith(".BED")) {
+            BedParser parser = new BedParser(inputFile);
+            return parser.getSortedAndMergedLocations(rule);
+        }
+
        /**
+         * IF not a BED file:
         * first try to read it as an interval file since that's well structured
         * we'll fail quickly if it's not a valid file.  Then try to parse it as
         * a location string file
         */
-        List<GenomeLoc> ret = null;
        try {
-            File inputFile = new File(file_name);
-
-            // sometimes we see an empty file passed as a parameter, if so return an empty list
-            if (inputFile.exists() && inputFile.length() < 1) {
-                if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)
-                    return new ArrayList<GenomeLoc>();
-                else {
-                    Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
-                            "may want to fix (or exclude) this file.");
-                    return new ArrayList<GenomeLoc>();
-                }
-            }
            IntervalList il = IntervalList.fromFile(inputFile);

            // iterate through the list of merged intervals and add then as GenomeLocs
-            ret = new ArrayList<GenomeLoc>();
+            List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
            for (Interval interval : il.getUniqueIntervals()) {
                ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd()));
            }
-            return ret;
+            // always return null instead of empty list
+            return ret.isEmpty() ? null : ret;

-        } catch (Exception e) {
+        }
+
+        // if that didn't work, try parsing file as an old fashioned string file
+        catch (Exception e) {
            try {
-                ret = new ArrayList<GenomeLoc>();
+                List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
                xReadLines reader = new xReadLines(new File(file_name));
                for(String line: reader) {
-                    List<GenomeLoc> loci = parseGenomeLocs(line, rule);
-                    if(loci != null)
-                        ret.addAll(loci);
+                    try {
+                        ret.add(parseGenomeInterval(line));
+                    }
+                    catch (Exception e2) {
+                        throw new StingException(String.format("Unable to parse interval: %s in file: %s", line, file_name));
+                    }
                }
                reader.close();

-                if(ret.isEmpty())
-                    return null;
-
-                for(GenomeLoc locus: ret)
-                    exceptionOnInvalidGenomeLocBounds(locus);
-                return ret;
-            } catch (Exception e2) {
+                // always return null instead of empty list
+                return ret.isEmpty() ? null : ret;
+            }
+            catch (Exception e2) {
                logger.error("Attempt to parse interval file in GATK format failed: " + e2.getMessage());
                e2.printStackTrace();
                throw new StingException("Unable to parse out interval file in either format", e);
@ -471,10 +457,16 @@ public class GenomeLocParser {
    /**
     * verify the specified genome loc is valid, if it's not, throw an exception
     * Will not verify the location against contig bounds.
+     *     
+     *
+     * Validation:  
+     * checks that start and stop are positive, start < stop, and the contig is valid                                                                      
+     * does not check that genomeLoc is actually on the contig, so start could be > end of contig  
     *
     * @param toReturn the genome loc we're about to return
     *
     * @return the genome loc if it's valid, otherwise we throw an exception
+     *
     */
    private static GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) {
        if (toReturn.getStart() < 0) {
@ -496,16 +488,24 @@ public class GenomeLocParser {

    /**
     * Verify the locus against the bounds of the contig.
+     *
+     * performs boundary validation for genome loc INTERVALS:
+     * start and stop are on contig and start <= stop
+     * does NOT check that start and stop > 0, or that contig is valid
+     * for that reason, this function should only be called AFTER exceptionOnInvalidGenomeLoc()
+     * exceptionOnInvalidGenomeLoc isn't included in this function to save time
+     *
     * @param locus Locus to verify.
     */
    private static void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) {
-        exceptionOnInvalidGenomeLoc(locus);
-
        int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength();
        if(locus.getStart() > contigSize)
            throw new StingException(String.format("GenomeLoc is invalid: locus start %d is after the end of contig %s",locus.getStart(),locus.getContig()));
        if(locus.getStop() > contigSize)
            throw new StingException(String.format("GenomeLoc is invalid: locus stop %d is after the end of contig %s",locus.getStop(),locus.getContig()));
+        if (locus.getStart() > locus.getStop()) {
+            throw new StingException("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position");
+        }
    }

    /**
@ -514,6 +514,8 @@ public class GenomeLocParser {
     * @param loc the location to validate
     *
     * @return true if the passed in GenomeLoc represents a valid location
+     *
+     * performs interval-style validation: contig is valid and atart and stop less than the end
     */
    public static boolean validGenomeLoc(GenomeLoc loc) {
        checkSetup();
@ -541,6 +543,8 @@ public class GenomeLocParser {
     * @param stop   the stop position
     *
     * @return true if it's valid, false otherwise
+     *
+     * performs interval-style validation: contig is valid and atart and stop less than the end
     */
    public static boolean validGenomeLoc(String contig, long start, long stop) {
        checkSetup();
@ -556,6 +560,8 @@ public class GenomeLocParser {
     * @param stop        the stop position
     *
     * @return true if it's valid, false otherwise
+     *
+     * performs interval-style validation: contig is valid and atart and stop less than the end
     */
    public static boolean validGenomeLoc(int contigIndex, long start, long stop) {
        checkSetup();
--- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
+++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
@ -373,4 +373,31 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {

        return s.toString();
    }
+
+     /**
+     * Check to see whether two genomeLocSortedSets are equal.
+     * Note that this implementation ignores the contigInfo object.
+     *
+     */  /*
+    @Override
+    public boolean equals(Object other) {
+        if(other == null)
+            return false;
+        if(other instanceof GenomeLocSortedSet) {
+            // send to a list, so we can ensure order correct
+            List otherList = ((GenomeLocSortedSet)other).toList();
+            List thisList = this.toList();
+            if (otherList.size() != this.size())
+                return false;
+
+            for (Integer i=0;i<thisList.size();i++) {
+                if (otherList.get(i).equals(thisList.get(i)))
+                    return false;
+            }
+            return true;
+        }
+        return false;
+
+    }   */
+
 }
--- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java
+++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserTest.java
@ -48,7 +48,7 @@ public class GenomeLocParserTest extends BaseTest {
    @Test(expected = RuntimeException.class)
    public void testGetContigIndex() {
        assertEquals(-1, GenomeLocParser.getContigIndex("blah",true)); // should not be in the reference
-    }
+    }                

    @Test
    public void testGetContigIndexValid() {
@ -80,39 +80,7 @@ public class GenomeLocParserTest extends BaseTest {
        assertEquals(100, loc.getStop());
        assertEquals(1, loc.getStart());
    }
-
-    @Test(expected = RuntimeException.class)
-    public void testParseBadLocations() {
-        GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0", IntervalMergingRule.ALL);
-    }
-
-    @Test
-    public void testParseGoodLocations() {
-        GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9", IntervalMergingRule.ALL);
-    }
-
-    @Test(expected = RuntimeException.class)
-    public void testParseGoodLocationsTooManySemiColons() {
-        GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;", IntervalMergingRule.ALL);
-    }
-
-    @Test
-    public void testOverlappingGoodLocationsWithAbuttingFlag() {
-        List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY);
-        assertEquals(1, locs.size());
-    }
-
-    @Test
-    public void testAbuttingGoodLocationsWithAbuttingOffFlag() {
-        List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-4;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY);
-        assertEquals(2, locs.size());
-    }
-    @Test
-    public void testAbuttingGoodLocationsWithNoneFlag() {
-        List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.NONE);
-        assertEquals(2, locs.size());
-    }
-
+    
    @Test
    public void testCreateGenomeLoc1() {
        GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1", 1, 100);
@ -162,16 +130,6 @@ public class GenomeLocParserTest extends BaseTest {
        assertEquals(1, copy.getStart());
    }

-    /*@Test // - uncomment if you want to test speed
-    public void testGenomeLocParserList() {
-        long start = System.currentTimeMillis();
-        List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"}));
-        Collections.sort(parsedIntervals);
-        LinkedList<GenomeLoc> loc = new LinkedList<GenomeLoc>(GenomeLocParser.mergeIntervalLocations(parsedIntervals));
-        long stop = System.currentTimeMillis();
-        logger.warn("Elapsed time = " + (stop - start));
-    }*/
-
    @Test
    public void testGenomeLocPlusSign() {
        GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1+");