diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 7c0d5e889..f4d0da95b 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -619,10 +619,8 @@ public class GenomeAnalysisEngine { // if include argument isn't given, create new set of all possible intervals GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? - GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, - argCollection.intervalMerging, - genomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); + GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : + loadIntervals(argCollection.intervals, genomeLocParser.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging))); // if no exclude arguments, can return parseIntervalArguments directly if (argCollection.excludeIntervals == null) @@ -630,7 +628,7 @@ public class GenomeAnalysisEngine { // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null); + GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, null); intervals = includeSortedSet.subtractRegions(excludeSortedSet); // logging messages only printed when exclude (-XL) arguments are given @@ -646,27 +644,30 @@ public class GenomeAnalysisEngine { /** * Loads the intervals relevant to the current execution * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools - * notation, or a combination of the - * @param mergingRule Technique to use when merging interval data. - * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. + * notation, or a combination of the above + * @param rodIntervals a list of ROD intervals to add to the returned set. Can be empty or null. * @return A sorted, merged list of all intervals specified in this arg list. */ - private GenomeLocSortedSet loadIntervals(List argList, - IntervalMergingRule mergingRule, - List additionalIntervals) { + protected GenomeLocSortedSet loadIntervals( List argList, List rodIntervals ) { - return IntervalUtils.sortAndMergeIntervals(genomeLocParser,IntervalUtils.mergeListsBySetOperator(additionalIntervals, - IntervalUtils.parseIntervalArguments(genomeLocParser,argList, - this.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), - argCollection.BTIMergeRule), - mergingRule); + boolean allowEmptyIntervalList = (argCollection.unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST || + argCollection.unsafe == ValidationExclusion.TYPE.ALL); + + List nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList); + + genomeLocParser.validateGenomeLocList(nonRODIntervals); + genomeLocParser.validateGenomeLocList(rodIntervals); + + List allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule); + + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); } /** * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs * @return ROD intervals as GenomeLocs */ - private List checkRODToIntervalArgument() { + private List getRODIntervals() { Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources); // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? List ret = new ArrayList(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 4c978468d..ce1b4b2b8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -251,7 +251,11 @@ public class IndelRealigner extends ReadWalker { intervals = merger; } else { // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(),IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),this.getToolkit().getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), IntervalMergingRule.OVERLAPPING_ONLY); + boolean allowEmptyIntervalList = (getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST || + getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALL); + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), + IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),allowEmptyIntervalList), + IntervalMergingRule.OVERLAPPING_ONLY); intervals = locs.iterator(); } currentInterval = intervals.hasNext() ? intervals.next() : null; diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 592ae2653..abc2b8926 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -182,7 +182,7 @@ public class GenomeLocParser { public int getContigIndex(final String contig, boolean exceptionOut) { int idx = contigInfo.getSequenceIndex(contig); if (idx == -1 && exceptionOut) - throw new UserException.CommandLineException(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); + throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); return idx; } @@ -359,73 +359,63 @@ public class GenomeLocParser { /** - * Read a file of genome locations to process. - * regions specified by the location string. The string is of the form: - * Of the form: loc1;loc2;... - * Where each locN can be: - * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' + * Read a file of genome locations to process. The file may be in BED, Picard, + * or GATK interval format. * * @param file_name interval file - * @param allowEmptyIntervalList if false empty interval lists will return null + * @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals * @return List List of Genome Locs that have been parsed from file */ public List intervalFileToList(final String file_name, boolean allowEmptyIntervalList) { // try to open file File inputFile = new File(file_name); - - // check if file is empty - if (inputFile.exists() && inputFile.length() < 1) { - if (allowEmptyIntervalList) - return new ArrayList(); - else { - Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " + - "may want to fix (or exclude) this file."); - return null; - } - } + List ret = new ArrayList(); // case: BED file if (file_name.toUpperCase().endsWith(".BED")) { BedParser parser = new BedParser(this,inputFile); - return parser.getLocations(); + ret.addAll(parser.getLocations()); } - - /** - * IF not a BED file: - * first try to read it as an interval file since that's well structured - * we'll fail quickly if it's not a valid file. Then try to parse it as - * a location string file - */ - try { - IntervalList il = IntervalList.fromFile(inputFile); - - // iterate through the list of merged intervals and add then as GenomeLocs - List ret = new ArrayList(); - for (Interval interval : il.getIntervals()) { - ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd())); - } - // always return null instead of empty list - return ret.isEmpty() ? null : ret; - - } - - // if that didn't work, try parsing file as an old fashioned string file - catch (Exception e) { + else { + /** + * IF not a BED file: + * first try to read it as a Picard interval file since that's well structured + * we'll fail quickly if it's not a valid file. + */ try { - List ret = new ArrayList(); - XReadLines reader = new XReadLines(new File(file_name)); - for(String line: reader) { - ret.add(parseGenomeInterval(line)); - } - reader.close(); + // Note: Picard will skip over intervals with contigs not in the sequence dictionary + IntervalList il = IntervalList.fromFile(inputFile); - // always return null instead of empty list - return ret.isEmpty() ? null : ret; + for (Interval interval : il.getIntervals()) { + ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), + interval.getStart(), interval.getEnd())); + } } - catch (IOException e2) { - throw new UserException.CouldNotReadInputFile(new File(file_name), e); + + // if that didn't work, try parsing file as a GATK interval file + catch (Exception e) { + try { + XReadLines reader = new XReadLines(new File(file_name)); + for(String line: reader) { + if ( line.trim().length() > 0 ) { + ret.add(parseGenomeInterval(line)); + } + } + reader.close(); + } + catch (IOException e2) { + throw new UserException.CouldNotReadInputFile(inputFile, e2); + } } } + + if ( ret.isEmpty() && ! allowEmptyIntervalList ) { + throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " + + "that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " + + "not been enabled"); + } + + return ret; } /** @@ -475,6 +465,19 @@ public class GenomeLocParser { return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), pos, pos)); } + /** + * Creates a new GenomeLoc without performing any validation on its contig or bounds. + * FOR UNIT TESTING PURPOSES ONLY! + * + * @param contig the contig name + * @param start start position of the interval + * @param stop stop position of the interval + * @return a new GenomeLoc representing the specified location + */ + public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { + return new GenomeLoc(contig, getContigIndex(contig, false), start, stop); + } + /** * verify the specified genome loc is valid, if it's not, throw an exception * Will not verify the location against contig bounds. @@ -491,16 +494,20 @@ public class GenomeLocParser { */ private GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) { if (toReturn.getStart() < 0) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0 " + + "in interval: " + toReturn); } if ((toReturn.getStop() != -1) && (toReturn.getStop() < 0)) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0 " + + "in interval: " + toReturn); } if (toReturn.getContigIndex() < 0) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0 " + + "in interval: " + toReturn); } if (toReturn.getContigIndex() >= contigInfo.getNSequences()) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater than " + + "the stored sequence count in interval: " + toReturn); } return toReturn; @@ -525,7 +532,34 @@ public class GenomeLocParser { if(locus.getStop() > contigSize) throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus stop is after the end of contig",locus); if (locus.getStart() > locus.getStop()) { - throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position", locus); + throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is " + + "greater than the end position", locus); + } + } + + /** + * Validates each of the GenomeLocs in the list passed as an argument: each GenomeLoc must refer to a + * valid contig, and its bounds must be within the bounds of its contig. Throws a MalformedGenomeLoc + * exception if an invalid GenomeLoc is found. + * + * @param genomeLocList The list of GenomeLocs to validate + */ + public void validateGenomeLocList( List genomeLocList ) { + if ( genomeLocList == null ) { + return; + } + + for ( GenomeLoc loc : genomeLocList ) { + try { + exceptionOnInvalidGenomeLoc(loc); + exceptionOnInvalidGenomeLocBounds(loc); + } + catch ( UserException e ) { + throw e; + } + catch ( ReviewedStingException e ) { + throw new UserException.MalformedGenomeLoc(e.getMessage()); + } } } diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 62fca6524..82e22d8cf 100755 --- a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -59,6 +59,10 @@ public class UserException extends ReviewedStingException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); } + + public MalformedGenomeLoc(String message) { + super(String.format("Badly formed genome loc: %s", message)); + } } public static class BadInput extends UserException { diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 68243d066..4434ea11b 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -56,8 +56,11 @@ public class IntervalUtils { try { rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList)); } - catch (Exception e) { - throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in either format.", e); + catch ( UserException.MalformedGenomeLoc e ) { + throw e; + } + catch ( Exception e ) { + throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e); } } diff --git a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 49d5d5cea..e4a714e4a 100644 --- a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -24,18 +24,30 @@ package org.broadinstitute.sting.gatk; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.util.Interval; +import net.sf.picard.util.IntervalList; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ArgumentException; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.walkers.PrintReadsWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.PrintWriter; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.List; /** @@ -78,4 +90,89 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } + + @DataProvider(name="invalidIntervalTestData") + public Object[][] invalidIntervalDataProvider() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + testEngine.setArguments(argCollection); + + File fastaFile = new File("testdata/exampleFASTA.fasta"); + GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); + testEngine.setGenomeLocParser(genomeLocParser); + + return new Object[][] { + new Object[] {testEngine, genomeLocParser, "chr1", 10000000, 20000000}, + new Object[] {testEngine, genomeLocParser, "chr2", 1, 2}, + new Object[] {testEngine, genomeLocParser, "chr1", -1, 50} + }; + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidRODIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + List intervalArgs = new ArrayList(); + List rodIntervals = Arrays.asList(genomeLocParser.createGenomeLocWithoutValidation(contig, intervalStart, intervalEnd)); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidBedIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + // We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd, + // since the ending point is an open interval. + File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed", + String.format("%s %d %d", contig, intervalStart - 1, intervalEnd)); + + List intervalArgs = Arrays.asList(bedFile.getAbsolutePath()); + List rodIntervals = new ArrayList(); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidPicardIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + SAMFileHeader picardFileHeader = new SAMFileHeader(); + picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); + IntervalList picardIntervals = new IntervalList(picardFileHeader); + picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); + + File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); + picardIntervals.write(picardIntervalFile); + + List intervalArgs = Arrays.asList(picardIntervalFile.getAbsolutePath()); + List rodIntervals = new ArrayList(); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidGATKFileIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", + String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); + + List intervalArgs = Arrays.asList(gatkIntervalFile.getAbsolutePath()); + List rodIntervals = new ArrayList(); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { + File tempFile = File.createTempFile(tempFilePrefix, tempFileExtension); + tempFile.deleteOnExit(); + + PrintWriter out = new PrintWriter(tempFile); + for ( String line : lines ) { + out.println(line); + } + out.close(); + + return tempFile; + } }