From d650efd40ab20d38cf2139f34172fe8921ccec88 Mon Sep 17 00:00:00 2001 From: droazen Date: Wed, 27 Apr 2011 18:12:10 +0000 Subject: [PATCH] Fix for bug GSA-449: Intervals that are not in GATK format are not validated to the same standard as GATK format intervals. Full validation against contig bounds is now performed for all intervals, regardless of their source. Also fixed a few tests for validation exclusions that were backwards. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5698 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 35 +++-- .../gatk/walkers/indels/IndelRealigner.java | 6 +- .../sting/utils/GenomeLocParser.java | 146 +++++++++++------- .../sting/utils/exceptions/UserException.java | 4 + .../sting/utils/interval/IntervalUtils.java | 7 +- .../gatk/GenomeAnalysisEngineUnitTest.java | 97 ++++++++++++ 6 files changed, 219 insertions(+), 76 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 7c0d5e889..f4d0da95b 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -619,10 +619,8 @@ public class GenomeAnalysisEngine { // if include argument isn't given, create new set of all possible intervals GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? - GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, - argCollection.intervalMerging, - genomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); + GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : + loadIntervals(argCollection.intervals, genomeLocParser.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging))); // if no exclude arguments, can return parseIntervalArguments directly if (argCollection.excludeIntervals == null) @@ -630,7 +628,7 @@ public class GenomeAnalysisEngine { // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null); + GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, null); intervals = includeSortedSet.subtractRegions(excludeSortedSet); // logging messages only printed when exclude (-XL) arguments are given @@ -646,27 +644,30 @@ public class GenomeAnalysisEngine { /** * Loads the intervals relevant to the current execution * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools - * notation, or a combination of the - * @param mergingRule Technique to use when merging interval data. - * @param additionalIntervals a list of additional intervals to add to the returned set. Can be null. + * notation, or a combination of the above + * @param rodIntervals a list of ROD intervals to add to the returned set. Can be empty or null. * @return A sorted, merged list of all intervals specified in this arg list. */ - private GenomeLocSortedSet loadIntervals(List argList, - IntervalMergingRule mergingRule, - List additionalIntervals) { + protected GenomeLocSortedSet loadIntervals( List argList, List rodIntervals ) { - return IntervalUtils.sortAndMergeIntervals(genomeLocParser,IntervalUtils.mergeListsBySetOperator(additionalIntervals, - IntervalUtils.parseIntervalArguments(genomeLocParser,argList, - this.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), - argCollection.BTIMergeRule), - mergingRule); + boolean allowEmptyIntervalList = (argCollection.unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST || + argCollection.unsafe == ValidationExclusion.TYPE.ALL); + + List nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList); + + genomeLocParser.validateGenomeLocList(nonRODIntervals); + genomeLocParser.validateGenomeLocList(rodIntervals); + + List allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule); + + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); } /** * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs * @return ROD intervals as GenomeLocs */ - private List checkRODToIntervalArgument() { + private List getRODIntervals() { Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources); // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? List ret = new ArrayList(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 4c978468d..ce1b4b2b8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -251,7 +251,11 @@ public class IndelRealigner extends ReadWalker { intervals = merger; } else { // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(),IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),this.getToolkit().getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), IntervalMergingRule.OVERLAPPING_ONLY); + boolean allowEmptyIntervalList = (getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST || + getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALL); + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), + IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),allowEmptyIntervalList), + IntervalMergingRule.OVERLAPPING_ONLY); intervals = locs.iterator(); } currentInterval = intervals.hasNext() ? intervals.next() : null; diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 592ae2653..abc2b8926 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -182,7 +182,7 @@ public class GenomeLocParser { public int getContigIndex(final String contig, boolean exceptionOut) { int idx = contigInfo.getSequenceIndex(contig); if (idx == -1 && exceptionOut) - throw new UserException.CommandLineException(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); + throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); return idx; } @@ -359,73 +359,63 @@ public class GenomeLocParser { /** - * Read a file of genome locations to process. - * regions specified by the location string. The string is of the form: - * Of the form: loc1;loc2;... - * Where each locN can be: - * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' + * Read a file of genome locations to process. The file may be in BED, Picard, + * or GATK interval format. * * @param file_name interval file - * @param allowEmptyIntervalList if false empty interval lists will return null + * @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals * @return List List of Genome Locs that have been parsed from file */ public List intervalFileToList(final String file_name, boolean allowEmptyIntervalList) { // try to open file File inputFile = new File(file_name); - - // check if file is empty - if (inputFile.exists() && inputFile.length() < 1) { - if (allowEmptyIntervalList) - return new ArrayList(); - else { - Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " + - "may want to fix (or exclude) this file."); - return null; - } - } + List ret = new ArrayList(); // case: BED file if (file_name.toUpperCase().endsWith(".BED")) { BedParser parser = new BedParser(this,inputFile); - return parser.getLocations(); + ret.addAll(parser.getLocations()); } - - /** - * IF not a BED file: - * first try to read it as an interval file since that's well structured - * we'll fail quickly if it's not a valid file. Then try to parse it as - * a location string file - */ - try { - IntervalList il = IntervalList.fromFile(inputFile); - - // iterate through the list of merged intervals and add then as GenomeLocs - List ret = new ArrayList(); - for (Interval interval : il.getIntervals()) { - ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd())); - } - // always return null instead of empty list - return ret.isEmpty() ? null : ret; - - } - - // if that didn't work, try parsing file as an old fashioned string file - catch (Exception e) { + else { + /** + * IF not a BED file: + * first try to read it as a Picard interval file since that's well structured + * we'll fail quickly if it's not a valid file. + */ try { - List ret = new ArrayList(); - XReadLines reader = new XReadLines(new File(file_name)); - for(String line: reader) { - ret.add(parseGenomeInterval(line)); - } - reader.close(); + // Note: Picard will skip over intervals with contigs not in the sequence dictionary + IntervalList il = IntervalList.fromFile(inputFile); - // always return null instead of empty list - return ret.isEmpty() ? null : ret; + for (Interval interval : il.getIntervals()) { + ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), + interval.getStart(), interval.getEnd())); + } } - catch (IOException e2) { - throw new UserException.CouldNotReadInputFile(new File(file_name), e); + + // if that didn't work, try parsing file as a GATK interval file + catch (Exception e) { + try { + XReadLines reader = new XReadLines(new File(file_name)); + for(String line: reader) { + if ( line.trim().length() > 0 ) { + ret.add(parseGenomeInterval(line)); + } + } + reader.close(); + } + catch (IOException e2) { + throw new UserException.CouldNotReadInputFile(inputFile, e2); + } } } + + if ( ret.isEmpty() && ! allowEmptyIntervalList ) { + throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " + + "that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " + + "not been enabled"); + } + + return ret; } /** @@ -475,6 +465,19 @@ public class GenomeLocParser { return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), pos, pos)); } + /** + * Creates a new GenomeLoc without performing any validation on its contig or bounds. + * FOR UNIT TESTING PURPOSES ONLY! + * + * @param contig the contig name + * @param start start position of the interval + * @param stop stop position of the interval + * @return a new GenomeLoc representing the specified location + */ + public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { + return new GenomeLoc(contig, getContigIndex(contig, false), start, stop); + } + /** * verify the specified genome loc is valid, if it's not, throw an exception * Will not verify the location against contig bounds. @@ -491,16 +494,20 @@ public class GenomeLocParser { */ private GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) { if (toReturn.getStart() < 0) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0 " + + "in interval: " + toReturn); } if ((toReturn.getStop() != -1) && (toReturn.getStop() < 0)) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0 " + + "in interval: " + toReturn); } if (toReturn.getContigIndex() < 0) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0 " + + "in interval: " + toReturn); } if (toReturn.getContigIndex() >= contigInfo.getNSequences()) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count"); + throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater than " + + "the stored sequence count in interval: " + toReturn); } return toReturn; @@ -525,7 +532,34 @@ public class GenomeLocParser { if(locus.getStop() > contigSize) throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus stop is after the end of contig",locus); if (locus.getStart() > locus.getStop()) { - throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position", locus); + throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is " + + "greater than the end position", locus); + } + } + + /** + * Validates each of the GenomeLocs in the list passed as an argument: each GenomeLoc must refer to a + * valid contig, and its bounds must be within the bounds of its contig. Throws a MalformedGenomeLoc + * exception if an invalid GenomeLoc is found. + * + * @param genomeLocList The list of GenomeLocs to validate + */ + public void validateGenomeLocList( List genomeLocList ) { + if ( genomeLocList == null ) { + return; + } + + for ( GenomeLoc loc : genomeLocList ) { + try { + exceptionOnInvalidGenomeLoc(loc); + exceptionOnInvalidGenomeLocBounds(loc); + } + catch ( UserException e ) { + throw e; + } + catch ( ReviewedStingException e ) { + throw new UserException.MalformedGenomeLoc(e.getMessage()); + } } } diff --git a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 62fca6524..82e22d8cf 100755 --- a/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -59,6 +59,10 @@ public class UserException extends ReviewedStingException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); } + + public MalformedGenomeLoc(String message) { + super(String.format("Badly formed genome loc: %s", message)); + } } public static class BadInput extends UserException { diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 68243d066..4434ea11b 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -56,8 +56,11 @@ public class IntervalUtils { try { rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList)); } - catch (Exception e) { - throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in either format.", e); + catch ( UserException.MalformedGenomeLoc e ) { + throw e; + } + catch ( Exception e ) { + throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e); } } diff --git a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 49d5d5cea..e4a714e4a 100644 --- a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -24,18 +24,30 @@ package org.broadinstitute.sting.gatk; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.util.Interval; +import net.sf.picard.util.IntervalList; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.ArgumentException; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.walkers.PrintReadsWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.io.PrintWriter; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.List; /** @@ -78,4 +90,89 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } + + @DataProvider(name="invalidIntervalTestData") + public Object[][] invalidIntervalDataProvider() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + testEngine.setArguments(argCollection); + + File fastaFile = new File("testdata/exampleFASTA.fasta"); + GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); + testEngine.setGenomeLocParser(genomeLocParser); + + return new Object[][] { + new Object[] {testEngine, genomeLocParser, "chr1", 10000000, 20000000}, + new Object[] {testEngine, genomeLocParser, "chr2", 1, 2}, + new Object[] {testEngine, genomeLocParser, "chr1", -1, 50} + }; + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidRODIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + List intervalArgs = new ArrayList(); + List rodIntervals = Arrays.asList(genomeLocParser.createGenomeLocWithoutValidation(contig, intervalStart, intervalEnd)); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidBedIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + // We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd, + // since the ending point is an open interval. + File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed", + String.format("%s %d %d", contig, intervalStart - 1, intervalEnd)); + + List intervalArgs = Arrays.asList(bedFile.getAbsolutePath()); + List rodIntervals = new ArrayList(); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidPicardIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + SAMFileHeader picardFileHeader = new SAMFileHeader(); + picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); + IntervalList picardIntervals = new IntervalList(picardFileHeader); + picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); + + File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); + picardIntervals.write(picardIntervalFile); + + List intervalArgs = Arrays.asList(picardIntervalFile.getAbsolutePath()); + List rodIntervals = new ArrayList(); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidGATKFileIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", + String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); + + List intervalArgs = Arrays.asList(gatkIntervalFile.getAbsolutePath()); + List rodIntervals = new ArrayList(); + + testEngine.loadIntervals(intervalArgs, rodIntervals); + } + + private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { + File tempFile = File.createTempFile(tempFilePrefix, tempFileExtension); + tempFile.deleteOnExit(); + + PrintWriter out = new PrintWriter(tempFile); + for ( String line : lines ) { + out.println(line); + } + out.close(); + + return tempFile; + } }