Fix for bug GSA-449: Intervals that are not in GATK format are not validated

to the same standard as GATK format intervals. Full validation against contig
bounds is now performed for all intervals, regardless of their source. Also
fixed a few tests for validation exclusions that were backwards.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5698 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
droazen 2011-04-27 18:12:10 +00:00
parent df35a143b2
commit d650efd40a
6 changed files with 219 additions and 76 deletions

View File

@ -619,10 +619,8 @@ public class GenomeAnalysisEngine {
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
loadIntervals(argCollection.intervals,
argCollection.intervalMerging,
genomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging)));
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
loadIntervals(argCollection.intervals, genomeLocParser.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging)));
// if no exclude arguments, can return parseIntervalArguments directly
if (argCollection.excludeIntervals == null)
@ -630,7 +628,7 @@ public class GenomeAnalysisEngine {
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
else {
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, argCollection.intervalMerging, null);
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, null);
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given
@ -646,27 +644,30 @@ public class GenomeAnalysisEngine {
/**
* Loads the intervals relevant to the current execution
* @param argList String representation of arguments; might include 'all', filenames, intervals in samtools
* notation, or a combination of the
* @param mergingRule Technique to use when merging interval data.
* @param additionalIntervals a list of additional intervals to add to the returned set. Can be null.
* notation, or a combination of the above
* @param rodIntervals a list of ROD intervals to add to the returned set. Can be empty or null.
* @return A sorted, merged list of all intervals specified in this arg list.
*/
private GenomeLocSortedSet loadIntervals(List<String> argList,
IntervalMergingRule mergingRule,
List<GenomeLoc> additionalIntervals) {
protected GenomeLocSortedSet loadIntervals( List<String> argList, List<GenomeLoc> rodIntervals ) {
return IntervalUtils.sortAndMergeIntervals(genomeLocParser,IntervalUtils.mergeListsBySetOperator(additionalIntervals,
IntervalUtils.parseIntervalArguments(genomeLocParser,argList,
this.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST),
argCollection.BTIMergeRule),
mergingRule);
boolean allowEmptyIntervalList = (argCollection.unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST ||
argCollection.unsafe == ValidationExclusion.TYPE.ALL);
List<GenomeLoc> nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList);
genomeLocParser.validateGenomeLocList(nonRODIntervals);
genomeLocParser.validateGenomeLocList(rodIntervals);
List<GenomeLoc> allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule);
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging);
}
/**
* if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs
* @return ROD intervals as GenomeLocs
*/
private List<GenomeLoc> checkRODToIntervalArgument() {
private List<GenomeLoc> getRODIntervals() {
Map<String, ReferenceOrderedDataSource> rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources);
// Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag?
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();

View File

@ -251,7 +251,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
intervals = merger;
} else {
// read in the whole list of intervals for cleaning
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(),IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),this.getToolkit().getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST), IntervalMergingRule.OVERLAPPING_ONLY);
boolean allowEmptyIntervalList = (getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST ||
getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALL);
GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(),
IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),allowEmptyIntervalList),
IntervalMergingRule.OVERLAPPING_ONLY);
intervals = locs.iterator();
}
currentInterval = intervals.hasNext() ? intervals.next() : null;

View File

@ -182,7 +182,7 @@ public class GenomeLocParser {
public int getContigIndex(final String contig, boolean exceptionOut) {
int idx = contigInfo.getSequenceIndex(contig);
if (idx == -1 && exceptionOut)
throw new UserException.CommandLineException(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
return idx;
}
@ -359,73 +359,63 @@ public class GenomeLocParser {
/**
* Read a file of genome locations to process.
* regions specified by the location string. The string is of the form:
* Of the form: loc1;loc2;...
* Where each locN can be:
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
* Read a file of genome locations to process. The file may be in BED, Picard,
* or GATK interval format.
*
* @param file_name interval file
* @param allowEmptyIntervalList if false empty interval lists will return null
* @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals
* @return List<GenomeLoc> List of Genome Locs that have been parsed from file
*/
public List<GenomeLoc> intervalFileToList(final String file_name, boolean allowEmptyIntervalList) {
// try to open file
File inputFile = new File(file_name);
// check if file is empty
if (inputFile.exists() && inputFile.length() < 1) {
if (allowEmptyIntervalList)
return new ArrayList<GenomeLoc>();
else {
Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
"may want to fix (or exclude) this file.");
return null;
}
}
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
// case: BED file
if (file_name.toUpperCase().endsWith(".BED")) {
BedParser parser = new BedParser(this,inputFile);
return parser.getLocations();
ret.addAll(parser.getLocations());
}
/**
* IF not a BED file:
* first try to read it as an interval file since that's well structured
* we'll fail quickly if it's not a valid file. Then try to parse it as
* a location string file
*/
try {
IntervalList il = IntervalList.fromFile(inputFile);
// iterate through the list of merged intervals and add then as GenomeLocs
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
for (Interval interval : il.getIntervals()) {
ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd()));
}
// always return null instead of empty list
return ret.isEmpty() ? null : ret;
}
// if that didn't work, try parsing file as an old fashioned string file
catch (Exception e) {
else {
/**
* IF not a BED file:
* first try to read it as a Picard interval file since that's well structured
* we'll fail quickly if it's not a valid file.
*/
try {
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
XReadLines reader = new XReadLines(new File(file_name));
for(String line: reader) {
ret.add(parseGenomeInterval(line));
}
reader.close();
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
IntervalList il = IntervalList.fromFile(inputFile);
// always return null instead of empty list
return ret.isEmpty() ? null : ret;
for (Interval interval : il.getIntervals()) {
ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true),
interval.getStart(), interval.getEnd()));
}
}
catch (IOException e2) {
throw new UserException.CouldNotReadInputFile(new File(file_name), e);
// if that didn't work, try parsing file as a GATK interval file
catch (Exception e) {
try {
XReadLines reader = new XReadLines(new File(file_name));
for(String line: reader) {
if ( line.trim().length() > 0 ) {
ret.add(parseGenomeInterval(line));
}
}
reader.close();
}
catch (IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
}
}
}
if ( ret.isEmpty() && ! allowEmptyIntervalList ) {
throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " +
"that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " +
"not been enabled");
}
return ret;
}
/**
@ -475,6 +465,19 @@ public class GenomeLocParser {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), pos, pos));
}
/**
* Creates a new GenomeLoc without performing any validation on its contig or bounds.
* FOR UNIT TESTING PURPOSES ONLY!
*
* @param contig the contig name
* @param start start position of the interval
* @param stop stop position of the interval
* @return a new GenomeLoc representing the specified location
*/
public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) {
return new GenomeLoc(contig, getContigIndex(contig, false), start, stop);
}
/**
* verify the specified genome loc is valid, if it's not, throw an exception
* Will not verify the location against contig bounds.
@ -491,16 +494,20 @@ public class GenomeLocParser {
*/
private GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) {
if (toReturn.getStart() < 0) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0");
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0 " +
"in interval: " + toReturn);
}
if ((toReturn.getStop() != -1) && (toReturn.getStop() < 0)) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0");
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0 " +
"in interval: " + toReturn);
}
if (toReturn.getContigIndex() < 0) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0");
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0 " +
"in interval: " + toReturn);
}
if (toReturn.getContigIndex() >= contigInfo.getNSequences()) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater then the stored sequence count");
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater than " +
"the stored sequence count in interval: " + toReturn);
}
return toReturn;
@ -525,7 +532,34 @@ public class GenomeLocParser {
if(locus.getStop() > contigSize)
throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus stop is after the end of contig",locus);
if (locus.getStart() > locus.getStop()) {
throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position", locus);
throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is " +
"greater than the end position", locus);
}
}
/**
* Validates each of the GenomeLocs in the list passed as an argument: each GenomeLoc must refer to a
* valid contig, and its bounds must be within the bounds of its contig. Throws a MalformedGenomeLoc
* exception if an invalid GenomeLoc is found.
*
* @param genomeLocList The list of GenomeLocs to validate
*/
public void validateGenomeLocList( List<GenomeLoc> genomeLocList ) {
if ( genomeLocList == null ) {
return;
}
for ( GenomeLoc loc : genomeLocList ) {
try {
exceptionOnInvalidGenomeLoc(loc);
exceptionOnInvalidGenomeLocBounds(loc);
}
catch ( UserException e ) {
throw e;
}
catch ( ReviewedStingException e ) {
throw new UserException.MalformedGenomeLoc(e.getMessage());
}
}
}

View File

@ -59,6 +59,10 @@ public class UserException extends ReviewedStingException {
public MalformedGenomeLoc(String message, GenomeLoc loc) {
super(String.format("Badly formed genome loc: %s: %s", message, loc));
}
public MalformedGenomeLoc(String message) {
super(String.format("Badly formed genome loc: %s", message));
}
}
public static class BadInput extends UserException {

View File

@ -56,8 +56,11 @@ public class IntervalUtils {
try {
rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList));
}
catch (Exception e) {
throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in either format.", e);
catch ( UserException.MalformedGenomeLoc e ) {
throw e;
}
catch ( Exception e ) {
throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e);
}
}

View File

@ -24,18 +24,30 @@
package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.util.Interval;
import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.ArgumentException;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.walkers.PrintReadsWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
@ -78,4 +90,89 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
testEngine.validateSuppliedIntervals();
}
@DataProvider(name="invalidIntervalTestData")
public Object[][] invalidIntervalDataProvider() throws Exception {
GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine();
GATKArgumentCollection argCollection = new GATKArgumentCollection();
testEngine.setArguments(argCollection);
File fastaFile = new File("testdata/exampleFASTA.fasta");
GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
testEngine.setGenomeLocParser(genomeLocParser);
return new Object[][] {
new Object[] {testEngine, genomeLocParser, "chr1", 10000000, 20000000},
new Object[] {testEngine, genomeLocParser, "chr2", 1, 2},
new Object[] {testEngine, genomeLocParser, "chr1", -1, 50}
};
}
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
public void testInvalidRODIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser,
String contig, int intervalStart, int intervalEnd ) throws Exception {
List<String> intervalArgs = new ArrayList<String>();
List<GenomeLoc> rodIntervals = Arrays.asList(genomeLocParser.createGenomeLocWithoutValidation(contig, intervalStart, intervalEnd));
testEngine.loadIntervals(intervalArgs, rodIntervals);
}
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
public void testInvalidBedIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser,
String contig, int intervalStart, int intervalEnd ) throws Exception {
// We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd,
// since the ending point is an open interval.
File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed",
String.format("%s %d %d", contig, intervalStart - 1, intervalEnd));
List<String> intervalArgs = Arrays.asList(bedFile.getAbsolutePath());
List<GenomeLoc> rodIntervals = new ArrayList<GenomeLoc>();
testEngine.loadIntervals(intervalArgs, rodIntervals);
}
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
public void testInvalidPicardIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser,
String contig, int intervalStart, int intervalEnd ) throws Exception {
SAMFileHeader picardFileHeader = new SAMFileHeader();
picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1"));
IntervalList picardIntervals = new IntervalList(picardFileHeader);
picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname"));
File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals");
picardIntervals.write(picardIntervalFile);
List<String> intervalArgs = Arrays.asList(picardIntervalFile.getAbsolutePath());
List<GenomeLoc> rodIntervals = new ArrayList<GenomeLoc>();
testEngine.loadIntervals(intervalArgs, rodIntervals);
}
@Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData")
public void testInvalidGATKFileIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser,
String contig, int intervalStart, int intervalEnd ) throws Exception {
File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals",
String.format("%s:%d-%d", contig, intervalStart, intervalEnd));
List<String> intervalArgs = Arrays.asList(gatkIntervalFile.getAbsolutePath());
List<GenomeLoc> rodIntervals = new ArrayList<GenomeLoc>();
testEngine.loadIntervals(intervalArgs, rodIntervals);
}
private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception {
File tempFile = File.createTempFile(tempFilePrefix, tempFileExtension);
tempFile.deleteOnExit();
PrintWriter out = new PrintWriter(tempFile);
for ( String line : lines ) {
out.println(line);
}
out.close();
return tempFile;
}
}