Reorganizing the way interval arguments are processed
Most of the changes occur in GenomeAnalysisEngine.java and GenomeLocParser.java: -- parseIntervalRegion and parseGenomeLocs combined into parseIntervalArguments -- initializeIntervals modified -- some helper functions deprecated for cleanliness Includes new set of unit tests, GenomeAnalysisEngineTest.java New restrictions: -- all interval arguments are now checked to be on the reference contig -- all interval files must have one of the following extensions: .picard, .bed, .list, .intervals, .interval_list git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3106 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c3c6e632d1
commit
b4f6f54502
|
|
@ -141,7 +141,7 @@ public class GenomeAnalysisEngine {
|
|||
throw new StingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
|
||||
}
|
||||
|
||||
// validate our parameters
|
||||
// validate our parameters
|
||||
if (my_walker == null)
|
||||
throw new StingException("The walker passed to GenomeAnalysisEngine can not be null.");
|
||||
|
||||
|
|
@ -174,35 +174,105 @@ public class GenomeAnalysisEngine {
|
|||
* Setup the intervals to be processed
|
||||
*/
|
||||
private void initializeIntervals() {
|
||||
GenomeLocSortedSet excludeIntervals = null;
|
||||
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
|
||||
List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
|
||||
excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals);
|
||||
}
|
||||
|
||||
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
|
||||
List <GenomeLoc> parsedIntervals = parseIntervalRegion(argCollection.intervals);
|
||||
intervals = (parsedIntervals == null) ? null: GenomeLocSortedSet.createSetFromList(parsedIntervals);
|
||||
}
|
||||
/*
|
||||
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
|
||||
intervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals));
|
||||
}
|
||||
*/
|
||||
if ( excludeIntervals != null ) {
|
||||
GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
|
||||
long toPruneSize = toPrune.coveredSize();
|
||||
long toExcludeSize = excludeIntervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
|
||||
logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
|
||||
intervals = toPrune.substractRegions( excludeIntervals );
|
||||
long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
// return null if no interval arguments at all
|
||||
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null))
|
||||
return;
|
||||
|
||||
if ( intervals != null )
|
||||
logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
|
||||
else {
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
|
||||
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if (argCollection.excludeIntervals == null)
|
||||
intervals = includeSortedSet;
|
||||
|
||||
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||
else {
|
||||
GenomeLocSortedSet excludeSortedSet = parseIntervalArguments(argCollection.excludeIntervals, argCollection.intervalMerging);
|
||||
intervals = includeSortedSet.substractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
long toPruneSize = includeSortedSet.coveredSize();
|
||||
long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
|
||||
logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
|
||||
toExcludeSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
|
||||
* Set is sorted and merged
|
||||
*/
|
||||
|
||||
public static GenomeLocSortedSet parseIntervalArguments(final List<String> intervals) {
|
||||
return parseIntervalArguments(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
|
||||
* Set is sorted and merged
|
||||
*/
|
||||
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
|
||||
|
||||
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
|
||||
|
||||
for (String argument : argList) {
|
||||
|
||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||
if (argument.equals("all")) {
|
||||
if (argList.size() != 1) {
|
||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// separate argument on semicolon first
|
||||
for (String fileOrInterval : argument.split(";")) {
|
||||
|
||||
// if it's a file, add items to raw interval list
|
||||
if (isFile(fileOrInterval))
|
||||
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
|
||||
|
||||
// otherwise treat as an interval -> parse and add to raw interval list
|
||||
else {
|
||||
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
|
||||
}
|
||||
}
|
||||
}
|
||||
// redundant check => default no arguments is null, not empty list
|
||||
if (rawIntervals.size() == 0)
|
||||
return null;
|
||||
|
||||
// sort raw interval list
|
||||
Collections.sort(rawIntervals);
|
||||
|
||||
// now merge raw interval list
|
||||
rawIntervals = GenomeLocParser.mergeIntervalLocations(rawIntervals, mergingRule);
|
||||
|
||||
return GenomeLocSortedSet.createSetFromList(rawIntervals);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if string argument was intented as a file
|
||||
* Accepted file extensions: .list, .interval_list, .bed, .picard
|
||||
*/
|
||||
private static boolean isFile(String str) {
|
||||
// should we define list of file extensions as a public array somewhere?
|
||||
// is regex or endsiwth better?
|
||||
if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") ||
|
||||
str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST")
|
||||
|| str.toUpperCase().endsWith(".INTERVALS"))
|
||||
return true;
|
||||
else return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -319,53 +389,6 @@ public class GenomeAnalysisEngine {
|
|||
return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads);
|
||||
}
|
||||
|
||||
/**
|
||||
* setup the interval regions, from either the interval file of the genome region string
|
||||
*
|
||||
* @param intervals the list of intervals to parse
|
||||
* @return a list of genomeLoc representing the interval file
|
||||
*/
|
||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals) {
|
||||
return parseIntervalRegion(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
|
||||
}
|
||||
|
||||
/**
|
||||
* setup the interval regions, from either the interval file of the genome region string
|
||||
*
|
||||
* @param intervals the list of intervals to parse
|
||||
* @param mergingRule the rule for merging intervals
|
||||
* @return a list of genomeLoc representing the interval file
|
||||
*/
|
||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals, IntervalMergingRule mergingRule) {
|
||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
||||
for (String interval : intervals) {
|
||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||
if (interval.equals("all")) {
|
||||
if (intervals.size() != 1) {
|
||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||
}
|
||||
return new ArrayList<GenomeLoc>();
|
||||
}
|
||||
|
||||
if (new File(interval).exists()) {
|
||||
// support for the bed style interval format
|
||||
if (interval.toUpperCase().endsWith(".BED")) {
|
||||
Utils.warnUser("Bed files are 0 based half-open intervals, which are converted to 1-based closed intervals in the GATK. " +
|
||||
"Be aware that all output information and intervals are 1-based closed intervals.");
|
||||
BedParser parser = new BedParser(new File(interval));
|
||||
locs.addAll(parser.getSortedAndMergedLocations(mergingRule));
|
||||
} else {
|
||||
locs.addAll(GenomeLocParser.intervalFileToList(interval,mergingRule));
|
||||
}
|
||||
} else {
|
||||
locs.addAll(GenomeLocParser.parseGenomeLocs(interval,mergingRule));
|
||||
}
|
||||
|
||||
}
|
||||
return locs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a unique identifier for the reader sourcing this read.
|
||||
* @param read Read to examine.
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ import java.util.*;
|
|||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
public class IntervalRodIterator implements Iterator<IntervalRod> {
|
||||
private List<GenomeLoc> locations = null;
|
||||
//private List<GenomeLoc> locations = null;
|
||||
private Iterator<GenomeLoc> iter;
|
||||
private String trackName = null;
|
||||
|
||||
|
|
@ -18,15 +19,15 @@ public class IntervalRodIterator implements Iterator<IntervalRod> {
|
|||
|
||||
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
|
||||
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
|
||||
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Collections.singletonList(file.getPath()));
|
||||
GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Collections.singletonList(file.getPath()));
|
||||
//System.out.printf(" => got %d entries %n", locs.size());
|
||||
return new IntervalRodIterator(trackName, locs);
|
||||
}
|
||||
|
||||
public IntervalRodIterator(String trackName, List<GenomeLoc> locs) {
|
||||
public IntervalRodIterator(String trackName, GenomeLocSortedSet locs) {
|
||||
this.trackName = trackName;
|
||||
locations = locs;
|
||||
iter = locations.iterator();
|
||||
//locations = locs;
|
||||
iter = locs.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -126,8 +126,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
|
||||
|
||||
// read in the intervals for cleaning
|
||||
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
intervals = GenomeLocSortedSet.createSetFromList(locs).iterator();
|
||||
GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
intervals = locs.iterator();
|
||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||
|
||||
// set up the output writer(s)
|
||||
|
|
|
|||
|
|
@ -118,13 +118,11 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
|||
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && vc.getName().toUpperCase().startsWith("TRUTH") ) {
|
||||
if( vc.isSNP() && !vc.isFiltered() ) {
|
||||
if( !vc.getGenotype(sampleName).isNoCall() ) {
|
||||
isInTruthSet = true;
|
||||
if( !vc.getGenotype(sampleName).isNoCall() ) {
|
||||
isInTruthSet = true;
|
||||
|
||||
if( !vc.getGenotype(sampleName).isHomRef() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
if( !vc.getGenotype(sampleName).isHomRef() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
}
|
||||
//if( vc.isPolymorphic() ) { //BUGBUG: I don't think this is the right thing to do here, there are many polymorphic sites in the truth data because there are many samples
|
||||
|
|
@ -259,7 +257,7 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
|||
final double sensitivity = ((double) truePos[curveIndex]) / ((double) truePos[curveIndex] + falseNegGlobal[curveIndex] + falseNeg[curveIndex]);
|
||||
final double specificity = ((double) trueNegGlobal[curveIndex] + trueNeg[curveIndex]) /
|
||||
((double) falsePos[curveIndex] + trueNegGlobal[curveIndex] + trueNeg[curveIndex]);
|
||||
outputFile.print( String.format("%.8f,%.8f,%.8f,", qualCut[curveIndex], sensitivity, 1.0 - specificity) );
|
||||
outputFile.print( String.format("%.4f,%.4f,%.4f,", qualCut[curveIndex], sensitivity, 1.0 - specificity) );
|
||||
qualCut[curveIndex] += incrementQual[curveIndex];
|
||||
curveIndex++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import org.apache.log4j.Logger;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -133,24 +134,35 @@ public class GenomeLocParser {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load one or more intervals sources, sorting and merging overlapping intervals.
|
||||
* @param intervalsSource Source of intervals.
|
||||
* @param rule the merging rule we're using
|
||||
* @return a list of sorted, merged intervals.
|
||||
/**
|
||||
* parse a genome interval, from a location string
|
||||
*
|
||||
* Performs interval-style validation:
|
||||
*
|
||||
* contig is valid; start and stop less than the end; start <= sto
|
||||
* @param str the string to parse
|
||||
*
|
||||
* @return a GenomeLoc representing the String
|
||||
*
|
||||
*/
|
||||
public static List<GenomeLoc> parseIntervals(List<String> intervalsSource, IntervalMergingRule rule) {
|
||||
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
|
||||
Collections.sort(parsedIntervals);
|
||||
return GenomeLocParser.mergeIntervalLocations(parsedIntervals, rule);
|
||||
}
|
||||
|
||||
public static GenomeLoc parseGenomeInterval(final String str) {
|
||||
GenomeLoc ret = parseGenomeLoc(str);
|
||||
exceptionOnInvalidGenomeLocBounds(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse a genome location, from a location string
|
||||
*
|
||||
* Performs read-style validation:
|
||||
* checks that start and stop are positive, start < stop, and the contig is valid
|
||||
* does not check that genomeLoc is actually on the contig
|
||||
*
|
||||
* @param str the string to parse
|
||||
*
|
||||
* @return a GenomeLoc representing the String
|
||||
*
|
||||
*/
|
||||
public static GenomeLoc parseGenomeLoc(final String str) {
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
|
|
@ -183,53 +195,18 @@ public class GenomeLocParser {
|
|||
|
||||
if (bad)
|
||||
throw new StingException("Failed to parse Genome Location string: " + str);
|
||||
if (start < 0)
|
||||
throw new StingException("Invalid Genome Location start < 0: " + str + ' ' + start);
|
||||
if (stop < 0)
|
||||
throw new StingException("Invalid Genome Location stop < 0: " + str + ' ' + stop);
|
||||
if (contig == null)
|
||||
throw new StingException("Invalid Genome Location contig == null : " + str);
|
||||
|
||||
if (start > stop)
|
||||
throw new StingException("Invalid Genome Location string; start position comes after end position: " + str );
|
||||
|
||||
if (!isContigValid(contig))
|
||||
// is the contig valid?
|
||||
if (!isContigValid(contig))
|
||||
throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference.");
|
||||
|
||||
if (stop == Integer.MAX_VALUE && hasKnownContigOrdering())
|
||||
// lookup the actually stop position!
|
||||
stop = getContigInfo(contig).getSequenceLength();
|
||||
|
||||
GenomeLoc loc = parseGenomeLoc(contig, start, stop);
|
||||
return loc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful utility function that parses a location string into a coordinate-order sorted
|
||||
* array of GenomeLoc objects
|
||||
*
|
||||
* @param str String representation of genome locs. Null string corresponds to no filter.
|
||||
* @param rule the merging rule we're using
|
||||
*
|
||||
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
||||
*/
|
||||
public static List<GenomeLoc> parseGenomeLocs(final String str, IntervalMergingRule rule) {
|
||||
// Null string means no filter.
|
||||
if (str == null) return null;
|
||||
|
||||
// Of the form: loc1;loc2;...
|
||||
// Where each locN can be:
|
||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
try {
|
||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
||||
for (String loc : str.split(";"))
|
||||
locs.add(parseGenomeLoc(loc.trim()));
|
||||
Collections.sort(locs);
|
||||
locs = mergeIntervalLocations(locs, rule);
|
||||
return locs;
|
||||
} catch (Exception e) { // TODO: fix this so that it passes the message from the exception, and doesn't print it out
|
||||
throw new StingException(String.format("Invalid locations string: %s, format is loc1;loc2; where loc1 < loc2. Each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str), e);
|
||||
}
|
||||
GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig,true), start, stop);
|
||||
exceptionOnInvalidGenomeLoc(locus);
|
||||
return locus;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -252,7 +229,7 @@ public class GenomeLocParser {
|
|||
* @return the list of merged locations
|
||||
*/
|
||||
public static List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, IntervalMergingRule rule) {
|
||||
if (raw.size() <= 1 || rule == IntervalMergingRule.NONE)
|
||||
if (raw.size() <= 1)
|
||||
return raw;
|
||||
else {
|
||||
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
|
||||
|
|
@ -282,19 +259,7 @@ public class GenomeLocParser {
|
|||
*/
|
||||
private static boolean isContigValid(String contig) {
|
||||
int contigIndex = contigInfo.getSequenceIndex(contig);
|
||||
return isSequenceIndexValid(contigIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the given sequence index is valid with respect to the sequence dictionary.
|
||||
*
|
||||
* @param sequenceIndex sequence index
|
||||
*
|
||||
* @return True if the sequence index is valid, false otherwise.
|
||||
*/
|
||||
private static boolean isSequenceIndexValid(int sequenceIndex) {
|
||||
return sequenceIndex >= 0 && sequenceIndex < contigInfo.size();
|
||||
|
||||
return contigIndex >= 0 && contigIndex < contigInfo.size();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -305,6 +270,9 @@ public class GenomeLocParser {
|
|||
* @param stop Stop point.
|
||||
*
|
||||
* @return The genome location, or a MalformedGenomeLocException if unparseable.
|
||||
*
|
||||
* Validation: only checks that contig is valid
|
||||
* start/stop could be anything
|
||||
*/
|
||||
public static GenomeLoc parseGenomeLoc(final String contig, long start, long stop) {
|
||||
if (!isContigValid(contig))
|
||||
|
|
@ -324,52 +292,70 @@ public class GenomeLocParser {
|
|||
* @param rule also merge abutting intervals
|
||||
*/
|
||||
public static List<GenomeLoc> intervalFileToList(final String file_name, IntervalMergingRule rule) {
|
||||
// try to open file
|
||||
File inputFile = null;
|
||||
try {
|
||||
inputFile = new File(file_name);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException("Could not open file", e);
|
||||
}
|
||||
|
||||
// check if file is empty
|
||||
if (inputFile.exists() && inputFile.length() < 1) {
|
||||
if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)
|
||||
return new ArrayList<GenomeLoc>();
|
||||
else {
|
||||
Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
|
||||
"may want to fix (or exclude) this file.");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// case: BED file
|
||||
if (file_name.toUpperCase().endsWith(".BED")) {
|
||||
BedParser parser = new BedParser(inputFile);
|
||||
return parser.getSortedAndMergedLocations(rule);
|
||||
}
|
||||
|
||||
/**
|
||||
* IF not a BED file:
|
||||
* first try to read it as an interval file since that's well structured
|
||||
* we'll fail quickly if it's not a valid file. Then try to parse it as
|
||||
* a location string file
|
||||
*/
|
||||
List<GenomeLoc> ret = null;
|
||||
try {
|
||||
File inputFile = new File(file_name);
|
||||
|
||||
// sometimes we see an empty file passed as a parameter, if so return an empty list
|
||||
if (inputFile.exists() && inputFile.length() < 1) {
|
||||
if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)
|
||||
return new ArrayList<GenomeLoc>();
|
||||
else {
|
||||
Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
|
||||
"may want to fix (or exclude) this file.");
|
||||
return new ArrayList<GenomeLoc>();
|
||||
}
|
||||
}
|
||||
IntervalList il = IntervalList.fromFile(inputFile);
|
||||
|
||||
// iterate through the list of merged intervals and add then as GenomeLocs
|
||||
ret = new ArrayList<GenomeLoc>();
|
||||
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||
for (Interval interval : il.getUniqueIntervals()) {
|
||||
ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd()));
|
||||
}
|
||||
return ret;
|
||||
// always return null instead of empty list
|
||||
return ret.isEmpty() ? null : ret;
|
||||
|
||||
} catch (Exception e) {
|
||||
}
|
||||
|
||||
// if that didn't work, try parsing file as an old fashioned string file
|
||||
catch (Exception e) {
|
||||
try {
|
||||
ret = new ArrayList<GenomeLoc>();
|
||||
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||
xReadLines reader = new xReadLines(new File(file_name));
|
||||
for(String line: reader) {
|
||||
List<GenomeLoc> loci = parseGenomeLocs(line, rule);
|
||||
if(loci != null)
|
||||
ret.addAll(loci);
|
||||
try {
|
||||
ret.add(parseGenomeInterval(line));
|
||||
}
|
||||
catch (Exception e2) {
|
||||
throw new StingException(String.format("Unable to parse interval: %s in file: %s", line, file_name));
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
|
||||
if(ret.isEmpty())
|
||||
return null;
|
||||
|
||||
for(GenomeLoc locus: ret)
|
||||
exceptionOnInvalidGenomeLocBounds(locus);
|
||||
return ret;
|
||||
} catch (Exception e2) {
|
||||
// always return null instead of empty list
|
||||
return ret.isEmpty() ? null : ret;
|
||||
}
|
||||
catch (Exception e2) {
|
||||
logger.error("Attempt to parse interval file in GATK format failed: " + e2.getMessage());
|
||||
e2.printStackTrace();
|
||||
throw new StingException("Unable to parse out interval file in either format", e);
|
||||
|
|
@ -471,10 +457,16 @@ public class GenomeLocParser {
|
|||
/**
|
||||
* verify the specified genome loc is valid, if it's not, throw an exception
|
||||
* Will not verify the location against contig bounds.
|
||||
*
|
||||
*
|
||||
* Validation:
|
||||
* checks that start and stop are positive, start < stop, and the contig is valid
|
||||
* does not check that genomeLoc is actually on the contig, so start could be > end of contig
|
||||
*
|
||||
* @param toReturn the genome loc we're about to return
|
||||
*
|
||||
* @return the genome loc if it's valid, otherwise we throw an exception
|
||||
*
|
||||
*/
|
||||
private static GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) {
|
||||
if (toReturn.getStart() < 0) {
|
||||
|
|
@ -496,16 +488,24 @@ public class GenomeLocParser {
|
|||
|
||||
/**
|
||||
* Verify the locus against the bounds of the contig.
|
||||
*
|
||||
* performs boundary validation for genome loc INTERVALS:
|
||||
* start and stop are on contig and start <= stop
|
||||
* does NOT check that start and stop > 0, or that contig is valid
|
||||
* for that reason, this function should only be called AFTER exceptionOnInvalidGenomeLoc()
|
||||
* exceptionOnInvalidGenomeLoc isn't included in this function to save time
|
||||
*
|
||||
* @param locus Locus to verify.
|
||||
*/
|
||||
private static void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) {
|
||||
exceptionOnInvalidGenomeLoc(locus);
|
||||
|
||||
int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength();
|
||||
if(locus.getStart() > contigSize)
|
||||
throw new StingException(String.format("GenomeLoc is invalid: locus start %d is after the end of contig %s",locus.getStart(),locus.getContig()));
|
||||
if(locus.getStop() > contigSize)
|
||||
throw new StingException(String.format("GenomeLoc is invalid: locus stop %d is after the end of contig %s",locus.getStop(),locus.getContig()));
|
||||
if (locus.getStart() > locus.getStop()) {
|
||||
throw new StingException("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -514,6 +514,8 @@ public class GenomeLocParser {
|
|||
* @param loc the location to validate
|
||||
*
|
||||
* @return true if the passed in GenomeLoc represents a valid location
|
||||
*
|
||||
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||
*/
|
||||
public static boolean validGenomeLoc(GenomeLoc loc) {
|
||||
checkSetup();
|
||||
|
|
@ -541,6 +543,8 @@ public class GenomeLocParser {
|
|||
* @param stop the stop position
|
||||
*
|
||||
* @return true if it's valid, false otherwise
|
||||
*
|
||||
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||
*/
|
||||
public static boolean validGenomeLoc(String contig, long start, long stop) {
|
||||
checkSetup();
|
||||
|
|
@ -556,6 +560,8 @@ public class GenomeLocParser {
|
|||
* @param stop the stop position
|
||||
*
|
||||
* @return true if it's valid, false otherwise
|
||||
*
|
||||
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||
*/
|
||||
public static boolean validGenomeLoc(int contigIndex, long start, long stop) {
|
||||
checkSetup();
|
||||
|
|
|
|||
|
|
@ -373,4 +373,31 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Check to see whether two genomeLocSortedSets are equal.
|
||||
* Note that this implementation ignores the contigInfo object.
|
||||
*
|
||||
*/ /*
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(other == null)
|
||||
return false;
|
||||
if(other instanceof GenomeLocSortedSet) {
|
||||
// send to a list, so we can ensure order correct
|
||||
List otherList = ((GenomeLocSortedSet)other).toList();
|
||||
List thisList = this.toList();
|
||||
if (otherList.size() != this.size())
|
||||
return false;
|
||||
|
||||
for (Integer i=0;i<thisList.size();i++) {
|
||||
if (otherList.get(i).equals(thisList.get(i)))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
} */
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ public class GenomeLocParserTest extends BaseTest {
|
|||
@Test(expected = RuntimeException.class)
|
||||
public void testGetContigIndex() {
|
||||
assertEquals(-1, GenomeLocParser.getContigIndex("blah",true)); // should not be in the reference
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetContigIndexValid() {
|
||||
|
|
@ -80,39 +80,7 @@ public class GenomeLocParserTest extends BaseTest {
|
|||
assertEquals(100, loc.getStop());
|
||||
assertEquals(1, loc.getStart());
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testParseBadLocations() {
|
||||
GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0", IntervalMergingRule.ALL);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseGoodLocations() {
|
||||
GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9", IntervalMergingRule.ALL);
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testParseGoodLocationsTooManySemiColons() {
|
||||
GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;", IntervalMergingRule.ALL);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOverlappingGoodLocationsWithAbuttingFlag() {
|
||||
List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
assertEquals(1, locs.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAbuttingGoodLocationsWithAbuttingOffFlag() {
|
||||
List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-4;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY);
|
||||
assertEquals(2, locs.size());
|
||||
}
|
||||
@Test
|
||||
public void testAbuttingGoodLocationsWithNoneFlag() {
|
||||
List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.NONE);
|
||||
assertEquals(2, locs.size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testCreateGenomeLoc1() {
|
||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1", 1, 100);
|
||||
|
|
@ -162,16 +130,6 @@ public class GenomeLocParserTest extends BaseTest {
|
|||
assertEquals(1, copy.getStart());
|
||||
}
|
||||
|
||||
/*@Test // - uncomment if you want to test speed
|
||||
public void testGenomeLocParserList() {
|
||||
long start = System.currentTimeMillis();
|
||||
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"}));
|
||||
Collections.sort(parsedIntervals);
|
||||
LinkedList<GenomeLoc> loc = new LinkedList<GenomeLoc>(GenomeLocParser.mergeIntervalLocations(parsedIntervals));
|
||||
long stop = System.currentTimeMillis();
|
||||
logger.warn("Elapsed time = " + (stop - start));
|
||||
}*/
|
||||
|
||||
@Test
|
||||
public void testGenomeLocPlusSign() {
|
||||
GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1+");
|
||||
|
|
|
|||
Loading…
Reference in New Issue