Reorganizing the way interval arguments are processed
Most of the changes occur in GenomeAnalysisEngine.java and GenomeLocParser.java: -- parseIntervalRegion and parseGenomeLocs combined into parseIntervalArguments -- initializeIntervals modified -- some helper functions deprecated for cleanliness Includes new set of unit tests, GenomeAnalysisEngineTest.java New restrictions: -- all interval arguments are now checked to be on the reference contig -- all interval files must have one of the following extensions: .picard, .bed, .list, .intervals, .interval_list git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3106 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c3c6e632d1
commit
b4f6f54502
|
|
@ -141,7 +141,7 @@ public class GenomeAnalysisEngine {
|
||||||
throw new StingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
|
throw new StingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// validate our parameters
|
// validate our parameters
|
||||||
if (my_walker == null)
|
if (my_walker == null)
|
||||||
throw new StingException("The walker passed to GenomeAnalysisEngine can not be null.");
|
throw new StingException("The walker passed to GenomeAnalysisEngine can not be null.");
|
||||||
|
|
||||||
|
|
@ -174,35 +174,105 @@ public class GenomeAnalysisEngine {
|
||||||
* Setup the intervals to be processed
|
* Setup the intervals to be processed
|
||||||
*/
|
*/
|
||||||
private void initializeIntervals() {
|
private void initializeIntervals() {
|
||||||
GenomeLocSortedSet excludeIntervals = null;
|
|
||||||
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
|
|
||||||
List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
|
|
||||||
excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
|
// return null if no interval arguments at all
|
||||||
List <GenomeLoc> parsedIntervals = parseIntervalRegion(argCollection.intervals);
|
if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null))
|
||||||
intervals = (parsedIntervals == null) ? null: GenomeLocSortedSet.createSetFromList(parsedIntervals);
|
return;
|
||||||
}
|
|
||||||
/*
|
|
||||||
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
|
|
||||||
intervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.intervals));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
if ( excludeIntervals != null ) {
|
|
||||||
GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
|
|
||||||
long toPruneSize = toPrune.coveredSize();
|
|
||||||
long toExcludeSize = excludeIntervals.coveredSize();
|
|
||||||
logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
|
|
||||||
logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
|
|
||||||
intervals = toPrune.substractRegions( excludeIntervals );
|
|
||||||
long intervalSize = intervals.coveredSize();
|
|
||||||
logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
|
|
||||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( intervals != null )
|
else {
|
||||||
logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
|
// if include argument isn't given, create new set of all possible intervals
|
||||||
|
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
|
||||||
|
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
|
||||||
|
parseIntervalArguments(argCollection.intervals, argCollection.intervalMerging));
|
||||||
|
|
||||||
|
// if no exclude arguments, can return parseIntervalArguments directly
|
||||||
|
if (argCollection.excludeIntervals == null)
|
||||||
|
intervals = includeSortedSet;
|
||||||
|
|
||||||
|
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||||
|
else {
|
||||||
|
GenomeLocSortedSet excludeSortedSet = parseIntervalArguments(argCollection.excludeIntervals, argCollection.intervalMerging);
|
||||||
|
intervals = includeSortedSet.substractRegions(excludeSortedSet);
|
||||||
|
|
||||||
|
// logging messages only printed when exclude (-XL) arguments are given
|
||||||
|
long toPruneSize = includeSortedSet.coveredSize();
|
||||||
|
long toExcludeSize = excludeSortedSet.coveredSize();
|
||||||
|
long intervalSize = intervals.coveredSize();
|
||||||
|
logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
|
||||||
|
logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
|
||||||
|
toExcludeSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
|
||||||
|
* Set is sorted and merged
|
||||||
|
*/
|
||||||
|
|
||||||
|
public static GenomeLocSortedSet parseIntervalArguments(final List<String> intervals) {
|
||||||
|
return parseIntervalArguments(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a GenomeLocSortedSet from a set of LIKE arguments - either -L or -XL
|
||||||
|
* Set is sorted and merged
|
||||||
|
*/
|
||||||
|
public static GenomeLocSortedSet parseIntervalArguments(List <String> argList, IntervalMergingRule mergingRule) {
|
||||||
|
|
||||||
|
List<GenomeLoc> rawIntervals = new ArrayList<GenomeLoc>(); // running list of raw GenomeLocs
|
||||||
|
|
||||||
|
for (String argument : argList) {
|
||||||
|
|
||||||
|
// if any interval argument is '-L all', consider all loci by returning no intervals
|
||||||
|
if (argument.equals("all")) {
|
||||||
|
if (argList.size() != 1) {
|
||||||
|
// throw error if '-L all' is not only interval - potentially conflicting commands
|
||||||
|
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// separate argument on semicolon first
|
||||||
|
for (String fileOrInterval : argument.split(";")) {
|
||||||
|
|
||||||
|
// if it's a file, add items to raw interval list
|
||||||
|
if (isFile(fileOrInterval))
|
||||||
|
rawIntervals.addAll(GenomeLocParser.intervalFileToList(fileOrInterval, mergingRule));
|
||||||
|
|
||||||
|
// otherwise treat as an interval -> parse and add to raw interval list
|
||||||
|
else {
|
||||||
|
rawIntervals.add(GenomeLocParser.parseGenomeInterval(fileOrInterval));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// redundant check => default no arguments is null, not empty list
|
||||||
|
if (rawIntervals.size() == 0)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// sort raw interval list
|
||||||
|
Collections.sort(rawIntervals);
|
||||||
|
|
||||||
|
// now merge raw interval list
|
||||||
|
rawIntervals = GenomeLocParser.mergeIntervalLocations(rawIntervals, mergingRule);
|
||||||
|
|
||||||
|
return GenomeLocSortedSet.createSetFromList(rawIntervals);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if string argument was intented as a file
|
||||||
|
* Accepted file extensions: .list, .interval_list, .bed, .picard
|
||||||
|
*/
|
||||||
|
private static boolean isFile(String str) {
|
||||||
|
// should we define list of file extensions as a public array somewhere?
|
||||||
|
// is regex or endsiwth better?
|
||||||
|
if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") ||
|
||||||
|
str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST")
|
||||||
|
|| str.toUpperCase().endsWith(".INTERVALS"))
|
||||||
|
return true;
|
||||||
|
else return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -319,53 +389,6 @@ public class GenomeAnalysisEngine {
|
||||||
return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads);
|
return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* setup the interval regions, from either the interval file of the genome region string
|
|
||||||
*
|
|
||||||
* @param intervals the list of intervals to parse
|
|
||||||
* @return a list of genomeLoc representing the interval file
|
|
||||||
*/
|
|
||||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals) {
|
|
||||||
return parseIntervalRegion(intervals, GenomeAnalysisEngine.instance.getArguments().intervalMerging);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* setup the interval regions, from either the interval file of the genome region string
|
|
||||||
*
|
|
||||||
* @param intervals the list of intervals to parse
|
|
||||||
* @param mergingRule the rule for merging intervals
|
|
||||||
* @return a list of genomeLoc representing the interval file
|
|
||||||
*/
|
|
||||||
public static List<GenomeLoc> parseIntervalRegion(final List<String> intervals, IntervalMergingRule mergingRule) {
|
|
||||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
|
||||||
for (String interval : intervals) {
|
|
||||||
// if any interval argument is '-L all', consider all loci by returning no intervals
|
|
||||||
if (interval.equals("all")) {
|
|
||||||
if (intervals.size() != 1) {
|
|
||||||
// throw error if '-L all' is not only interval - potentially conflicting commands
|
|
||||||
throw new StingException(String.format("Conflicting arguments: Intervals given along with \"-L all\""));
|
|
||||||
}
|
|
||||||
return new ArrayList<GenomeLoc>();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (new File(interval).exists()) {
|
|
||||||
// support for the bed style interval format
|
|
||||||
if (interval.toUpperCase().endsWith(".BED")) {
|
|
||||||
Utils.warnUser("Bed files are 0 based half-open intervals, which are converted to 1-based closed intervals in the GATK. " +
|
|
||||||
"Be aware that all output information and intervals are 1-based closed intervals.");
|
|
||||||
BedParser parser = new BedParser(new File(interval));
|
|
||||||
locs.addAll(parser.getSortedAndMergedLocations(mergingRule));
|
|
||||||
} else {
|
|
||||||
locs.addAll(GenomeLocParser.intervalFileToList(interval,mergingRule));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
locs.addAll(GenomeLocParser.parseGenomeLocs(interval,mergingRule));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
return locs;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets a unique identifier for the reader sourcing this read.
|
* Gets a unique identifier for the reader sourcing this read.
|
||||||
* @param read Read to examine.
|
* @param read Read to examine.
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,10 @@ import java.util.*;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||||
|
|
||||||
public class IntervalRodIterator implements Iterator<IntervalRod> {
|
public class IntervalRodIterator implements Iterator<IntervalRod> {
|
||||||
private List<GenomeLoc> locations = null;
|
//private List<GenomeLoc> locations = null;
|
||||||
private Iterator<GenomeLoc> iter;
|
private Iterator<GenomeLoc> iter;
|
||||||
private String trackName = null;
|
private String trackName = null;
|
||||||
|
|
||||||
|
|
@ -18,15 +19,15 @@ public class IntervalRodIterator implements Iterator<IntervalRod> {
|
||||||
|
|
||||||
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
|
public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) {
|
||||||
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
|
//System.out.printf("Parsing %s for intervals %s%n", file, trackName);
|
||||||
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Collections.singletonList(file.getPath()));
|
GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Collections.singletonList(file.getPath()));
|
||||||
//System.out.printf(" => got %d entries %n", locs.size());
|
//System.out.printf(" => got %d entries %n", locs.size());
|
||||||
return new IntervalRodIterator(trackName, locs);
|
return new IntervalRodIterator(trackName, locs);
|
||||||
}
|
}
|
||||||
|
|
||||||
public IntervalRodIterator(String trackName, List<GenomeLoc> locs) {
|
public IntervalRodIterator(String trackName, GenomeLocSortedSet locs) {
|
||||||
this.trackName = trackName;
|
this.trackName = trackName;
|
||||||
locations = locs;
|
//locations = locs;
|
||||||
iter = locations.iterator();
|
iter = locs.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
||||||
|
|
@ -126,8 +126,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||||
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
|
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
|
||||||
|
|
||||||
// read in the intervals for cleaning
|
// read in the intervals for cleaning
|
||||||
List<GenomeLoc> locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
|
GenomeLocSortedSet locs = GenomeAnalysisEngine.parseIntervalArguments(Arrays.asList(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY);
|
||||||
intervals = GenomeLocSortedSet.createSetFromList(locs).iterator();
|
intervals = locs.iterator();
|
||||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||||
|
|
||||||
// set up the output writer(s)
|
// set up the output writer(s)
|
||||||
|
|
|
||||||
|
|
@ -118,13 +118,11 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
||||||
|
|
||||||
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
|
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
|
||||||
if( vc != null && vc.getName().toUpperCase().startsWith("TRUTH") ) {
|
if( vc != null && vc.getName().toUpperCase().startsWith("TRUTH") ) {
|
||||||
if( vc.isSNP() && !vc.isFiltered() ) {
|
if( !vc.getGenotype(sampleName).isNoCall() ) {
|
||||||
if( !vc.getGenotype(sampleName).isNoCall() ) {
|
isInTruthSet = true;
|
||||||
isInTruthSet = true;
|
|
||||||
|
|
||||||
if( !vc.getGenotype(sampleName).isHomRef() ) {
|
if( !vc.getGenotype(sampleName).isHomRef() ) {
|
||||||
isTrueVariant = true;
|
isTrueVariant = true;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//if( vc.isPolymorphic() ) { //BUGBUG: I don't think this is the right thing to do here, there are many polymorphic sites in the truth data because there are many samples
|
//if( vc.isPolymorphic() ) { //BUGBUG: I don't think this is the right thing to do here, there are many polymorphic sites in the truth data because there are many samples
|
||||||
|
|
@ -259,7 +257,7 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
||||||
final double sensitivity = ((double) truePos[curveIndex]) / ((double) truePos[curveIndex] + falseNegGlobal[curveIndex] + falseNeg[curveIndex]);
|
final double sensitivity = ((double) truePos[curveIndex]) / ((double) truePos[curveIndex] + falseNegGlobal[curveIndex] + falseNeg[curveIndex]);
|
||||||
final double specificity = ((double) trueNegGlobal[curveIndex] + trueNeg[curveIndex]) /
|
final double specificity = ((double) trueNegGlobal[curveIndex] + trueNeg[curveIndex]) /
|
||||||
((double) falsePos[curveIndex] + trueNegGlobal[curveIndex] + trueNeg[curveIndex]);
|
((double) falsePos[curveIndex] + trueNegGlobal[curveIndex] + trueNeg[curveIndex]);
|
||||||
outputFile.print( String.format("%.8f,%.8f,%.8f,", qualCut[curveIndex], sensitivity, 1.0 - specificity) );
|
outputFile.print( String.format("%.4f,%.4f,%.4f,", qualCut[curveIndex], sensitivity, 1.0 - specificity) );
|
||||||
qualCut[curveIndex] += incrementQual[curveIndex];
|
qualCut[curveIndex] += incrementQual[curveIndex];
|
||||||
curveIndex++;
|
curveIndex++;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
|
import org.broadinstitute.sting.gatk.arguments.IntervalMergingRule;
|
||||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||||
|
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -133,24 +134,35 @@ public class GenomeLocParser {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load one or more intervals sources, sorting and merging overlapping intervals.
|
* parse a genome interval, from a location string
|
||||||
* @param intervalsSource Source of intervals.
|
*
|
||||||
* @param rule the merging rule we're using
|
* Performs interval-style validation:
|
||||||
* @return a list of sorted, merged intervals.
|
*
|
||||||
|
* contig is valid; start and stop less than the end; start <= sto
|
||||||
|
* @param str the string to parse
|
||||||
|
*
|
||||||
|
* @return a GenomeLoc representing the String
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public static List<GenomeLoc> parseIntervals(List<String> intervalsSource, IntervalMergingRule rule) {
|
|
||||||
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(intervalsSource);
|
public static GenomeLoc parseGenomeInterval(final String str) {
|
||||||
Collections.sort(parsedIntervals);
|
GenomeLoc ret = parseGenomeLoc(str);
|
||||||
return GenomeLocParser.mergeIntervalLocations(parsedIntervals, rule);
|
exceptionOnInvalidGenomeLocBounds(ret);
|
||||||
}
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* parse a genome location, from a location string
|
* parse a genome location, from a location string
|
||||||
|
*
|
||||||
|
* Performs read-style validation:
|
||||||
|
* checks that start and stop are positive, start < stop, and the contig is valid
|
||||||
|
* does not check that genomeLoc is actually on the contig
|
||||||
*
|
*
|
||||||
* @param str the string to parse
|
* @param str the string to parse
|
||||||
*
|
*
|
||||||
* @return a GenomeLoc representing the String
|
* @return a GenomeLoc representing the String
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public static GenomeLoc parseGenomeLoc(final String str) {
|
public static GenomeLoc parseGenomeLoc(final String str) {
|
||||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||||
|
|
@ -183,53 +195,18 @@ public class GenomeLocParser {
|
||||||
|
|
||||||
if (bad)
|
if (bad)
|
||||||
throw new StingException("Failed to parse Genome Location string: " + str);
|
throw new StingException("Failed to parse Genome Location string: " + str);
|
||||||
if (start < 0)
|
|
||||||
throw new StingException("Invalid Genome Location start < 0: " + str + ' ' + start);
|
|
||||||
if (stop < 0)
|
|
||||||
throw new StingException("Invalid Genome Location stop < 0: " + str + ' ' + stop);
|
|
||||||
if (contig == null)
|
|
||||||
throw new StingException("Invalid Genome Location contig == null : " + str);
|
|
||||||
|
|
||||||
if (start > stop)
|
// is the contig valid?
|
||||||
throw new StingException("Invalid Genome Location string; start position comes after end position: " + str );
|
if (!isContigValid(contig))
|
||||||
|
|
||||||
if (!isContigValid(contig))
|
|
||||||
throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference.");
|
throw new StingException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference.");
|
||||||
|
|
||||||
if (stop == Integer.MAX_VALUE && hasKnownContigOrdering())
|
if (stop == Integer.MAX_VALUE && hasKnownContigOrdering())
|
||||||
// lookup the actually stop position!
|
// lookup the actually stop position!
|
||||||
stop = getContigInfo(contig).getSequenceLength();
|
stop = getContigInfo(contig).getSequenceLength();
|
||||||
|
|
||||||
GenomeLoc loc = parseGenomeLoc(contig, start, stop);
|
|
||||||
return loc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig,true), start, stop);
|
||||||
* Useful utility function that parses a location string into a coordinate-order sorted
|
exceptionOnInvalidGenomeLoc(locus);
|
||||||
* array of GenomeLoc objects
|
return locus;
|
||||||
*
|
|
||||||
* @param str String representation of genome locs. Null string corresponds to no filter.
|
|
||||||
* @param rule the merging rule we're using
|
|
||||||
*
|
|
||||||
* @return Array of GenomeLoc objects corresponding to the locations in the string, sorted by coordinate order
|
|
||||||
*/
|
|
||||||
public static List<GenomeLoc> parseGenomeLocs(final String str, IntervalMergingRule rule) {
|
|
||||||
// Null string means no filter.
|
|
||||||
if (str == null) return null;
|
|
||||||
|
|
||||||
// Of the form: loc1;loc2;...
|
|
||||||
// Where each locN can be:
|
|
||||||
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
|
||||||
try {
|
|
||||||
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
|
|
||||||
for (String loc : str.split(";"))
|
|
||||||
locs.add(parseGenomeLoc(loc.trim()));
|
|
||||||
Collections.sort(locs);
|
|
||||||
locs = mergeIntervalLocations(locs, rule);
|
|
||||||
return locs;
|
|
||||||
} catch (Exception e) { // TODO: fix this so that it passes the message from the exception, and doesn't print it out
|
|
||||||
throw new StingException(String.format("Invalid locations string: %s, format is loc1;loc2; where loc1 < loc2. Each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str), e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -252,7 +229,7 @@ public class GenomeLocParser {
|
||||||
* @return the list of merged locations
|
* @return the list of merged locations
|
||||||
*/
|
*/
|
||||||
public static List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, IntervalMergingRule rule) {
|
public static List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, IntervalMergingRule rule) {
|
||||||
if (raw.size() <= 1 || rule == IntervalMergingRule.NONE)
|
if (raw.size() <= 1)
|
||||||
return raw;
|
return raw;
|
||||||
else {
|
else {
|
||||||
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
|
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
|
||||||
|
|
@ -282,19 +259,7 @@ public class GenomeLocParser {
|
||||||
*/
|
*/
|
||||||
private static boolean isContigValid(String contig) {
|
private static boolean isContigValid(String contig) {
|
||||||
int contigIndex = contigInfo.getSequenceIndex(contig);
|
int contigIndex = contigInfo.getSequenceIndex(contig);
|
||||||
return isSequenceIndexValid(contigIndex);
|
return contigIndex >= 0 && contigIndex < contigInfo.size();
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines whether the given sequence index is valid with respect to the sequence dictionary.
|
|
||||||
*
|
|
||||||
* @param sequenceIndex sequence index
|
|
||||||
*
|
|
||||||
* @return True if the sequence index is valid, false otherwise.
|
|
||||||
*/
|
|
||||||
private static boolean isSequenceIndexValid(int sequenceIndex) {
|
|
||||||
return sequenceIndex >= 0 && sequenceIndex < contigInfo.size();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -305,6 +270,9 @@ public class GenomeLocParser {
|
||||||
* @param stop Stop point.
|
* @param stop Stop point.
|
||||||
*
|
*
|
||||||
* @return The genome location, or a MalformedGenomeLocException if unparseable.
|
* @return The genome location, or a MalformedGenomeLocException if unparseable.
|
||||||
|
*
|
||||||
|
* Validation: only checks that contig is valid
|
||||||
|
* start/stop could be anything
|
||||||
*/
|
*/
|
||||||
public static GenomeLoc parseGenomeLoc(final String contig, long start, long stop) {
|
public static GenomeLoc parseGenomeLoc(final String contig, long start, long stop) {
|
||||||
if (!isContigValid(contig))
|
if (!isContigValid(contig))
|
||||||
|
|
@ -324,52 +292,70 @@ public class GenomeLocParser {
|
||||||
* @param rule also merge abutting intervals
|
* @param rule also merge abutting intervals
|
||||||
*/
|
*/
|
||||||
public static List<GenomeLoc> intervalFileToList(final String file_name, IntervalMergingRule rule) {
|
public static List<GenomeLoc> intervalFileToList(final String file_name, IntervalMergingRule rule) {
|
||||||
|
// try to open file
|
||||||
|
File inputFile = null;
|
||||||
|
try {
|
||||||
|
inputFile = new File(file_name);
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
throw new StingException("Could not open file", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if file is empty
|
||||||
|
if (inputFile.exists() && inputFile.length() < 1) {
|
||||||
|
if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)
|
||||||
|
return new ArrayList<GenomeLoc>();
|
||||||
|
else {
|
||||||
|
Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
|
||||||
|
"may want to fix (or exclude) this file.");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// case: BED file
|
||||||
|
if (file_name.toUpperCase().endsWith(".BED")) {
|
||||||
|
BedParser parser = new BedParser(inputFile);
|
||||||
|
return parser.getSortedAndMergedLocations(rule);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* IF not a BED file:
|
||||||
* first try to read it as an interval file since that's well structured
|
* first try to read it as an interval file since that's well structured
|
||||||
* we'll fail quickly if it's not a valid file. Then try to parse it as
|
* we'll fail quickly if it's not a valid file. Then try to parse it as
|
||||||
* a location string file
|
* a location string file
|
||||||
*/
|
*/
|
||||||
List<GenomeLoc> ret = null;
|
|
||||||
try {
|
try {
|
||||||
File inputFile = new File(file_name);
|
|
||||||
|
|
||||||
// sometimes we see an empty file passed as a parameter, if so return an empty list
|
|
||||||
if (inputFile.exists() && inputFile.length() < 1) {
|
|
||||||
if (GenomeAnalysisEngine.instance.getArguments().unsafe != ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST)
|
|
||||||
return new ArrayList<GenomeLoc>();
|
|
||||||
else {
|
|
||||||
Utils.warnUser("The interval file " + file_name + " is empty. The GATK will continue processing but you " +
|
|
||||||
"may want to fix (or exclude) this file.");
|
|
||||||
return new ArrayList<GenomeLoc>();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
IntervalList il = IntervalList.fromFile(inputFile);
|
IntervalList il = IntervalList.fromFile(inputFile);
|
||||||
|
|
||||||
// iterate through the list of merged intervals and add then as GenomeLocs
|
// iterate through the list of merged intervals and add then as GenomeLocs
|
||||||
ret = new ArrayList<GenomeLoc>();
|
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||||
for (Interval interval : il.getUniqueIntervals()) {
|
for (Interval interval : il.getUniqueIntervals()) {
|
||||||
ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd()));
|
ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), interval.getStart(), interval.getEnd()));
|
||||||
}
|
}
|
||||||
return ret;
|
// always return null instead of empty list
|
||||||
|
return ret.isEmpty() ? null : ret;
|
||||||
|
|
||||||
} catch (Exception e) {
|
}
|
||||||
|
|
||||||
|
// if that didn't work, try parsing file as an old fashioned string file
|
||||||
|
catch (Exception e) {
|
||||||
try {
|
try {
|
||||||
ret = new ArrayList<GenomeLoc>();
|
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
|
||||||
xReadLines reader = new xReadLines(new File(file_name));
|
xReadLines reader = new xReadLines(new File(file_name));
|
||||||
for(String line: reader) {
|
for(String line: reader) {
|
||||||
List<GenomeLoc> loci = parseGenomeLocs(line, rule);
|
try {
|
||||||
if(loci != null)
|
ret.add(parseGenomeInterval(line));
|
||||||
ret.addAll(loci);
|
}
|
||||||
|
catch (Exception e2) {
|
||||||
|
throw new StingException(String.format("Unable to parse interval: %s in file: %s", line, file_name));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
reader.close();
|
reader.close();
|
||||||
|
|
||||||
if(ret.isEmpty())
|
// always return null instead of empty list
|
||||||
return null;
|
return ret.isEmpty() ? null : ret;
|
||||||
|
}
|
||||||
for(GenomeLoc locus: ret)
|
catch (Exception e2) {
|
||||||
exceptionOnInvalidGenomeLocBounds(locus);
|
|
||||||
return ret;
|
|
||||||
} catch (Exception e2) {
|
|
||||||
logger.error("Attempt to parse interval file in GATK format failed: " + e2.getMessage());
|
logger.error("Attempt to parse interval file in GATK format failed: " + e2.getMessage());
|
||||||
e2.printStackTrace();
|
e2.printStackTrace();
|
||||||
throw new StingException("Unable to parse out interval file in either format", e);
|
throw new StingException("Unable to parse out interval file in either format", e);
|
||||||
|
|
@ -471,10 +457,16 @@ public class GenomeLocParser {
|
||||||
/**
|
/**
|
||||||
* verify the specified genome loc is valid, if it's not, throw an exception
|
* verify the specified genome loc is valid, if it's not, throw an exception
|
||||||
* Will not verify the location against contig bounds.
|
* Will not verify the location against contig bounds.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Validation:
|
||||||
|
* checks that start and stop are positive, start < stop, and the contig is valid
|
||||||
|
* does not check that genomeLoc is actually on the contig, so start could be > end of contig
|
||||||
*
|
*
|
||||||
* @param toReturn the genome loc we're about to return
|
* @param toReturn the genome loc we're about to return
|
||||||
*
|
*
|
||||||
* @return the genome loc if it's valid, otherwise we throw an exception
|
* @return the genome loc if it's valid, otherwise we throw an exception
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
private static GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) {
|
private static GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) {
|
||||||
if (toReturn.getStart() < 0) {
|
if (toReturn.getStart() < 0) {
|
||||||
|
|
@ -496,16 +488,24 @@ public class GenomeLocParser {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verify the locus against the bounds of the contig.
|
* Verify the locus against the bounds of the contig.
|
||||||
|
*
|
||||||
|
* performs boundary validation for genome loc INTERVALS:
|
||||||
|
* start and stop are on contig and start <= stop
|
||||||
|
* does NOT check that start and stop > 0, or that contig is valid
|
||||||
|
* for that reason, this function should only be called AFTER exceptionOnInvalidGenomeLoc()
|
||||||
|
* exceptionOnInvalidGenomeLoc isn't included in this function to save time
|
||||||
|
*
|
||||||
* @param locus Locus to verify.
|
* @param locus Locus to verify.
|
||||||
*/
|
*/
|
||||||
private static void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) {
|
private static void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) {
|
||||||
exceptionOnInvalidGenomeLoc(locus);
|
|
||||||
|
|
||||||
int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength();
|
int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength();
|
||||||
if(locus.getStart() > contigSize)
|
if(locus.getStart() > contigSize)
|
||||||
throw new StingException(String.format("GenomeLoc is invalid: locus start %d is after the end of contig %s",locus.getStart(),locus.getContig()));
|
throw new StingException(String.format("GenomeLoc is invalid: locus start %d is after the end of contig %s",locus.getStart(),locus.getContig()));
|
||||||
if(locus.getStop() > contigSize)
|
if(locus.getStop() > contigSize)
|
||||||
throw new StingException(String.format("GenomeLoc is invalid: locus stop %d is after the end of contig %s",locus.getStop(),locus.getContig()));
|
throw new StingException(String.format("GenomeLoc is invalid: locus stop %d is after the end of contig %s",locus.getStop(),locus.getContig()));
|
||||||
|
if (locus.getStart() > locus.getStop()) {
|
||||||
|
throw new StingException("Parameters to GenomeLocParser are incorrect: the start position is greater than the end position");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -514,6 +514,8 @@ public class GenomeLocParser {
|
||||||
* @param loc the location to validate
|
* @param loc the location to validate
|
||||||
*
|
*
|
||||||
* @return true if the passed in GenomeLoc represents a valid location
|
* @return true if the passed in GenomeLoc represents a valid location
|
||||||
|
*
|
||||||
|
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||||
*/
|
*/
|
||||||
public static boolean validGenomeLoc(GenomeLoc loc) {
|
public static boolean validGenomeLoc(GenomeLoc loc) {
|
||||||
checkSetup();
|
checkSetup();
|
||||||
|
|
@ -541,6 +543,8 @@ public class GenomeLocParser {
|
||||||
* @param stop the stop position
|
* @param stop the stop position
|
||||||
*
|
*
|
||||||
* @return true if it's valid, false otherwise
|
* @return true if it's valid, false otherwise
|
||||||
|
*
|
||||||
|
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||||
*/
|
*/
|
||||||
public static boolean validGenomeLoc(String contig, long start, long stop) {
|
public static boolean validGenomeLoc(String contig, long start, long stop) {
|
||||||
checkSetup();
|
checkSetup();
|
||||||
|
|
@ -556,6 +560,8 @@ public class GenomeLocParser {
|
||||||
* @param stop the stop position
|
* @param stop the stop position
|
||||||
*
|
*
|
||||||
* @return true if it's valid, false otherwise
|
* @return true if it's valid, false otherwise
|
||||||
|
*
|
||||||
|
* performs interval-style validation: contig is valid and atart and stop less than the end
|
||||||
*/
|
*/
|
||||||
public static boolean validGenomeLoc(int contigIndex, long start, long stop) {
|
public static boolean validGenomeLoc(int contigIndex, long start, long stop) {
|
||||||
checkSetup();
|
checkSetup();
|
||||||
|
|
|
||||||
|
|
@ -373,4 +373,31 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
|
|
||||||
return s.toString();
|
return s.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check to see whether two genomeLocSortedSets are equal.
|
||||||
|
* Note that this implementation ignores the contigInfo object.
|
||||||
|
*
|
||||||
|
*/ /*
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if(other == null)
|
||||||
|
return false;
|
||||||
|
if(other instanceof GenomeLocSortedSet) {
|
||||||
|
// send to a list, so we can ensure order correct
|
||||||
|
List otherList = ((GenomeLocSortedSet)other).toList();
|
||||||
|
List thisList = this.toList();
|
||||||
|
if (otherList.size() != this.size())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (Integer i=0;i<thisList.size();i++) {
|
||||||
|
if (otherList.get(i).equals(thisList.get(i)))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
|
||||||
|
} */
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ public class GenomeLocParserTest extends BaseTest {
|
||||||
@Test(expected = RuntimeException.class)
|
@Test(expected = RuntimeException.class)
|
||||||
public void testGetContigIndex() {
|
public void testGetContigIndex() {
|
||||||
assertEquals(-1, GenomeLocParser.getContigIndex("blah",true)); // should not be in the reference
|
assertEquals(-1, GenomeLocParser.getContigIndex("blah",true)); // should not be in the reference
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGetContigIndexValid() {
|
public void testGetContigIndexValid() {
|
||||||
|
|
@ -80,39 +80,7 @@ public class GenomeLocParserTest extends BaseTest {
|
||||||
assertEquals(100, loc.getStop());
|
assertEquals(100, loc.getStop());
|
||||||
assertEquals(1, loc.getStart());
|
assertEquals(1, loc.getStart());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expected = RuntimeException.class)
|
|
||||||
public void testParseBadLocations() {
|
|
||||||
GenomeLocParser.parseGenomeLocs("chr1:1-1;badChr:1-0", IntervalMergingRule.ALL);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testParseGoodLocations() {
|
|
||||||
GenomeLocParser.parseGenomeLocs("chr1:1-1;chr1:5-9", IntervalMergingRule.ALL);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test(expected = RuntimeException.class)
|
|
||||||
public void testParseGoodLocationsTooManySemiColons() {
|
|
||||||
GenomeLocParser.parseGenomeLocs("chr1:1-1;;chr1:5-9;", IntervalMergingRule.ALL);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testOverlappingGoodLocationsWithAbuttingFlag() {
|
|
||||||
List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY);
|
|
||||||
assertEquals(1, locs.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testAbuttingGoodLocationsWithAbuttingOffFlag() {
|
|
||||||
List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-4;chr1:5-9", IntervalMergingRule.OVERLAPPING_ONLY);
|
|
||||||
assertEquals(2, locs.size());
|
|
||||||
}
|
|
||||||
@Test
|
|
||||||
public void testAbuttingGoodLocationsWithNoneFlag() {
|
|
||||||
List<GenomeLoc> locs = GenomeLocParser.parseGenomeLocs("chr1:1-8;chr1:5-9", IntervalMergingRule.NONE);
|
|
||||||
assertEquals(2, locs.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCreateGenomeLoc1() {
|
public void testCreateGenomeLoc1() {
|
||||||
GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1", 1, 100);
|
GenomeLoc loc = GenomeLocParser.createGenomeLoc("chr1", 1, 100);
|
||||||
|
|
@ -162,16 +130,6 @@ public class GenomeLocParserTest extends BaseTest {
|
||||||
assertEquals(1, copy.getStart());
|
assertEquals(1, copy.getStart());
|
||||||
}
|
}
|
||||||
|
|
||||||
/*@Test // - uncomment if you want to test speed
|
|
||||||
public void testGenomeLocParserList() {
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
List<GenomeLoc> parsedIntervals = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(new String[]{"/humgen/gsa-scr1/GATK_Data/Validation_Data/bigChr1IntervalList.list"}));
|
|
||||||
Collections.sort(parsedIntervals);
|
|
||||||
LinkedList<GenomeLoc> loc = new LinkedList<GenomeLoc>(GenomeLocParser.mergeIntervalLocations(parsedIntervals));
|
|
||||||
long stop = System.currentTimeMillis();
|
|
||||||
logger.warn("Elapsed time = " + (stop - start));
|
|
||||||
}*/
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGenomeLocPlusSign() {
|
public void testGenomeLocPlusSign() {
|
||||||
GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1+");
|
GenomeLoc loc = GenomeLocParser.parseGenomeLoc("chr1:1+");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue