Contracts for GenomeLocParser and GenomeLoc are now fully implemented.

GenomeLocs can officially have any start/stop values from -Inf - +Inf.  Bounds w.r.t. the reference are enforced, optionally, by GenomeLocParser.  General code cleanup throughout the subsystem.

All validation code for GLs is now centralized, and all I/O systems now validate their inputs.  Because of this, the Picard interval processing code has been changed to examine whether an interval is valid, and only keep the valid intervals.  Note that the scatter/gather test was changed, because the original hg18 chr20 interval files as actually malformed (all records for some reason where on chr20).  

Many interval processing routines were moved to IntervalUtils, as this is their natural home.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5830 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-05-21 02:01:59 +00:00
parent 3aa56037af
commit e234589240
23 changed files with 276 additions and 413 deletions

View File

@ -655,10 +655,6 @@ public class GenomeAnalysisEngine {
argCollection.unsafe == ValidationExclusion.TYPE.ALL); argCollection.unsafe == ValidationExclusion.TYPE.ALL);
List<GenomeLoc> nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList); List<GenomeLoc> nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList);
genomeLocParser.validateGenomeLocList(nonRODIntervals);
genomeLocParser.validateGenomeLocList(rodIntervals);
List<GenomeLoc> allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule); List<GenomeLoc> allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule);
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging);

View File

@ -795,7 +795,7 @@ public class VariantContextUtils {
* @return the genomeLoc * @return the genomeLoc
*/ */
public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) {
return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd()); return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd(), true);
} }
public abstract static class AlleleMergeRule { public abstract static class AlleleMergeRule {

View File

@ -93,7 +93,7 @@ public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec<
GenomeLoc loc; GenomeLoc loc;
String chr = st.nextToken(); String chr = st.nextToken();
if ( chr.indexOf(":") != -1 ) { if ( chr.indexOf(":") != -1 ) {
loc = genomeLocParser.parseGenomeInterval(chr); loc = genomeLocParser.parseGenomeLoc(chr);
} else { } else {
if ( st.countTokens() < 3 ) if ( st.countTokens() < 3 )
throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line);
@ -123,7 +123,7 @@ public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec<
GenomeLoc loc; GenomeLoc loc;
if ( values.get(0).indexOf(":") != -1 ) if ( values.get(0).indexOf(":") != -1 )
loc = genomeLocParser.parseGenomeInterval(values.get(0)); loc = genomeLocParser.parseGenomeLoc(values.get(0));
else else
loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2))); loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2)));

View File

@ -177,7 +177,7 @@ public class BeagleCodec implements ReferenceDependentFeatureCodec<BeagleFeature
BeagleFeature bglFeature = new BeagleFeature(); BeagleFeature bglFeature = new BeagleFeature();
final GenomeLoc loc = genomeLocParser.parseGenomeLoc(tokens[markerPosition]); //GenomeLocParser.parseGenomeInterval(values.get(0)); - TODO switch to this final GenomeLoc loc = genomeLocParser.parseGenomeLoc(tokens[markerPosition]); //GenomeLocParser.parseGenomeLoc(values.get(0)); - TODO switch to this
//parse the location: common to all readers //parse the location: common to all readers
bglFeature.setChr(loc.getContig()); bglFeature.setChr(loc.getContig());

View File

@ -55,8 +55,8 @@ public class RefSeqCodec implements ReferenceDependentFeatureCodec<RefSeqFeature
else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line);
feature.setTranscript_interval(genomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
feature.setTranscript_coding_interval(genomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]))); feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])));
feature.setGene_name(fields[12]); feature.setGene_name(fields[12]);
String[] exon_starts = fields[9].split(","); String[] exon_starts = fields[9].split(",");
String[] exon_stops = fields[10].split(","); String[] exon_stops = fields[10].split(",");
@ -71,7 +71,7 @@ public class RefSeqCodec implements ReferenceDependentFeatureCodec<RefSeqFeature
ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length); ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length);
for ( int i = 0 ; i < exon_starts.length ; i++ ) { for ( int i = 0 ; i < exon_starts.length ; i++ ) {
exons.add(genomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
exon_frames.add(Integer.decode(eframes[i])); exon_frames.add(Integer.decode(eframes[i]));
} }

View File

@ -84,7 +84,7 @@ public class StringToGenomeLocIteratorAdapter implements Iterator<GenomeLoc> {
public GenomeLoc next() { public GenomeLoc next() {
if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeInterval( it.next() ); if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeLoc(it.next());
return BedParser.parseLocation( genomeLocParser,it.next() ); return BedParser.parseLocation( genomeLocParser,it.next() );
} }

View File

@ -245,7 +245,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if (IntervalUtils.isIntervalFile(fileOrInterval)) { if (IntervalUtils.isIntervalFile(fileOrInterval)) {
merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) ); merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) );
} else { } else {
rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeInterval(fileOrInterval)); rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval));
} }
} }
if ( ! rawIntervals.isEmpty() ) merger.add(rawIntervals.iterator()); if ( ! rawIntervals.isEmpty() ) merger.add(rawIntervals.iterator());

View File

@ -275,7 +275,7 @@ public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Eve
} }
public boolean isReportableEvent() { public boolean isReportableEvent() {
return getToolkit().getGenomeLocParser().validGenomeLoc(loc.getContig(), eventStartPos, eventStopPos) && eventStopPos >= 0 && eventStopPos - eventStartPos < maxIntervalSize; return getToolkit().getGenomeLocParser().isValidGenomeLoc(loc.getContig(), eventStartPos, eventStopPos, true) && eventStopPos >= 0 && eventStopPos - eventStartPos < maxIntervalSize;
} }
public String toString() { public String toString() {

View File

@ -101,7 +101,7 @@ public class CalcFullHaplotypesWalker extends RodWalker<Integer, Integer> {
if (waitingHaplotype == null) {// changed to a new contig: if (waitingHaplotype == null) {// changed to a new contig:
// Set the new haplotype to extend from [1, prevPosition] // Set the new haplotype to extend from [1, prevPosition]
if (prevPosition >= 1) { if (prevPosition >= 1) {
GenomeLoc startInterval = getToolkit().getGenomeLocParser().parseGenomeLoc(curLocus.getContig(), 1, prevPosition); GenomeLoc startInterval = getToolkit().getGenomeLocParser().createGenomeLoc(curLocus.getContig(), 1, prevPosition);
waitingHaplotype = new Haplotype(startInterval, sampleHapEntry.getKey()); waitingHaplotype = new Haplotype(startInterval, sampleHapEntry.getKey());
sampleHapEntry.setValue(waitingHaplotype); sampleHapEntry.setValue(waitingHaplotype);
} }
@ -201,7 +201,7 @@ public class CalcFullHaplotypesWalker extends RodWalker<Integer, Integer> {
public void extend(int stop) { public void extend(int stop) {
if (stop > interval.getStop()) if (stop > interval.getStop())
interval = getToolkit().getGenomeLocParser().parseGenomeLoc(interval.getContig(), interval.getStart(), stop); interval = getToolkit().getGenomeLocParser().createGenomeLoc(interval.getContig(), interval.getStart(), stop);
} }
public void incrementHetCount() { public void incrementHetCount() {

View File

@ -13,7 +13,9 @@ import java.io.Serializable;
* Date: Mar 2, 2009 * Date: Mar 2, 2009
* Time: 8:50:11 AM * Time: 8:50:11 AM
* *
* Genome location representation. It is *** 1 *** based closed. * Genome location representation. It is *** 1 *** based closed. Note that GenomeLocs start and stop values
* can be any positive or negative number, by design. Bound validation is a feature of the GenomeLocParser,
* and not a fundamental constraint of the GenomeLoc
*/ */
public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenomeLocation { public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenomeLocation {
/** /**
@ -48,7 +50,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
@Requires({ @Requires({
"contig != null", "contig != null",
"contigIndex >= 0", // I believe we aren't allowed to create GenomeLocs without a valid contigIndex "contigIndex >= 0", // I believe we aren't allowed to create GenomeLocs without a valid contigIndex
"start >= 0 && start <= stop"}) "start <= stop"})
protected GenomeLoc( final String contig, final int contigIndex, final int start, final int stop ) { protected GenomeLoc( final String contig, final int contigIndex, final int start, final int stop ) {
this.contigName = contig; this.contigName = contig;
this.contigIndex = contigIndex; this.contigIndex = contigIndex;
@ -122,8 +124,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
*/ */
@Requires({ @Requires({
"that != null", "that != null",
"! isUnmapped(this)", "isUnmapped(this) == isUnmapped(that)"})
"! isUnmapped(that)"})
@Ensures("result != null") @Ensures("result != null")
public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException { public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException {
if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {

View File

@ -25,8 +25,6 @@
package org.broadinstitute.sting.utils; package org.broadinstitute.sting.utils;
import java.util.List;
import com.google.java.contract.*; import com.google.java.contract.*;
import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
@ -178,6 +176,21 @@ public class GenomeLocParser {
} }
} }
/**
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc.
*
* @return True if the contig is valid. False otherwise.
*/
public boolean contigIsInDictionary(String contig) {
return contig != null && contigInfo.hasContig(contig);
}
public boolean indexIsInDictionary(final int index) {
return index >= 0 && contigInfo.hasContig(index);
}
/** /**
* get the contig's SAMSequenceRecord * get the contig's SAMSequenceRecord
* *
@ -185,27 +198,14 @@ public class GenomeLocParser {
* *
* @return the sam sequence record * @return the sam sequence record
*/ */
@Requires({"contig != null", "contigIsInDictionary(contig)"})
@Ensures("result != null") @Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
public SAMSequenceRecord getContigInfo(final String contig) { public SAMSequenceRecord getContigInfo(final String contig) {
if ( contig == null || ! contigIsInDictionary(contig) )
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
return contigInfo.getSequence(contig); return contigInfo.getSequence(contig);
} }
/**
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc.
*
* @return True if the contig is valid. False otherwise.
*/
@Requires({"contig != null"})
public boolean contigIsInDictionary(String contig) {
return contigInfo.hasContig(contig);
}
public boolean indexIsInDictionary(final int index) {
return contigInfo.hasContig(index);
}
/** /**
* Returns the contig index of a specified string version of the contig * Returns the contig index of a specified string version of the contig
* *
@ -213,47 +213,126 @@ public class GenomeLocParser {
* *
* @return the contig index, -1 if not found * @return the contig index, -1 if not found
*/ */
@Requires("contig != null")
@Ensures("result >= 0") @Ensures("result >= 0")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"})
public int getContigIndex(final String contig) { public int getContigIndex(final String contig) {
if (! contigInfo.hasContig(contig) ) return getContigInfo(contig).getSequenceIndex();
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
return contigInfo.getSequenceIndex(contig);
} }
@Requires("contig != null") @Requires("contig != null")
protected int getContigIndexWithoutException(final String contig) { protected int getContigIndexWithoutException(final String contig) {
if (! contigInfo.hasContig(contig)) if ( contig == null || ! contigInfo.hasContig(contig) )
return -1; return -1;
return contigInfo.getSequenceIndex(contig); return contigInfo.getSequenceIndex(contig);
} }
// --------------------------------------------------------------------------------------------------------------
//
// Low-level creation functions
//
// --------------------------------------------------------------------------------------------------------------
/**
* create a genome loc, given the contig name, start, and stop
*
* @param contig the contig name
* @param start the starting position
* @param stop the stop position
*
* @return a new genome loc
*/
@Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop)"})
public GenomeLoc createGenomeLoc(String contig, final int start, final int stop) {
return createGenomeLoc(contig, getContigIndex(contig), start, stop);
}
public GenomeLoc createGenomeLoc(String contig, final int start, final int stop, boolean mustBeOnReference) {
return createGenomeLoc(contig, getContigIndex(contig), start, stop, mustBeOnReference);
}
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop, false)"})
public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop) {
return createGenomeLoc(contig, index, start, stop, false);
}
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"})
public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) {
validateGenomeLoc(contig, index, start, stop, mustBeOnReference, true);
return new GenomeLoc(contig, index, start, stop);
}
/**
* validate a position or interval on the genome as valid
*
* Requires that contig exist in the master sequence dictionary, and that contig index be valid as well. Requires
* that start <= stop.
*
* if mustBeOnReference is true,
* performs boundary validation for genome loc INTERVALS:
* start and stop are on contig and start <= stop
*
* @param contig the contig name
* @param start the start position
* @param stop the stop position
*
* @return true if it's valid, false otherwise. If exceptOnError, then throws a UserException if invalid
*/
private boolean validateGenomeLoc(String contig, int contigIndex, int start, int stop, boolean mustBeOnReference, boolean exceptOnError) {
if ( ! contigInfo.hasContig(contig) )
return vglHelper(exceptOnError, String.format("Unknown contig %s", contig));
if (stop < start)
return vglHelper(exceptOnError, String.format("The stop position %d is less than start %d", stop, start));
if (contigIndex < 0)
return vglHelper(exceptOnError, String.format("The contig index %d is less than 0", contigIndex));
if (contigIndex >= contigInfo.getNSequences())
return vglHelper(exceptOnError, String.format("The contig index %d is greater than the stored sequence count (%d)", contigIndex, contigInfo.getNSequences()));
if ( mustBeOnReference ) {
if (start < 0)
return vglHelper(exceptOnError, String.format("The start position %d is less than 0", start));
if (stop < 0)
return vglHelper(exceptOnError, String.format("The stop position %d is less than 0", stop));
int contigSize = contigInfo.getSequence(contigIndex).getSequenceLength();
if (start > contigSize || stop > contigSize)
return vglHelper(exceptOnError, String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize));
}
// we passed
return true;
}
public boolean isValidGenomeLoc(String contig, int start, int stop, boolean mustBeOnReference ) {
return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, mustBeOnReference, false);
}
public boolean isValidGenomeLoc(String contig, int start, int stop ) {
return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, true, false);
}
private boolean vglHelper(boolean exceptOnError, String msg) {
if ( exceptOnError )
throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect:" + msg);
else
return false;
}
// --------------------------------------------------------------------------------------------------------------
//
// Parsing genome locs
//
// --------------------------------------------------------------------------------------------------------------
/** /**
* parse a genome interval, from a location string * parse a genome interval, from a location string
* *
* Performs interval-style validation: * Performs interval-style validation:
* *
* contig is valid; start and stop less than the end; start <= sto * contig is valid; start and stop less than the end; start <= stop, and start/stop are on the contig
* @param str the string to parse
*
* @return a GenomeLoc representing the String
*
*/
@Requires("str != null")
@Ensures("result != null")
public GenomeLoc parseGenomeInterval(final String str) {
GenomeLoc ret = parseGenomeLoc(str);
exceptionOnInvalidGenomeLocBounds(ret);
return ret;
}
/**
* parse a genome location, from a location string
*
* Performs read-style validation:
* checks that start and stop are positive, start < stop, and the contig is valid
* does not check that genomeLoc is actually on the contig
*
* @param str the string to parse * @param str the string to parse
* *
* @return a GenomeLoc representing the String * @return a GenomeLoc representing the String
@ -296,22 +375,15 @@ public class GenomeLocParser {
// is the contig valid? // is the contig valid?
if (!contigIsInDictionary(contig)) if (!contigIsInDictionary(contig))
throw new UserException("Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?"); throw new UserException.MalformedGenomeLoc("Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?");
if (stop == Integer.MAX_VALUE) if (stop == Integer.MAX_VALUE)
// lookup the actually stop position! // lookup the actually stop position!
stop = getContigInfo(contig).getSequenceLength(); stop = getContigInfo(contig).getSequenceLength();
GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig), start, stop); return createGenomeLoc(contig, getContigIndex(contig), start, stop, true);
exceptionOnInvalidGenomeLoc(locus);
return locus;
} }
// --------------------------------------------------------------------------------------------------------------
//
// Parsing string representations
//
// --------------------------------------------------------------------------------------------------------------
/** /**
* Parses a number like 1,000,000 into a long. * Parses a number like 1,000,000 into a long.
* @param pos * @param pos
@ -342,49 +414,11 @@ public class GenomeLocParser {
} }
} }
/** // --------------------------------------------------------------------------------------------------------------
* Use this static constructor when the input data is under limited control (i.e. parsing user data). //
* // Parsing string representations
* @param contig Contig to parse. //
* @param start Starting point. // --------------------------------------------------------------------------------------------------------------
* @param stop Stop point.
*
* @return The genome location, or a MalformedGenomeLocException if unparseable.
*
* Validation: only checks that contig is valid
* start/stop could be anything
*/
public GenomeLoc parseGenomeLoc(final String contig, int start, int stop) {
if (!contigIsInDictionary(contig))
throw new MalformedGenomeLocException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?");
return new GenomeLoc(contig, getContigIndex(contig), start, stop);
}
/**
* get the sequence name from a sequence index
*
* @param contigIndex get the contig index
*
* @return the string that represents that contig name
*/
@Requires("indexIsInDictionary(contigIndex)")
@Ensures("result != null")
private String getSequenceNameFromIndex(int contigIndex) {
return contigInfo.getSequence(contigIndex).getSequenceName();
}
/**
* create a genome loc, given the contig name, start, and stop
*
* @param contig the contig name
* @param start the starting position
* @param stop the stop position
*
* @return a new genome loc
*/
public GenomeLoc createGenomeLoc(String contig, final int start, final int stop) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), start, stop));
}
/** /**
* create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position, * create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position,
@ -394,183 +428,44 @@ public class GenomeLocParser {
* *
* @return * @return
*/ */
@Requires("read != null")
@Ensures("result != null")
public GenomeLoc createGenomeLoc(final SAMRecord read) { public GenomeLoc createGenomeLoc(final SAMRecord read) {
GenomeLoc loc;
if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == -1 ) if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == -1 )
// read is unmapped and not placed anywhere on the genome // read is unmapped and not placed anywhere on the genome
loc = GenomeLoc.UNMAPPED; return GenomeLoc.UNMAPPED;
else { else {
int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : read.getAlignmentEnd(); int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : read.getAlignmentEnd();
loc = new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end); return createGenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end, false);
} }
return exceptionOnInvalidGenomeLoc(loc);
} }
/** /**
* create a new genome loc, given the contig name, and a single position * create a new genome loc, given the contig name, and a single position. Must be on the reference
* *
* @param contig the contig name * @param contig the contig name
* @param pos the postion * @param pos the postion
* *
* @return a genome loc representing a single base at the specified postion on the contig * @return a genome loc representing a single base at the specified postion on the contig
*/ */
@Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"})
public GenomeLoc createGenomeLoc(final String contig, final int pos) { public GenomeLoc createGenomeLoc(final String contig, final int pos) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), pos, pos)); return createGenomeLoc(contig, getContigIndex(contig), pos, pos);
} }
/** // /**
* Creates a new GenomeLoc without performing any validation on its contig or bounds. // * Creates a new GenomeLoc without performing any validation on its contig or bounds.
* FOR UNIT TESTING PURPOSES ONLY! // * FOR UNIT TESTING PURPOSES ONLY!
* // *
* @param contig the contig name // * @param contig the contig name
* @param start start position of the interval // * @param start start position of the interval
* @param stop stop position of the interval // * @param stop stop position of the interval
* @return a new GenomeLoc representing the specified location // * @return a new GenomeLoc representing the specified location
*/ // */
public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { // public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) {
return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop); // return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop);
} // }
/**
* verify the specified genome loc is valid, if it's not, throw an exception
* Will not verify the location against contig bounds.
*
*
* Validation:
* checks that start and stop are positive, start < stop, and the contig is valid
* does not check that genomeLoc is actually on the contig, so start could be > end of contig
*
* @param toReturn the genome loc we're about to return
*
* @return the genome loc if it's valid, otherwise we throw an exception
*
*/
private GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) {
if ( GenomeLoc.isUnmapped(toReturn) ) {
return toReturn;
}
if (toReturn.getStart() < 0) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0 " +
"in interval: " + toReturn);
}
if ((toReturn.getStop() != -1) && (toReturn.getStop() < 0)) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0 " +
"in interval: " + toReturn);
}
if (toReturn.getContigIndex() < 0) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0 " +
"in interval: " + toReturn);
}
if (toReturn.getContigIndex() >= contigInfo.getNSequences()) {
throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater than " +
"the stored sequence count in interval: " + toReturn);
}
return toReturn;
}
/**
* Verify the locus against the bounds of the contig.
*
* performs boundary validation for genome loc INTERVALS:
* start and stop are on contig and start <= stop
* does NOT check that start and stop > 0, or that contig is valid
* for that reason, this function should only be called AFTER exceptionOnInvalidGenomeLoc()
* exceptionOnInvalidGenomeLoc isn't included in this function to save time
*
* @param locus Locus to verify.
*/
private void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) {
if ( GenomeLoc.isUnmapped(locus) ) {
return;
}
int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength();
if(locus.getStart() > contigSize)
throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus start is after the end of contig",locus);
if(locus.getStop() > contigSize)
throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus stop is after the end of contig",locus);
if (locus.getStart() > locus.getStop()) {
throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is " +
"greater than the end position", locus);
}
}
/**
* Validates each of the GenomeLocs in the list passed as an argument: each GenomeLoc must refer to a
* valid contig, and its bounds must be within the bounds of its contig. Throws a MalformedGenomeLoc
* exception if an invalid GenomeLoc is found.
*
* @param genomeLocList The list of GenomeLocs to validate
*/
public void validateGenomeLocList( List<GenomeLoc> genomeLocList ) {
if ( genomeLocList == null ) {
return;
}
for ( GenomeLoc loc : genomeLocList ) {
try {
exceptionOnInvalidGenomeLoc(loc);
exceptionOnInvalidGenomeLocBounds(loc);
}
catch ( UserException e ) {
throw e;
}
catch ( ReviewedStingException e ) {
throw new UserException.MalformedGenomeLoc(e.getMessage());
}
}
}
/**
* validate a position or interval on the genome as valid
*
* @param contig the contig name
* @param start the start position
* @param stop the stop position
*
* @return true if it's valid, false otherwise
*
* performs interval-style validation: contig is valid and atart and stop less than the end
*/
public boolean validGenomeLoc(String contig, int start, int stop) {
if ( ! contigInfo.hasContig(contig) )
return false;
else
return validGenomeLoc(getContigIndex(contig), start, stop);
}
/**
* validate a position or interval on the genome as valid
*
* @param contigIndex the contig name
* @param start the start position
* @param stop the stop position
*
* @return true if it's valid, false otherwise
*
* performs interval-style validation: contig is valid and atart and stop less than the end
*/
public boolean validGenomeLoc(int contigIndex, int start, int stop) {
// quick check before we get the contig size, is the contig number valid
if ((contigIndex < 0) || // the contig index has to be positive
(contigIndex >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs)
return false;
int contigSize = contigInfo.getSequence(contigIndex).getSequenceLength();
if ((start < 0) || // start must be greater than 0
(stop < 0 ) || // the stop must be greater than 0
(start > contigSize) || // the start must be before or equal to the contig end
(stop > contigSize)) // the stop must also be before or equal to the contig end
return false;
// we passed
return true;
}
/** /**
* create a new genome loc from an existing loc, with a new start position * create a new genome loc from an existing loc, with a new start position
@ -579,12 +474,11 @@ public class GenomeLocParser {
* *
* @param loc the old location * @param loc the old location
* @param start a new start position * @param start a new start position
* TODO -- move to me GenomeLoc class itself
* *
* @return the newly created genome loc * @return the newly created genome loc
*/ */
public GenomeLoc setStart(GenomeLoc loc, int start) { public GenomeLoc setStart(GenomeLoc loc, int start) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop())); return createGenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop());
} }
/** /**
@ -595,11 +489,10 @@ public class GenomeLocParser {
* @param loc the old location * @param loc the old location
* @param stop a new stop position * @param stop a new stop position
* *
* TODO -- move to me GenomeLoc class itself
* @return * @return
*/ */
public GenomeLoc setStop(GenomeLoc loc, int stop) { public GenomeLoc setStop(GenomeLoc loc, int stop) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop)); return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop);
} }
/** /**
@ -607,7 +500,6 @@ public class GenomeLocParser {
* *
* @param loc the old location * @param loc the old location
* *
* TODO -- move to me GenomeLoc class itself
* @return a new genome loc * @return a new genome loc
*/ */
public GenomeLoc incPos(GenomeLoc loc) { public GenomeLoc incPos(GenomeLoc loc) {
@ -620,11 +512,10 @@ public class GenomeLocParser {
* @param loc the old location * @param loc the old location
* @param by how much to move the start and stop by * @param by how much to move the start and stop by
* *
* TODO -- move to me GenomeLoc class itself
* @return a new genome loc * @return a new genome loc
*/ */
public GenomeLoc incPos(GenomeLoc loc, int by) { public GenomeLoc incPos(GenomeLoc loc, int by) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by)); return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by);
} }
/** /**
@ -636,6 +527,7 @@ public class GenomeLocParser {
@Ensures("result != null") @Ensures("result != null")
public GenomeLoc createOverEntireContig(String contigName) { public GenomeLoc createOverEntireContig(String contigName) {
SAMSequenceRecord contig = contigInfo.getSequence(contigName); SAMSequenceRecord contig = contigInfo.getSequence(contigName);
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength())); return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
} }
} }

View File

@ -253,12 +253,14 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* |------| + |--------| * |------| + |--------|
* *
*/ */
GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1); int afterStop = g.getStop(), afterStart = e.getStop() + 1;
GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop()); int beforeStop = e.getStart() - 1, beforeStart = g.getStart();
if (after.getStop() - after.getStart() >= 0) { if (afterStop - afterStart >= 0) {
GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop);
l.add(after); l.add(after);
} }
if (before.getStop() - before.getStart() >= 0) { if (beforeStop - beforeStart >= 0) {
GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop);
l.add(before); l.add(before);
} }

View File

@ -1,39 +0,0 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* User: hanna
* Date: Jun 2, 2009
* Time: 11:43:48 AM
* BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
* Software and documentation are copyright 2005 by the Broad Institute.
* All rights are reserved.
*
* Users acknowledge that this software is supplied without any warranty or support.
* The Broad Institute is not responsible for its use, misuse, or
* functionality.
*/
/**
* Indicates that something was wrong with in the parameters passed to create a GenomeLoc...
* bad sequence id out of bounds, etc.
*/
public class MalformedGenomeLocException extends ReviewedStingException {
/**
* Create a new MalformedGenomeLocException with the given message. Does not preserve the existing stack trace.
* @param message The message.
*/
public MalformedGenomeLocException( String message ) {
super(message);
}
/**
* Create a new MalformedGenomeLocException with the given message and root cause.
* @param message The message.
* @param t The root cause.
*/
public MalformedGenomeLocException( String message, Throwable t ) {
super(message,t);
}
}

View File

@ -88,7 +88,7 @@ public class BedParser {
} }
// we currently drop the rest of the bed record, which can contain names, scores, etc // we currently drop the rest of the bed record, which can contain names, scores, etc
return genomeLocParser.createGenomeLoc(contig, start, stop); return genomeLocParser.createGenomeLoc(contig, start, stop, true);
} }

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.interval;
import net.sf.picard.util.Interval; import net.sf.picard.util.Interval;
import net.sf.picard.util.IntervalList; import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
@ -24,6 +25,8 @@ import java.io.File;
* @version 0.1 * @version 0.1
*/ */
public class IntervalUtils { public class IntervalUtils {
private static Logger logger = Logger.getLogger(IntervalUtils.class);
/** /**
* Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files, * Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files,
* intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally, * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally,
@ -70,7 +73,7 @@ public class IntervalUtils {
// otherwise treat as an interval -> parse and add to raw interval list // otherwise treat as an interval -> parse and add to raw interval list
else { else {
rawIntervals.add(parser.parseGenomeInterval(fileOrInterval)); rawIntervals.add(parser.parseGenomeLoc(fileOrInterval));
} }
} }
} }
@ -103,28 +106,41 @@ public class IntervalUtils {
* first try to read it as a Picard interval file since that's well structured * first try to read it as a Picard interval file since that's well structured
* we'll fail quickly if it's not a valid file. * we'll fail quickly if it's not a valid file.
*/ */
boolean isPicardInterval = false;
try { try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary // Note: Picard will skip over intervals with contigs not in the sequence dictionary
IntervalList il = IntervalList.fromFile(inputFile); IntervalList il = IntervalList.fromFile(inputFile);
isPicardInterval = true;
int nInvalidIntervals = 0;
for (Interval interval : il.getIntervals()) { for (Interval interval : il.getIntervals()) {
ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd())); if ( glParser.isValidGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true))
ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true));
else {
nInvalidIntervals++;
}
} }
if ( nInvalidIntervals > 0 )
logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile);
} }
// if that didn't work, try parsing file as a GATK interval file // if that didn't work, try parsing file as a GATK interval file
catch (Exception e) { catch (Exception e) {
try { if ( isPicardInterval ) // definitely a picard file, but we failed to parse
XReadLines reader = new XReadLines(new File(file_name)); throw new UserException.CouldNotReadInputFile(inputFile, e);
for(String line: reader) { else {
if ( line.trim().length() > 0 ) { try {
ret.add(glParser.parseGenomeInterval(line)); XReadLines reader = new XReadLines(new File(file_name));
for(String line: reader) {
if ( line.trim().length() > 0 ) {
ret.add(glParser.parseGenomeLoc(line));
}
} }
reader.close();
}
catch (IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
} }
reader.close();
}
catch (IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
} }
} }
} }

View File

@ -113,7 +113,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
String contig, int intervalStart, int intervalEnd ) throws Exception { String contig, int intervalStart, int intervalEnd ) throws Exception {
List<String> intervalArgs = new ArrayList<String>(); List<String> intervalArgs = new ArrayList<String>();
List<GenomeLoc> rodIntervals = Arrays.asList(genomeLocParser.createGenomeLocWithoutValidation(contig, intervalStart, intervalEnd)); List<GenomeLoc> rodIntervals = Arrays.asList(genomeLocParser.createGenomeLoc(contig, intervalStart, intervalEnd, true));
testEngine.loadIntervals(intervalArgs, rodIntervals); testEngine.loadIntervals(intervalArgs, rodIntervals);
} }
@ -124,7 +124,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
// We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd, // We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd,
// since the ending point is an open interval. // since the ending point is an open interval.
File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed", File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed",
String.format("%s %d %d", contig, intervalStart - 1, intervalEnd)); String.format("%s %d %d", contig, intervalStart -1, intervalEnd));
List<String> intervalArgs = Arrays.asList(bedFile.getAbsolutePath()); List<String> intervalArgs = Arrays.asList(bedFile.getAbsolutePath());
List<GenomeLoc> rodIntervals = new ArrayList<GenomeLoc>(); List<GenomeLoc> rodIntervals = new ArrayList<GenomeLoc>();

View File

@ -80,7 +80,7 @@ public class VariantJEXLContextUnitTest extends BaseTest {
} catch (Exception e) { } catch (Exception e) {
Assert.fail("Unable to create expression" + e.getMessage()); Assert.fail("Unable to create expression" + e.getMessage());
} }
snpLoc = genomeLocParser.createGenomeLoc("chr1", 10, 10); snpLoc = genomeLocParser.createGenomeLoc("chr1", 10, 10, true);
} }
@BeforeMethod @BeforeMethod

View File

@ -275,7 +275,7 @@ public abstract class LocusViewTemplate extends BaseTest {
private static ReferenceSequenceFile fakeReferenceSequenceFile() { private static ReferenceSequenceFile fakeReferenceSequenceFile() {
return new ReferenceSequenceFile() { return new ReferenceSequenceFile() {
public SAMSequenceDictionary getSequenceDictionary() { public SAMSequenceDictionary getSequenceDictionary() {
SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1"); SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1", 1000000);
SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord)); SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord));
return dictionary; return dictionary;
} }

View File

@ -29,7 +29,7 @@ public class GenomeLocParserUnitTest extends BaseTest {
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
} }
@Test(expectedExceptions=RuntimeException.class) @Test(expectedExceptions=UserException.MalformedGenomeLoc.class)
public void testGetContigIndex() { public void testGetContigIndex() {
assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference
} }
@ -78,9 +78,9 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test @Test
public void testParseGoodString() { public void testParseGoodString() {
GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-100"); GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-10");
assertEquals(0, loc.getContigIndex()); assertEquals(0, loc.getContigIndex());
assertEquals(loc.getStop(), 100); assertEquals(loc.getStop(), 10);
assertEquals(loc.getStart(), 1); assertEquals(loc.getStart(), 1);
} }
@ -165,7 +165,7 @@ public class GenomeLocParserUnitTest extends BaseTest {
assertEquals(loc.getStart(), 1); assertEquals(loc.getStart(), 1);
} }
@Test(expectedExceptions=ReviewedStingException.class) @Test(expectedExceptions=UserException.class)
public void testGenomeLocBad2() { public void testGenomeLocBad2() {
GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-500-0"); GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-500-0");
assertEquals(loc.getContigIndex(), 0); assertEquals(loc.getContigIndex(), 0);
@ -173,7 +173,7 @@ public class GenomeLocParserUnitTest extends BaseTest {
assertEquals(loc.getStart(), 1); assertEquals(loc.getStart(), 1);
} }
@Test(expectedExceptions=ReviewedStingException.class) @Test(expectedExceptions=UserException.class)
public void testGenomeLocBad3() { public void testGenomeLocBad3() {
GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1--0"); GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1--0");
assertEquals(loc.getContigIndex(), 0); assertEquals(loc.getContigIndex(), 0);
@ -184,19 +184,11 @@ public class GenomeLocParserUnitTest extends BaseTest {
// test out the validating methods // test out the validating methods
@Test @Test
public void testValidationOfGenomeLocs() { public void testValidationOfGenomeLocs() {
assertTrue(genomeLocParser.validGenomeLoc("chr1",1,1)); assertTrue(genomeLocParser.isValidGenomeLoc("chr1",1,1));
assertTrue(!genomeLocParser.validGenomeLoc("chr2",1,1)); // shouldn't have an entry assertTrue(!genomeLocParser.isValidGenomeLoc("chr2",1,1)); // shouldn't have an entry
assertTrue(!genomeLocParser.validGenomeLoc("chr1",1,11)); // past the end of the contig assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig
assertTrue(!genomeLocParser.validGenomeLoc("chr1",-1,10)); // bad start assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start
assertTrue(!genomeLocParser.validGenomeLoc("chr1",1,-2)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
assertTrue(!genomeLocParser.validGenomeLoc("chr1",10,11)); // bad start, past end assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
assertTrue(genomeLocParser.validGenomeLoc(0,1,1));
assertTrue(!genomeLocParser.validGenomeLoc(1,1,1)); // shouldn't have an entry
assertTrue(!genomeLocParser.validGenomeLoc(0,1,11)); // past the end of the contig
assertTrue(!genomeLocParser.validGenomeLoc(-1,0,10)); // bad start
assertTrue(!genomeLocParser.validGenomeLoc(0,1,-2)); // bad stop
assertTrue(!genomeLocParser.validGenomeLoc(0,10,11)); // bad start, past end
} }
} }

View File

@ -138,7 +138,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
return hg18ReferenceLocs; return hg18ReferenceLocs;
List<GenomeLoc> locs = new ArrayList<GenomeLoc>(); List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
for (String interval: intervals) for (String interval: intervals)
locs.add(hg18GenomeLocParser.parseGenomeInterval(interval)); locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
return locs; return locs;
} }
@ -167,9 +167,9 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testFixedScatterIntervalsBasic() { public void testFixedScatterIntervalsBasic() {
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
List<File> files = testFiles("basic.", 3, ".intervals"); List<File> files = testFiles("basic.", 3, ".intervals");
@ -192,10 +192,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterFixedIntervalsLessFiles() { public void testScatterFixedIntervalsLessFiles() {
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeInterval("chr4"); GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4");
List<File> files = testFiles("less.", 3, ".intervals"); List<File> files = testFiles("less.", 3, ".intervals");
@ -234,10 +234,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterFixedIntervalsStart() { public void testScatterFixedIntervalsStart() {
List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeInterval("chr1:1-2"); GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2");
GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeInterval("chr1:4-5"); GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:1-1"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals"); List<File> files = testFiles("split.", 3, ".intervals");
@ -262,10 +262,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterFixedIntervalsMiddle() { public void testScatterFixedIntervalsMiddle() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); List<String> intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeInterval("chr2:1-2"); GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2");
GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeInterval("chr2:4-5"); GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals"); List<File> files = testFiles("split.", 3, ".intervals");
@ -290,10 +290,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterFixedIntervalsEnd() { public void testScatterFixedIntervalsEnd() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); List<String> intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-2"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2");
GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeInterval("chr3:1-2"); GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2");
GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeInterval("chr3:4-5"); GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5");
List<File> files = testFiles("split.", 3, ".intervals"); List<File> files = testFiles("split.", 3, ".intervals");
@ -322,10 +322,13 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
int[] counts = { int[] counts = {
5169, 5573, 10017, 10567, 10551, 125, 138, 287, 291, 312, 105, 155, 324,
5087, 4908, 10120, 10435, 10399, 295, 298, 141, 121, 285, 302, 282, 88,
5391, 4735, 10621, 10352, 10654, 116, 274, 282, 248
5227, 5256, 10151, 9649, 9825 // 5169, 5573, 10017, 10567, 10551,
// 5087, 4908, 10120, 10435, 10399,
// 5391, 4735, 10621, 10352, 10654,
// 5227, 5256, 10151, 9649, 9825
}; };
//String splitCounts = ""; //String splitCounts = "";
@ -368,9 +371,9 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterContigIntervalsOrder() { public void testScatterContigIntervalsOrder() {
List<String> intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); List<String> intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2");
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:1-1"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
List<File> files = testFiles("split.", 3, ".intervals"); List<File> files = testFiles("split.", 3, ".intervals");
@ -391,9 +394,9 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterContigIntervalsBasic() { public void testScatterContigIntervalsBasic() {
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
List<File> files = testFiles("contig_basic.", 3, ".intervals"); List<File> files = testFiles("contig_basic.", 3, ".intervals");
@ -414,10 +417,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterContigIntervalsLessFiles() { public void testScatterContigIntervalsLessFiles() {
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3");
GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeInterval("chr4"); GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4");
List<File> files = testFiles("contig_less.", 3, ".intervals"); List<File> files = testFiles("contig_less.", 3, ".intervals");
@ -446,10 +449,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterContigIntervalsStart() { public void testScatterContigIntervalsStart() {
List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); List<String> intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2");
GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeInterval("chr1:1-2"); GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2");
GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeInterval("chr1:4-5"); GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:1-1"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
List<File> files = testFiles("contig_split_start.", 3, ".intervals"); List<File> files = testFiles("contig_split_start.", 3, ".intervals");
@ -472,10 +475,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterContigIntervalsMiddle() { public void testScatterContigIntervalsMiddle() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); List<String> intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2");
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeInterval("chr2:1-2"); GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2");
GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeInterval("chr2:4-5"); GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5");
GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2");
List<File> files = testFiles("contig_split_middle.", 3, ".intervals"); List<File> files = testFiles("contig_split_middle.", 3, ".intervals");
@ -498,10 +501,10 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test @Test
public void testScatterContigIntervalsEnd() { public void testScatterContigIntervalsEnd() {
List<String> intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); List<String> intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5");
GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1");
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-2"); GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2");
GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeInterval("chr3:1-2"); GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2");
GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeInterval("chr3:4-5"); GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5");
List<File> files = testFiles("contig_split_end.", 3 ,".intervals"); List<File> files = testFiles("contig_split_end.", 3 ,".intervals");

View File

@ -121,7 +121,7 @@ class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ip
def parseGenomeInterval( s : String ) : GenomeLoc = { def parseGenomeInterval( s : String ) : GenomeLoc = {
val sp = s.split("\\s+") val sp = s.split("\\s+")
// todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted
if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn)
} }
object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") {

View File

@ -44,7 +44,7 @@ class VCFSimpleMerge extends InProcessFunction {
def genomeLoc(xrl : PeekableXRL, p : GenomeLocParser ) : GenomeLoc = { def genomeLoc(xrl : PeekableXRL, p : GenomeLocParser ) : GenomeLoc = {
var slp = xrl.peek.split("\t",3) var slp = xrl.peek.split("\t",3)
return p.parseGenomeLoc(slp(0),Integer.parseInt(slp(1)),Integer.parseInt(slp(1))) return p.createGenomeLoc(slp(0),Integer.parseInt(slp(1)),Integer.parseInt(slp(1)))
} }
def run = { def run = {

View File

@ -46,9 +46,9 @@ class GATKIntervalsUnitTest {
@Test @Test
def testWithIntervals() { def testWithIntervals() {
val chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1") val chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1")
val chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-3") val chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-3")
val chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:3-5") val chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:3-5")
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3)) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))