From e234589240583a593511b8e0579ab2d1c6acae3c Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 21 May 2011 02:01:59 +0000 Subject: [PATCH] Contracts for GenomeLocParser and GenomeLoc are now fully implemented. GenomeLocs can officially have any start/stop values from -Inf - +Inf. Bounds w.r.t. the reference are enforced, optionally, by GenomeLocParser. General code cleanup throughout the subsystem. All validation code for GLs is now centralized, and all I/O systems now validate their inputs. Because of this, the Picard interval processing code has been changed to examine whether an interval is valid, and only keep the valid intervals. Note that the scatter/gather test was changed, because the original hg18 chr20 interval files as actually malformed (all records for some reason where on chr20). Many interval processing routines were moved to IntervalUtils, as this is their natural home. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5830 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 4 - .../variantcontext/VariantContextUtils.java | 2 +- .../annotator/AnnotatorInputTableCodec.java | 4 +- .../refdata/features/beagle/BeagleCodec.java | 2 +- .../refdata/features/refseq/RefSeqCodec.java | 6 +- .../StringToGenomeLocIteratorAdapter.java | 2 +- .../gatk/walkers/indels/IndelRealigner.java | 2 +- .../indels/RealignerTargetCreator.java | 2 +- .../phasing/CalcFullHaplotypesWalker.java | 4 +- .../broadinstitute/sting/utils/GenomeLoc.java | 9 +- .../sting/utils/GenomeLocParser.java | 420 +++++++----------- .../sting/utils/GenomeLocSortedSet.java | 10 +- .../utils/MalformedGenomeLocException.java | 39 -- .../sting/utils/bed/BedParser.java | 2 +- .../sting/utils/interval/IntervalUtils.java | 38 +- .../gatk/GenomeAnalysisEngineUnitTest.java | 4 +- .../VariantJEXLContextUnitTest.java | 2 +- .../providers/LocusViewTemplate.java | 2 +- .../sting/utils/GenomeLocParserUnitTest.java | 30 +- .../utils/interval/IntervalUtilsUnitTest.java | 95 ++-- .../ipf/intervals/ExpandIntervals.scala | 2 +- .../library/ipf/vcf/VCFSimpleMerge.scala | 2 +- .../gatk/GATKIntervalsUnitTest.scala | 6 +- 23 files changed, 276 insertions(+), 413 deletions(-) delete mode 100644 java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 15c375369..af355ab03 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -655,10 +655,6 @@ public class GenomeAnalysisEngine { argCollection.unsafe == ValidationExclusion.TYPE.ALL); List nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList); - - genomeLocParser.validateGenomeLocList(nonRODIntervals); - genomeLocParser.validateGenomeLocList(rodIntervals); - List allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule); return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 212a09937..fe6495b64 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -795,7 +795,7 @@ public class VariantContextUtils { * @return the genomeLoc */ public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { - return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd()); + return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd(), true); } public abstract static class AlleleMergeRule { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java index 964d61be9..59cd14a22 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java @@ -93,7 +93,7 @@ public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec< GenomeLoc loc; String chr = st.nextToken(); if ( chr.indexOf(":") != -1 ) { - loc = genomeLocParser.parseGenomeInterval(chr); + loc = genomeLocParser.parseGenomeLoc(chr); } else { if ( st.countTokens() < 3 ) throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); @@ -123,7 +123,7 @@ public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec< GenomeLoc loc; if ( values.get(0).indexOf(":") != -1 ) - loc = genomeLocParser.parseGenomeInterval(values.get(0)); + loc = genomeLocParser.parseGenomeLoc(values.get(0)); else loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2))); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java index 580ed08a5..7f97451cf 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/beagle/BeagleCodec.java @@ -177,7 +177,7 @@ public class BeagleCodec implements ReferenceDependentFeatureCodec exon_frames = new ArrayList(eframes.length); for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(genomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); exon_frames.add(Integer.decode(eframes[i])); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java index 3405f6f92..101784d97 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/StringToGenomeLocIteratorAdapter.java @@ -84,7 +84,7 @@ public class StringToGenomeLocIteratorAdapter implements Iterator { public GenomeLoc next() { - if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeInterval( it.next() ); + if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeLoc(it.next()); return BedParser.parseLocation( genomeLocParser,it.next() ); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 052e6363b..484981e44 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -245,7 +245,7 @@ public class IndelRealigner extends ReadWalker { if (IntervalUtils.isIntervalFile(fileOrInterval)) { merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) ); } else { - rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeInterval(fileOrInterval)); + rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval)); } } if ( ! rawIntervals.isEmpty() ) merger.add(rawIntervals.iterator()); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 5f6a342ae..b536a0cb5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -275,7 +275,7 @@ public class RealignerTargetCreator extends RodWalker= 0 && eventStopPos - eventStartPos < maxIntervalSize; + return getToolkit().getGenomeLocParser().isValidGenomeLoc(loc.getContig(), eventStartPos, eventStopPos, true) && eventStopPos >= 0 && eventStopPos - eventStartPos < maxIntervalSize; } public String toString() { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java index e9e333d2e..4f8d84965 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/CalcFullHaplotypesWalker.java @@ -101,7 +101,7 @@ public class CalcFullHaplotypesWalker extends RodWalker { if (waitingHaplotype == null) {// changed to a new contig: // Set the new haplotype to extend from [1, prevPosition] if (prevPosition >= 1) { - GenomeLoc startInterval = getToolkit().getGenomeLocParser().parseGenomeLoc(curLocus.getContig(), 1, prevPosition); + GenomeLoc startInterval = getToolkit().getGenomeLocParser().createGenomeLoc(curLocus.getContig(), 1, prevPosition); waitingHaplotype = new Haplotype(startInterval, sampleHapEntry.getKey()); sampleHapEntry.setValue(waitingHaplotype); } @@ -201,7 +201,7 @@ public class CalcFullHaplotypesWalker extends RodWalker { public void extend(int stop) { if (stop > interval.getStop()) - interval = getToolkit().getGenomeLocParser().parseGenomeLoc(interval.getContig(), interval.getStart(), stop); + interval = getToolkit().getGenomeLocParser().createGenomeLoc(interval.getContig(), interval.getStart(), stop); } public void incrementHetCount() { diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index f94586183..a8b774986 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -13,7 +13,9 @@ import java.io.Serializable; * Date: Mar 2, 2009 * Time: 8:50:11 AM * - * Genome location representation. It is *** 1 *** based closed. + * Genome location representation. It is *** 1 *** based closed. Note that GenomeLocs start and stop values + * can be any positive or negative number, by design. Bound validation is a feature of the GenomeLocParser, + * and not a fundamental constraint of the GenomeLoc */ public class GenomeLoc implements Comparable, Serializable, HasGenomeLocation { /** @@ -48,7 +50,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome @Requires({ "contig != null", "contigIndex >= 0", // I believe we aren't allowed to create GenomeLocs without a valid contigIndex - "start >= 0 && start <= stop"}) + "start <= stop"}) protected GenomeLoc( final String contig, final int contigIndex, final int start, final int stop ) { this.contigName = contig; this.contigIndex = contigIndex; @@ -122,8 +124,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome */ @Requires({ "that != null", - "! isUnmapped(this)", - "! isUnmapped(that)"}) + "isUnmapped(this) == isUnmapped(that)"}) @Ensures("result != null") public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException { if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 42c17f104..732d52b2b 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -25,8 +25,6 @@ package org.broadinstitute.sting.utils; -import java.util.List; - import com.google.java.contract.*; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.SAMRecord; @@ -178,6 +176,21 @@ public class GenomeLocParser { } } + /** + * Determines whether the given contig is valid with respect to the sequence dictionary + * already installed in the GenomeLoc. + * + * @return True if the contig is valid. False otherwise. + */ + public boolean contigIsInDictionary(String contig) { + return contig != null && contigInfo.hasContig(contig); + } + + public boolean indexIsInDictionary(final int index) { + return index >= 0 && contigInfo.hasContig(index); + } + + /** * get the contig's SAMSequenceRecord * @@ -185,27 +198,14 @@ public class GenomeLocParser { * * @return the sam sequence record */ - @Requires({"contig != null", "contigIsInDictionary(contig)"}) @Ensures("result != null") + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"}) public SAMSequenceRecord getContigInfo(final String contig) { + if ( contig == null || ! contigIsInDictionary(contig) ) + throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); return contigInfo.getSequence(contig); } - /** - * Determines whether the given contig is valid with respect to the sequence dictionary - * already installed in the GenomeLoc. - * - * @return True if the contig is valid. False otherwise. - */ - @Requires({"contig != null"}) - public boolean contigIsInDictionary(String contig) { - return contigInfo.hasContig(contig); - } - - public boolean indexIsInDictionary(final int index) { - return contigInfo.hasContig(index); - } - /** * Returns the contig index of a specified string version of the contig * @@ -213,47 +213,126 @@ public class GenomeLocParser { * * @return the contig index, -1 if not found */ - @Requires("contig != null") @Ensures("result >= 0") + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"}) public int getContigIndex(final String contig) { - if (! contigInfo.hasContig(contig) ) - throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); - return contigInfo.getSequenceIndex(contig); + return getContigInfo(contig).getSequenceIndex(); } @Requires("contig != null") protected int getContigIndexWithoutException(final String contig) { - if (! contigInfo.hasContig(contig)) + if ( contig == null || ! contigInfo.hasContig(contig) ) return -1; return contigInfo.getSequenceIndex(contig); } + // -------------------------------------------------------------------------------------------------------------- + // + // Low-level creation functions + // + // -------------------------------------------------------------------------------------------------------------- + /** + * create a genome loc, given the contig name, start, and stop + * + * @param contig the contig name + * @param start the starting position + * @param stop the stop position + * + * @return a new genome loc + */ + @Ensures("result != null") + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop)"}) + public GenomeLoc createGenomeLoc(String contig, final int start, final int stop) { + return createGenomeLoc(contig, getContigIndex(contig), start, stop); + } + + public GenomeLoc createGenomeLoc(String contig, final int start, final int stop, boolean mustBeOnReference) { + return createGenomeLoc(contig, getContigIndex(contig), start, stop, mustBeOnReference); + } + + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop, false)"}) + public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop) { + return createGenomeLoc(contig, index, start, stop, false); + } + + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"}) + public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) { + validateGenomeLoc(contig, index, start, stop, mustBeOnReference, true); + return new GenomeLoc(contig, index, start, stop); + } + + /** + * validate a position or interval on the genome as valid + * + * Requires that contig exist in the master sequence dictionary, and that contig index be valid as well. Requires + * that start <= stop. + * + * if mustBeOnReference is true, + * performs boundary validation for genome loc INTERVALS: + * start and stop are on contig and start <= stop + * + * @param contig the contig name + * @param start the start position + * @param stop the stop position + * + * @return true if it's valid, false otherwise. If exceptOnError, then throws a UserException if invalid + */ + private boolean validateGenomeLoc(String contig, int contigIndex, int start, int stop, boolean mustBeOnReference, boolean exceptOnError) { + if ( ! contigInfo.hasContig(contig) ) + return vglHelper(exceptOnError, String.format("Unknown contig %s", contig)); + + if (stop < start) + return vglHelper(exceptOnError, String.format("The stop position %d is less than start %d", stop, start)); + + if (contigIndex < 0) + return vglHelper(exceptOnError, String.format("The contig index %d is less than 0", contigIndex)); + + if (contigIndex >= contigInfo.getNSequences()) + return vglHelper(exceptOnError, String.format("The contig index %d is greater than the stored sequence count (%d)", contigIndex, contigInfo.getNSequences())); + + if ( mustBeOnReference ) { + if (start < 0) + return vglHelper(exceptOnError, String.format("The start position %d is less than 0", start)); + + if (stop < 0) + return vglHelper(exceptOnError, String.format("The stop position %d is less than 0", stop)); + + int contigSize = contigInfo.getSequence(contigIndex).getSequenceLength(); + if (start > contigSize || stop > contigSize) + return vglHelper(exceptOnError, String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize)); + } + + // we passed + return true; + } + + public boolean isValidGenomeLoc(String contig, int start, int stop, boolean mustBeOnReference ) { + return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, mustBeOnReference, false); + } + + public boolean isValidGenomeLoc(String contig, int start, int stop ) { + return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, true, false); + } + + private boolean vglHelper(boolean exceptOnError, String msg) { + if ( exceptOnError ) + throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect:" + msg); + else + return false; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Parsing genome locs + // + // -------------------------------------------------------------------------------------------------------------- + /** * parse a genome interval, from a location string * * Performs interval-style validation: * - * contig is valid; start and stop less than the end; start <= sto - * @param str the string to parse - * - * @return a GenomeLoc representing the String - * - */ - @Requires("str != null") - @Ensures("result != null") - public GenomeLoc parseGenomeInterval(final String str) { - GenomeLoc ret = parseGenomeLoc(str); - exceptionOnInvalidGenomeLocBounds(ret); - return ret; - } - - /** - * parse a genome location, from a location string - * - * Performs read-style validation: - * checks that start and stop are positive, start < stop, and the contig is valid - * does not check that genomeLoc is actually on the contig - * + * contig is valid; start and stop less than the end; start <= stop, and start/stop are on the contig * @param str the string to parse * * @return a GenomeLoc representing the String @@ -296,22 +375,15 @@ public class GenomeLocParser { // is the contig valid? if (!contigIsInDictionary(contig)) - throw new UserException("Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?"); + throw new UserException.MalformedGenomeLoc("Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?"); if (stop == Integer.MAX_VALUE) // lookup the actually stop position! stop = getContigInfo(contig).getSequenceLength(); - GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig), start, stop); - exceptionOnInvalidGenomeLoc(locus); - return locus; + return createGenomeLoc(contig, getContigIndex(contig), start, stop, true); } - // -------------------------------------------------------------------------------------------------------------- - // - // Parsing string representations - // - // -------------------------------------------------------------------------------------------------------------- /** * Parses a number like 1,000,000 into a long. * @param pos @@ -342,49 +414,11 @@ public class GenomeLocParser { } } - /** - * Use this static constructor when the input data is under limited control (i.e. parsing user data). - * - * @param contig Contig to parse. - * @param start Starting point. - * @param stop Stop point. - * - * @return The genome location, or a MalformedGenomeLocException if unparseable. - * - * Validation: only checks that contig is valid - * start/stop could be anything - */ - public GenomeLoc parseGenomeLoc(final String contig, int start, int stop) { - if (!contigIsInDictionary(contig)) - throw new MalformedGenomeLocException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?"); - return new GenomeLoc(contig, getContigIndex(contig), start, stop); - } - - /** - * get the sequence name from a sequence index - * - * @param contigIndex get the contig index - * - * @return the string that represents that contig name - */ - @Requires("indexIsInDictionary(contigIndex)") - @Ensures("result != null") - private String getSequenceNameFromIndex(int contigIndex) { - return contigInfo.getSequence(contigIndex).getSequenceName(); - } - - /** - * create a genome loc, given the contig name, start, and stop - * - * @param contig the contig name - * @param start the starting position - * @param stop the stop position - * - * @return a new genome loc - */ - public GenomeLoc createGenomeLoc(String contig, final int start, final int stop) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), start, stop)); - } + // -------------------------------------------------------------------------------------------------------------- + // + // Parsing string representations + // + // -------------------------------------------------------------------------------------------------------------- /** * create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position, @@ -394,183 +428,44 @@ public class GenomeLocParser { * * @return */ + @Requires("read != null") + @Ensures("result != null") public GenomeLoc createGenomeLoc(final SAMRecord read) { - GenomeLoc loc; - if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == -1 ) // read is unmapped and not placed anywhere on the genome - loc = GenomeLoc.UNMAPPED; + return GenomeLoc.UNMAPPED; else { int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : read.getAlignmentEnd(); - loc = new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end); + return createGenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end, false); } - - return exceptionOnInvalidGenomeLoc(loc); } /** - * create a new genome loc, given the contig name, and a single position + * create a new genome loc, given the contig name, and a single position. Must be on the reference * * @param contig the contig name * @param pos the postion * * @return a genome loc representing a single base at the specified postion on the contig */ + @Ensures("result != null") + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"}) public GenomeLoc createGenomeLoc(final String contig, final int pos) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), pos, pos)); + return createGenomeLoc(contig, getContigIndex(contig), pos, pos); } - /** - * Creates a new GenomeLoc without performing any validation on its contig or bounds. - * FOR UNIT TESTING PURPOSES ONLY! - * - * @param contig the contig name - * @param start start position of the interval - * @param stop stop position of the interval - * @return a new GenomeLoc representing the specified location - */ - public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { - return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop); - } - - /** - * verify the specified genome loc is valid, if it's not, throw an exception - * Will not verify the location against contig bounds. - * - * - * Validation: - * checks that start and stop are positive, start < stop, and the contig is valid - * does not check that genomeLoc is actually on the contig, so start could be > end of contig - * - * @param toReturn the genome loc we're about to return - * - * @return the genome loc if it's valid, otherwise we throw an exception - * - */ - private GenomeLoc exceptionOnInvalidGenomeLoc(GenomeLoc toReturn) { - if ( GenomeLoc.isUnmapped(toReturn) ) { - return toReturn; - } - - if (toReturn.getStart() < 0) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the start position is less than 0 " + - "in interval: " + toReturn); - } - if ((toReturn.getStop() != -1) && (toReturn.getStop() < 0)) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the stop position is less than 0 " + - "in interval: " + toReturn); - } - if (toReturn.getContigIndex() < 0) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is less than 0 " + - "in interval: " + toReturn); - } - if (toReturn.getContigIndex() >= contigInfo.getNSequences()) { - throw new ReviewedStingException("Parameters to GenomeLocParser are incorrect: the contig index is greater than " + - "the stored sequence count in interval: " + toReturn); - - } - return toReturn; - - } - - /** - * Verify the locus against the bounds of the contig. - * - * performs boundary validation for genome loc INTERVALS: - * start and stop are on contig and start <= stop - * does NOT check that start and stop > 0, or that contig is valid - * for that reason, this function should only be called AFTER exceptionOnInvalidGenomeLoc() - * exceptionOnInvalidGenomeLoc isn't included in this function to save time - * - * @param locus Locus to verify. - */ - private void exceptionOnInvalidGenomeLocBounds(GenomeLoc locus) { - if ( GenomeLoc.isUnmapped(locus) ) { - return; - } - - int contigSize = contigInfo.getSequence(locus.getContigIndex()).getSequenceLength(); - if(locus.getStart() > contigSize) - throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus start is after the end of contig",locus); - if(locus.getStop() > contigSize) - throw new UserException.MalformedGenomeLoc("GenomeLoc is invalid: locus stop is after the end of contig",locus); - if (locus.getStart() > locus.getStop()) { - throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect: the start position is " + - "greater than the end position", locus); - } - } - - /** - * Validates each of the GenomeLocs in the list passed as an argument: each GenomeLoc must refer to a - * valid contig, and its bounds must be within the bounds of its contig. Throws a MalformedGenomeLoc - * exception if an invalid GenomeLoc is found. - * - * @param genomeLocList The list of GenomeLocs to validate - */ - public void validateGenomeLocList( List genomeLocList ) { - if ( genomeLocList == null ) { - return; - } - - for ( GenomeLoc loc : genomeLocList ) { - try { - exceptionOnInvalidGenomeLoc(loc); - exceptionOnInvalidGenomeLocBounds(loc); - } - catch ( UserException e ) { - throw e; - } - catch ( ReviewedStingException e ) { - throw new UserException.MalformedGenomeLoc(e.getMessage()); - } - } - } - - /** - * validate a position or interval on the genome as valid - * - * @param contig the contig name - * @param start the start position - * @param stop the stop position - * - * @return true if it's valid, false otherwise - * - * performs interval-style validation: contig is valid and atart and stop less than the end - */ - public boolean validGenomeLoc(String contig, int start, int stop) { - if ( ! contigInfo.hasContig(contig) ) - return false; - else - return validGenomeLoc(getContigIndex(contig), start, stop); - } - - /** - * validate a position or interval on the genome as valid - * - * @param contigIndex the contig name - * @param start the start position - * @param stop the stop position - * - * @return true if it's valid, false otherwise - * - * performs interval-style validation: contig is valid and atart and stop less than the end - */ - public boolean validGenomeLoc(int contigIndex, int start, int stop) { - // quick check before we get the contig size, is the contig number valid - if ((contigIndex < 0) || // the contig index has to be positive - (contigIndex >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs) - return false; - - int contigSize = contigInfo.getSequence(contigIndex).getSequenceLength(); - if ((start < 0) || // start must be greater than 0 - (stop < 0 ) || // the stop must be greater than 0 - (start > contigSize) || // the start must be before or equal to the contig end - (stop > contigSize)) // the stop must also be before or equal to the contig end - return false; - - // we passed - return true; - } +// /** +// * Creates a new GenomeLoc without performing any validation on its contig or bounds. +// * FOR UNIT TESTING PURPOSES ONLY! +// * +// * @param contig the contig name +// * @param start start position of the interval +// * @param stop stop position of the interval +// * @return a new GenomeLoc representing the specified location +// */ +// public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { +// return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop); +// } /** * create a new genome loc from an existing loc, with a new start position @@ -579,12 +474,11 @@ public class GenomeLocParser { * * @param loc the old location * @param start a new start position - * TODO -- move to me GenomeLoc class itself * * @return the newly created genome loc */ public GenomeLoc setStart(GenomeLoc loc, int start) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop())); + return createGenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); } /** @@ -595,11 +489,10 @@ public class GenomeLocParser { * @param loc the old location * @param stop a new stop position * - * TODO -- move to me GenomeLoc class itself * @return */ public GenomeLoc setStop(GenomeLoc loc, int stop) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop)); + return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); } /** @@ -607,7 +500,6 @@ public class GenomeLocParser { * * @param loc the old location * - * TODO -- move to me GenomeLoc class itself * @return a new genome loc */ public GenomeLoc incPos(GenomeLoc loc) { @@ -620,11 +512,10 @@ public class GenomeLocParser { * @param loc the old location * @param by how much to move the start and stop by * - * TODO -- move to me GenomeLoc class itself * @return a new genome loc */ public GenomeLoc incPos(GenomeLoc loc, int by) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by)); + return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by); } /** @@ -636,6 +527,7 @@ public class GenomeLocParser { @Ensures("result != null") public GenomeLoc createOverEntireContig(String contigName) { SAMSequenceRecord contig = contigInfo.getSequence(contigName); - return exceptionOnInvalidGenomeLoc(new GenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength())); - } + return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); + } + } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index efcc27cf2..fd7a79f48 100755 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -253,12 +253,14 @@ public class GenomeLocSortedSet extends AbstractSet { * |------| + |--------| * */ - GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1); - GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop()); - if (after.getStop() - after.getStart() >= 0) { + int afterStop = g.getStop(), afterStart = e.getStop() + 1; + int beforeStop = e.getStart() - 1, beforeStart = g.getStart(); + if (afterStop - afterStart >= 0) { + GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop); l.add(after); } - if (before.getStop() - before.getStart() >= 0) { + if (beforeStop - beforeStart >= 0) { + GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop); l.add(before); } diff --git a/java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java b/java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java deleted file mode 100644 index ce554397c..000000000 --- a/java/src/org/broadinstitute/sting/utils/MalformedGenomeLocException.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.broadinstitute.sting.utils; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -/** - * User: hanna - * Date: Jun 2, 2009 - * Time: 11:43:48 AM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Indicates that something was wrong with in the parameters passed to create a GenomeLoc... - * bad sequence id out of bounds, etc. - */ - -public class MalformedGenomeLocException extends ReviewedStingException { - /** - * Create a new MalformedGenomeLocException with the given message. Does not preserve the existing stack trace. - * @param message The message. - */ - public MalformedGenomeLocException( String message ) { - super(message); - } - - /** - * Create a new MalformedGenomeLocException with the given message and root cause. - * @param message The message. - * @param t The root cause. - */ - public MalformedGenomeLocException( String message, Throwable t ) { - super(message,t); - } -} diff --git a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java b/java/src/org/broadinstitute/sting/utils/bed/BedParser.java index b7ca11f9c..abcae066f 100644 --- a/java/src/org/broadinstitute/sting/utils/bed/BedParser.java +++ b/java/src/org/broadinstitute/sting/utils/bed/BedParser.java @@ -88,7 +88,7 @@ public class BedParser { } // we currently drop the rest of the bed record, which can contain names, scores, etc - return genomeLocParser.createGenomeLoc(contig, start, stop); + return genomeLocParser.createGenomeLoc(contig, start, stop, true); } diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index fea7d08d8..80dc35455 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLoc; @@ -24,6 +25,8 @@ import java.io.File; * @version 0.1 */ public class IntervalUtils { + private static Logger logger = Logger.getLogger(IntervalUtils.class); + /** * Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files, * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally, @@ -70,7 +73,7 @@ public class IntervalUtils { // otherwise treat as an interval -> parse and add to raw interval list else { - rawIntervals.add(parser.parseGenomeInterval(fileOrInterval)); + rawIntervals.add(parser.parseGenomeLoc(fileOrInterval)); } } } @@ -103,28 +106,41 @@ public class IntervalUtils { * first try to read it as a Picard interval file since that's well structured * we'll fail quickly if it's not a valid file. */ + boolean isPicardInterval = false; try { // Note: Picard will skip over intervals with contigs not in the sequence dictionary IntervalList il = IntervalList.fromFile(inputFile); + isPicardInterval = true; + int nInvalidIntervals = 0; for (Interval interval : il.getIntervals()) { - ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd())); + if ( glParser.isValidGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)) + ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)); + else { + nInvalidIntervals++; + } } + if ( nInvalidIntervals > 0 ) + logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile); } // if that didn't work, try parsing file as a GATK interval file catch (Exception e) { - try { - XReadLines reader = new XReadLines(new File(file_name)); - for(String line: reader) { - if ( line.trim().length() > 0 ) { - ret.add(glParser.parseGenomeInterval(line)); + if ( isPicardInterval ) // definitely a picard file, but we failed to parse + throw new UserException.CouldNotReadInputFile(inputFile, e); + else { + try { + XReadLines reader = new XReadLines(new File(file_name)); + for(String line: reader) { + if ( line.trim().length() > 0 ) { + ret.add(glParser.parseGenomeLoc(line)); + } } + reader.close(); + } + catch (IOException e2) { + throw new UserException.CouldNotReadInputFile(inputFile, e2); } - reader.close(); - } - catch (IOException e2) { - throw new UserException.CouldNotReadInputFile(inputFile, e2); } } } diff --git a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index e4a714e4a..a17c162f5 100644 --- a/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -113,7 +113,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { String contig, int intervalStart, int intervalEnd ) throws Exception { List intervalArgs = new ArrayList(); - List rodIntervals = Arrays.asList(genomeLocParser.createGenomeLocWithoutValidation(contig, intervalStart, intervalEnd)); + List rodIntervals = Arrays.asList(genomeLocParser.createGenomeLoc(contig, intervalStart, intervalEnd, true)); testEngine.loadIntervals(intervalArgs, rodIntervals); } @@ -124,7 +124,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { // We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd, // since the ending point is an open interval. File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed", - String.format("%s %d %d", contig, intervalStart - 1, intervalEnd)); + String.format("%s %d %d", contig, intervalStart -1, intervalEnd)); List intervalArgs = Arrays.asList(bedFile.getAbsolutePath()); List rodIntervals = new ArrayList(); diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantJEXLContextUnitTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantJEXLContextUnitTest.java index 6fce044c9..4de413334 100644 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantJEXLContextUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantJEXLContextUnitTest.java @@ -80,7 +80,7 @@ public class VariantJEXLContextUnitTest extends BaseTest { } catch (Exception e) { Assert.fail("Unable to create expression" + e.getMessage()); } - snpLoc = genomeLocParser.createGenomeLoc("chr1", 10, 10); + snpLoc = genomeLocParser.createGenomeLoc("chr1", 10, 10, true); } @BeforeMethod diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index 372018365..acfefd627 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -275,7 +275,7 @@ public abstract class LocusViewTemplate extends BaseTest { private static ReferenceSequenceFile fakeReferenceSequenceFile() { return new ReferenceSequenceFile() { public SAMSequenceDictionary getSequenceDictionary() { - SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1"); + SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1", 1000000); SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord)); return dictionary; } diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index 99653094d..f1f849bf5 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -29,7 +29,7 @@ public class GenomeLocParserUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - @Test(expectedExceptions=RuntimeException.class) + @Test(expectedExceptions=UserException.MalformedGenomeLoc.class) public void testGetContigIndex() { assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference } @@ -78,9 +78,9 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test public void testParseGoodString() { - GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-100"); + GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-10"); assertEquals(0, loc.getContigIndex()); - assertEquals(loc.getStop(), 100); + assertEquals(loc.getStop(), 10); assertEquals(loc.getStart(), 1); } @@ -165,7 +165,7 @@ public class GenomeLocParserUnitTest extends BaseTest { assertEquals(loc.getStart(), 1); } - @Test(expectedExceptions=ReviewedStingException.class) + @Test(expectedExceptions=UserException.class) public void testGenomeLocBad2() { GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-500-0"); assertEquals(loc.getContigIndex(), 0); @@ -173,7 +173,7 @@ public class GenomeLocParserUnitTest extends BaseTest { assertEquals(loc.getStart(), 1); } - @Test(expectedExceptions=ReviewedStingException.class) + @Test(expectedExceptions=UserException.class) public void testGenomeLocBad3() { GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1--0"); assertEquals(loc.getContigIndex(), 0); @@ -184,19 +184,11 @@ public class GenomeLocParserUnitTest extends BaseTest { // test out the validating methods @Test public void testValidationOfGenomeLocs() { - assertTrue(genomeLocParser.validGenomeLoc("chr1",1,1)); - assertTrue(!genomeLocParser.validGenomeLoc("chr2",1,1)); // shouldn't have an entry - assertTrue(!genomeLocParser.validGenomeLoc("chr1",1,11)); // past the end of the contig - assertTrue(!genomeLocParser.validGenomeLoc("chr1",-1,10)); // bad start - assertTrue(!genomeLocParser.validGenomeLoc("chr1",1,-2)); // bad stop - assertTrue(!genomeLocParser.validGenomeLoc("chr1",10,11)); // bad start, past end - - assertTrue(genomeLocParser.validGenomeLoc(0,1,1)); - assertTrue(!genomeLocParser.validGenomeLoc(1,1,1)); // shouldn't have an entry - assertTrue(!genomeLocParser.validGenomeLoc(0,1,11)); // past the end of the contig - assertTrue(!genomeLocParser.validGenomeLoc(-1,0,10)); // bad start - assertTrue(!genomeLocParser.validGenomeLoc(0,1,-2)); // bad stop - assertTrue(!genomeLocParser.validGenomeLoc(0,10,11)); // bad start, past end - + assertTrue(genomeLocParser.isValidGenomeLoc("chr1",1,1)); + assertTrue(!genomeLocParser.isValidGenomeLoc("chr2",1,1)); // shouldn't have an entry + assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig + assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start + assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop + assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end } } diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bd2fbb357..bb892eec8 100644 --- a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -138,7 +138,7 @@ public class IntervalUtilsUnitTest extends BaseTest { return hg18ReferenceLocs; List locs = new ArrayList(); for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeInterval(interval)); + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); return locs; } @@ -167,9 +167,9 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testFixedScatterIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); List files = testFiles("basic.", 3, ".intervals"); @@ -192,10 +192,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeInterval("chr4"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); List files = testFiles("less.", 3, ".intervals"); @@ -234,10 +234,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsStart() { List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeInterval("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeInterval("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); List files = testFiles("split.", 3, ".intervals"); @@ -262,10 +262,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsMiddle() { List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeInterval("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeInterval("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); List files = testFiles("split.", 3, ".intervals"); @@ -290,10 +290,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsEnd() { List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeInterval("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeInterval("chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); List files = testFiles("split.", 3, ".intervals"); @@ -322,10 +322,13 @@ public class IntervalUtilsUnitTest extends BaseTest { List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); int[] counts = { - 5169, 5573, 10017, 10567, 10551, - 5087, 4908, 10120, 10435, 10399, - 5391, 4735, 10621, 10352, 10654, - 5227, 5256, 10151, 9649, 9825 + 125, 138, 287, 291, 312, 105, 155, 324, + 295, 298, 141, 121, 285, 302, 282, 88, + 116, 274, 282, 248 +// 5169, 5573, 10017, 10567, 10551, +// 5087, 4908, 10120, 10435, 10399, +// 5391, 4735, 10621, 10352, 10654, +// 5227, 5256, 10151, 9649, 9825 }; //String splitCounts = ""; @@ -368,9 +371,9 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterContigIntervalsOrder() { List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); List files = testFiles("split.", 3, ".intervals"); @@ -391,9 +394,9 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterContigIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); List files = testFiles("contig_basic.", 3, ".intervals"); @@ -414,10 +417,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterContigIntervalsLessFiles() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeInterval("chr4"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); List files = testFiles("contig_less.", 3, ".intervals"); @@ -446,10 +449,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterContigIntervalsStart() { List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeInterval("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeInterval("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); List files = testFiles("contig_split_start.", 3, ".intervals"); @@ -472,10 +475,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterContigIntervalsMiddle() { List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeInterval("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeInterval("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); List files = testFiles("contig_split_middle.", 3, ".intervals"); @@ -498,10 +501,10 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterContigIntervalsEnd() { List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeInterval("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeInterval("chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); List files = testFiles("contig_split_end.", 3 ,".intervals"); diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala index 8ae8b7192..77eb3ccbc 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala @@ -121,7 +121,7 @@ class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ip def parseGenomeInterval( s : String ) : GenomeLoc = { val sp = s.split("\\s+") // todo -- maybe specify whether the bed format [0,6) --> (1,2,3,4,5) is what's wanted - if ( s.contains(":") ) parser.parseGenomeInterval(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) + if ( s.contains(":") ) parser.parseGenomeLoc(s) else parser.createGenomeLoc(sp(0),sp(1).toInt+offsetIn,sp(2).toInt-offsetIn) } object IntervalFormatType extends Enumeration("INTERVALS","BED","TDF") { diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala index c193b63e8..a3c493179 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala @@ -44,7 +44,7 @@ class VCFSimpleMerge extends InProcessFunction { def genomeLoc(xrl : PeekableXRL, p : GenomeLocParser ) : GenomeLoc = { var slp = xrl.peek.split("\t",3) - return p.parseGenomeLoc(slp(0),Integer.parseInt(slp(1)),Integer.parseInt(slp(1))) + return p.createGenomeLoc(slp(0),Integer.parseInt(slp(1)),Integer.parseInt(slp(1))) } def run = { diff --git a/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala b/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala index 52720b940..b3a2d23ae 100644 --- a/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala +++ b/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala @@ -46,9 +46,9 @@ class GATKIntervalsUnitTest { @Test def testWithIntervals() { - val chr1 = hg18GenomeLocParser.parseGenomeInterval("chr1:1-1") - val chr2 = hg18GenomeLocParser.parseGenomeInterval("chr2:2-3") - val chr3 = hg18GenomeLocParser.parseGenomeInterval("chr3:3-5") + val chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1") + val chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-3") + val chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:3-5") val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5")) Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))