diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 154250d01..15c375369 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk; -import net.sf.picard.filter.SamRecordFilter; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.*; @@ -622,7 +621,7 @@ public class GenomeAnalysisEngine { // if include argument isn't given, create new set of all possible intervals GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, genomeLocParser.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging))); + loadIntervals(argCollection.intervals, IntervalUtils.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging))); // if no exclude arguments, can return parseIntervalArguments directly if (argCollection.excludeIntervals == null) diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java index bc45cc75b..aa376a12a 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/GenomeLocusIterator.java @@ -64,7 +64,7 @@ public class GenomeLocusIterator implements Iterator { public GenomeLoc next() { if( !hasNext() ) throw new NoSuchElementException("No elements remaining in bounded reference region."); - GenomeLoc toReturn = (GenomeLoc)currentLocus.clone(); + GenomeLoc toReturn = currentLocus; currentLocus = parser.incPos(currentLocus); return toReturn; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java index c0d24064d..b3cb22a03 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SeekableRODIterator.java @@ -356,7 +356,7 @@ public class SeekableRODIterator implements LocationAwareSeekableRODIterator { } if ( records.size() > 0 ) { - return new RODRecordListImpl(name,records,interval.clone()); + return new RODRecordListImpl(name,records,interval); } else { return null; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 56b37066c..8bfb10014 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -304,7 +304,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (!ref.getLocus().equals(lastSiteVisited)) { // starting a new site: clear allele list alleleList.clear(); - lastSiteVisited = ref.getLocus().clone(); + lastSiteVisited = ref.getLocus(); indelLikelihoodMap.clear(); haplotypeMap.clear(); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java index bc03d2ffc..3cc63ca5c 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/CNV/PrintIntervalsNotInBedWalker.java @@ -83,7 +83,7 @@ public class PrintIntervalsNotInBedWalker extends RodWalker { } else { printed += printWaitingIntervalAsBed(); - waitingInterval = ref.getLocus().clone(); + waitingInterval = ref.getLocus(); } } else { diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index a36248dfa..f94586183 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.List; import java.io.Serializable; /** @@ -14,10 +14,8 @@ import java.io.Serializable; * Time: 8:50:11 AM * * Genome location representation. It is *** 1 *** based closed. - * - * */ -public class GenomeLoc implements Comparable, Cloneable, Serializable, HasGenomeLocation { +public class GenomeLoc implements Comparable, Serializable, HasGenomeLocation { /** * the basic components of a genome loc, its contig index, * start and stop position, and (optionally) the contig name @@ -34,11 +32,12 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable * in comparators, etc. */ // TODO - WARNING WARNING WARNING code somehow depends on the name of the contig being null! - public static final GenomeLoc UNMAPPED = new GenomeLoc(null,-1,0,0); + public static final GenomeLoc UNMAPPED = new GenomeLoc((String)null); + public static final GenomeLoc WHOLE_GENOME = new GenomeLoc("all"); + public static final boolean isUnmapped(GenomeLoc loc) { return loc == UNMAPPED; } - public static final GenomeLoc WHOLE_GENOME = new GenomeLoc("all",-1,0,0); // -------------------------------------------------------------------------------------------------------------- // @@ -46,10 +45,10 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable // // -------------------------------------------------------------------------------------------------------------- - protected GenomeLoc(final SAMRecord read) { - this(read.getHeader().getSequence(read.getReferenceIndex()).getSequenceName(), read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd()); - } - + @Requires({ + "contig != null", + "contigIndex >= 0", // I believe we aren't allowed to create GenomeLocs without a valid contigIndex + "start >= 0 && start <= stop"}) protected GenomeLoc( final String contig, final int contigIndex, final int start, final int stop ) { this.contigName = contig; this.contigIndex = contigIndex; @@ -57,20 +56,23 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable this.stop = stop; } - /** - * Return a new GenomeLoc at this same position. - * @return A GenomeLoc with the same contents as the current loc. - */ - @Override - public GenomeLoc clone() { - return new GenomeLoc(getContig(),getContigIndex(),getStart(),getStop()); + /** Unsafe constructor for special constant genome locs */ + private GenomeLoc( final String contig ) { + this.contigName = contig; + this.contigIndex = -1; + this.start = 0; + this.stop = 0; } // - // Accessors and setters + // Accessors // + @Ensures("result != null") public final GenomeLoc getLocation() { return this; } + /** + * @return the name of the contig of this GenomeLoc + */ public final String getContig() { return this.contigName; } @@ -78,6 +80,8 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable public final int getContigIndex() { return this.contigIndex; } public final int getStart() { return this.start; } public final int getStop() { return this.stop; } + + @Ensures("result != null") public final String toString() { if(GenomeLoc.isUnmapped(this)) return "unmapped"; if ( throughEndOfContigP() && atBeginningOfContigP() ) @@ -87,25 +91,40 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable else return String.format("%s:%d-%d", getContig(), getStart(), getStop()); } - private boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } - private boolean atBeginningOfContigP() { return this.start == 1; } + private boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; } + + private boolean atBeginningOfContigP() { return this.start == 1; } + + @Requires("that != null") public final boolean disjointP(GenomeLoc that) { return this.contigIndex != that.contigIndex || this.start > that.stop || that.start > this.stop; } + @Requires("that != null") public final boolean discontinuousP(GenomeLoc that) { return this.contigIndex != that.contigIndex || (this.start - 1) > that.stop || (that.start - 1) > this.stop; } + @Requires("that != null") public final boolean overlapsP(GenomeLoc that) { return ! disjointP( that ); } + @Requires("that != null") public final boolean contiguousP(GenomeLoc that) { return ! discontinuousP( that ); } + /** + * Returns a new GenomeLoc that represents the entire span of this and that. Requires that + * this and that GenomeLoc are contiguous and both mapped + */ + @Requires({ + "that != null", + "! isUnmapped(this)", + "! isUnmapped(that)"}) + @Ensures("result != null") public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException { if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) @@ -133,6 +152,8 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) }; } + @Requires("that != null") + @Ensures("result != null") public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException { if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) { if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that)) @@ -149,25 +170,31 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable Math.min( getStop(), that.getStop()) ); } + @Requires("that != null") public final boolean containsP(GenomeLoc that) { return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop(); } + @Requires("that != null") public final boolean onSameContig(GenomeLoc that) { return (this.contigIndex == that.contigIndex); } + @Requires("that != null") public final int minus( final GenomeLoc that ) { if ( this.onSameContig(that) ) - return (int) (this.getStart() - that.getStart()); + return this.getStart() - that.getStart(); else return Integer.MAX_VALUE; } + @Requires("that != null") + @Ensures("result >= 0") public final int distance( final GenomeLoc that ) { return Math.abs(minus(that)); } + @Requires({"left != null", "right != null"}) public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) { return this.compareTo(left) > -1 && this.compareTo(right) < 1; } @@ -177,6 +204,7 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable * @param that Contig to test against. * @return true if this contig ends before 'that' starts; false if this is completely after or overlaps 'that'. */ + @Requires("that != null") public final boolean isBefore( GenomeLoc that ) { int comparison = this.compareContigs(that); return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); @@ -187,6 +215,7 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable * @param that Other contig to test. * @return True if the start of this contig is before the start of the that contig. */ + @Requires("that != null") public final boolean startsBefore(final GenomeLoc that) { int comparison = this.compareContigs(that); return ( comparison == -1 || ( comparison == 0 && this.getStart() < that.getStart() )); @@ -197,12 +226,17 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable * @param that Contig to test against. * @return true if this contig starts after 'that' ends; false if this is completely before or overlaps 'that'. */ + @Requires("that != null") public final boolean isPast( GenomeLoc that ) { int comparison = this.compareContigs(that); return ( comparison == 1 || ( comparison == 0 && this.getStart() > that.getStop() )); } - // Return the minimum distance between any pair of bases in this and that GenomeLocs: + /** + * Return the minimum distance between any pair of bases in this and that GenomeLocs: + */ + @Requires("that != null") + @Ensures("result >= 0") public final int minDistance( final GenomeLoc that ) { if (!this.onSameContig(that)) return Integer.MAX_VALUE; @@ -218,8 +252,14 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable return minDistance; } + @Requires({ + "locFirst != null", + "locSecond != null", + "locSecond.isPast(locFirst)" + }) + @Ensures("result >= 0") private static int distanceFirstStopToSecondStart(GenomeLoc locFirst, GenomeLoc locSecond) { - return (int) (locSecond.getStart() - locFirst.getStop()); + return locSecond.getStart() - locFirst.getStop(); } @@ -253,6 +293,8 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable * @param that the genome loc to compare contigs with * @return 0 if equal, -1 if that.contig is greater, 1 if this.contig is greater */ + @Requires("that != null") + @Ensures("result == 0 || result == 1 || result == -1") public final int compareContigs( GenomeLoc that ) { if (this.contigIndex == that.contigIndex) return 0; @@ -261,8 +303,11 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable return -1; } + @Requires("that != null") + @Ensures("result == 0 || result == 1 || result == -1") public int compareTo( GenomeLoc that ) { int result = 0; + if ( this == that ) { result = 0; } @@ -279,14 +324,8 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable if ( this.getStart() < that.getStart() ) result = -1; if ( this.getStart() > that.getStart() ) result = 1; } - - // TODO: and error is being thrown because we are treating reads with the same start positions - // but different stop as out of order - //if ( this.getStop() < that.getStop() ) return -1; - //if ( this.getStop() > that.getStop() ) return 1; } - //System.out.printf("this vs. that = %s %s => %d%n", this, that, result); return result; } @@ -295,6 +334,7 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable * @return Number of BPs covered by this locus. According to the semantics of GenomeLoc, this should * never be < 1. */ + @Ensures("result > 0") public long size() { return stop - start + 1; } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 22d70ab5e..42c17f104 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -25,35 +25,24 @@ package org.broadinstitute.sting.utils; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; import java.util.List; +import com.google.java.contract.*; import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.util.Interval; -import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.utils.bed.BedParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.text.XReadLines; /** - * Created by IntelliJ IDEA. - * User: aaron - * Date: Jun 18, 2009 - * Time: 11:17:01 PM - * To change this template use File | Settings | File Templates. + * Factory class for creating GenomeLocs */ +@Invariant({ + "logger != null", + "contigInfo != null"}) public class GenomeLocParser { private static Logger logger = Logger.getLogger(GenomeLocParser.class); @@ -62,11 +51,19 @@ public class GenomeLocParser { // Ugly global variable defining the optional ordering of contig elements // // -------------------------------------------------------------------------------------------------------------- + private final MasterSequenceDictionary contigInfo; /** * A wrapper class that provides efficient last used caching for the global * SAMSequenceDictionary underlying all of the GATK engine capabilities */ + // todo -- enable when CoFoJa developers identify the problem (likely thread unsafe invariants) +// @Invariant({ +// "dict != null", +// "dict.size() > 0", +// "lastSSR == null || dict.getSequence(lastContig).getSequenceIndex() == lastIndex", +// "lastSSR == null || dict.getSequence(lastContig).getSequenceName() == lastContig", +// "lastSSR == null || dict.getSequence(lastContig) == lastSSR"}) private final class MasterSequenceDictionary { final private SAMSequenceDictionary dict; @@ -75,45 +72,61 @@ public class GenomeLocParser { String lastContig = ""; int lastIndex = -1; + @Requires({"dict != null", "dict.size() > 0"}) public MasterSequenceDictionary(SAMSequenceDictionary dict) { this.dict = dict; } + @Ensures("result > 0") public final int getNSequences() { return dict.size(); } + @Requires("contig != null") + public synchronized boolean hasContig(final String contig) { + return lastContig == contig || dict.getSequence(contig) != null; + } + + @Requires("index >= 0") + public synchronized boolean hasContig(final int index) { + return lastIndex == index|| dict.getSequence(index) != null; + } + + @Requires("contig != null") + @Ensures("result != null") public synchronized final SAMSequenceRecord getSequence(final String contig) { if ( isCached(contig) ) return lastSSR; else - return updateCache(dict.getSequence(contig)); + return updateCache(contig, -1); } + @Requires("index >= 0") + @Ensures("result != null") public synchronized final SAMSequenceRecord getSequence(final int index) { if ( isCached(index) ) return lastSSR; else - return updateCache(dict.getSequence(index)); + return updateCache(null, index); } + @Requires("contig != null") + @Ensures("result >= 0") public synchronized final int getSequenceIndex(final String contig) { if ( ! isCached(contig) ) { - SAMSequenceRecord rec = dict.getSequence(contig); - if ( rec == null ) - return -1; // not found - else - updateCache(rec); + updateCache(contig, -1); } return lastIndex; } + @Requires({"contig != null", "lastContig != null"}) private synchronized boolean isCached(final String contig) { return lastContig.equals(contig); } + @Requires({"lastIndex != -1", "index >= 0"}) private synchronized boolean isCached(final int index) { return lastIndex == index; } @@ -122,12 +135,16 @@ public class GenomeLocParser { * The key algorithm. Given a new record, update the last used record, contig * name, and index. * - * @param rec + * @param contig + * @param index * @return */ - private synchronized SAMSequenceRecord updateCache(SAMSequenceRecord rec) { + @Requires("contig != null || index >= 0") + @Ensures("result != null") + private synchronized SAMSequenceRecord updateCache(final String contig, int index ) { + SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); if ( rec == null ) { - return null; + throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); } else { lastSSR = rec; lastContig = rec.getSequenceName(); @@ -135,14 +152,15 @@ public class GenomeLocParser { return rec; } } - } - private MasterSequenceDictionary contigInfo = null; + + } /** * set our internal reference contig order * @param refFile the reference file */ + @Requires("refFile != null") public GenomeLocParser(final ReferenceSequenceFile refFile) { this(refFile.getSequenceDictionary()); } @@ -151,12 +169,12 @@ public class GenomeLocParser { if (seqDict == null) { // we couldn't load the reference dictionary //logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs"); throw new UserException.CommandLineException("Failed to load reference dictionary"); - } else if (contigInfo == null) { - contigInfo = new MasterSequenceDictionary(seqDict); - logger.debug(String.format("Prepared reference sequence contig dictionary")); - for (SAMSequenceRecord contig : seqDict.getSequences()) { - logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); - } + } + + contigInfo = new MasterSequenceDictionary(seqDict); + logger.debug(String.format("Prepared reference sequence contig dictionary")); + for (SAMSequenceRecord contig : seqDict.getSequences()) { + logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); } } @@ -167,23 +185,47 @@ public class GenomeLocParser { * * @return the sam sequence record */ + @Requires({"contig != null", "contigIsInDictionary(contig)"}) + @Ensures("result != null") public SAMSequenceRecord getContigInfo(final String contig) { return contigInfo.getSequence(contig); } + /** + * Determines whether the given contig is valid with respect to the sequence dictionary + * already installed in the GenomeLoc. + * + * @return True if the contig is valid. False otherwise. + */ + @Requires({"contig != null"}) + public boolean contigIsInDictionary(String contig) { + return contigInfo.hasContig(contig); + } + + public boolean indexIsInDictionary(final int index) { + return contigInfo.hasContig(index); + } + /** * Returns the contig index of a specified string version of the contig * * @param contig the contig string - * @param exceptionOut in some cases we don't want to exception out if the contig isn't valid * * @return the contig index, -1 if not found */ - public int getContigIndex(final String contig, boolean exceptionOut) { - int idx = contigInfo.getSequenceIndex(contig); - if (idx == -1 && exceptionOut) + @Requires("contig != null") + @Ensures("result >= 0") + public int getContigIndex(final String contig) { + if (! contigInfo.hasContig(contig) ) throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); - return idx; + return contigInfo.getSequenceIndex(contig); + } + + @Requires("contig != null") + protected int getContigIndexWithoutException(final String contig) { + if (! contigInfo.hasContig(contig)) + return -1; + return contigInfo.getSequenceIndex(contig); } /** @@ -197,6 +239,8 @@ public class GenomeLocParser { * @return a GenomeLoc representing the String * */ + @Requires("str != null") + @Ensures("result != null") public GenomeLoc parseGenomeInterval(final String str) { GenomeLoc ret = parseGenomeLoc(str); exceptionOnInvalidGenomeLocBounds(ret); @@ -215,6 +259,8 @@ public class GenomeLocParser { * @return a GenomeLoc representing the String * */ + @Requires("str != null") + @Ensures("result != null") public GenomeLoc parseGenomeLoc(final String str) { // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); @@ -249,14 +295,14 @@ public class GenomeLocParser { } // is the contig valid? - if (!isContigValid(contig)) + if (!contigIsInDictionary(contig)) throw new UserException("Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?"); if (stop == Integer.MAX_VALUE) // lookup the actually stop position! stop = getContigInfo(contig).getSequenceLength(); - GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig,true), start, stop); + GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig), start, stop); exceptionOnInvalidGenomeLoc(locus); return locus; } @@ -270,12 +316,12 @@ public class GenomeLocParser { * Parses a number like 1,000,000 into a long. * @param pos */ + @Requires("pos != null") + @Ensures("result >= 0") private int parsePosition(final String pos) { - //String x = pos.replaceAll(",", ""); - this was replaced because it uses regexps - //System.out.println("Parsing position: '" + pos + "'"); - if(pos.indexOf('-') != -1) { - throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." ); - } + if(pos.indexOf('-') != -1) { + throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." ); + } if(pos.indexOf(',') != -1) { final StringBuilder buffer = new StringBuilder(); @@ -296,49 +342,6 @@ public class GenomeLocParser { } } - - /** - * merge a list of genome locs that may be overlapping, returning the list of unique genomic locations - * - * @param raw the unchecked genome loc list - * @param rule the merging rule we're using - * - * @return the list of merged locations - */ - public List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { - if (raw.size() <= 1) - return raw; - else { - ArrayList merged = new ArrayList(); - Iterator it = raw.iterator(); - GenomeLoc prev = it.next(); - while (it.hasNext()) { - GenomeLoc curr = it.next(); - if (prev.overlapsP(curr)) { - prev = prev.merge(curr); - } else if (prev.contiguousP(curr) && rule == IntervalMergingRule.ALL) { - prev = prev.merge(curr); - } else { - merged.add(prev); - prev = curr; - } - } - merged.add(prev); - return merged; - } - } - - /** - * Determines whether the given contig is valid with respect to the sequence dictionary - * already installed in the GenomeLoc. - * - * @return True if the contig is valid. False otherwise. - */ - private boolean isContigValid(String contig) { - int contigIndex = contigInfo.getSequenceIndex(contig); - return contigIndex >= 0 && contigIndex < contigInfo.getNSequences(); - } - /** * Use this static constructor when the input data is under limited control (i.e. parsing user data). * @@ -352,70 +355,9 @@ public class GenomeLocParser { * start/stop could be anything */ public GenomeLoc parseGenomeLoc(final String contig, int start, int stop) { - if (!isContigValid(contig)) + if (!contigIsInDictionary(contig)) throw new MalformedGenomeLocException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?"); - return new GenomeLoc(contig, getContigIndex(contig,true), start, stop); - } - - - /** - * Read a file of genome locations to process. The file may be in BED, Picard, - * or GATK interval format. - * - * @param file_name interval file - * @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals - * @return List List of Genome Locs that have been parsed from file - */ - public List intervalFileToList(final String file_name, boolean allowEmptyIntervalList) { - // try to open file - File inputFile = new File(file_name); - List ret = new ArrayList(); - - // case: BED file - if (file_name.toUpperCase().endsWith(".BED")) { - BedParser parser = new BedParser(this,inputFile); - ret.addAll(parser.getLocations()); - } - else { - /** - * IF not a BED file: - * first try to read it as a Picard interval file since that's well structured - * we'll fail quickly if it's not a valid file. - */ - try { - // Note: Picard will skip over intervals with contigs not in the sequence dictionary - IntervalList il = IntervalList.fromFile(inputFile); - - for (Interval interval : il.getIntervals()) { - ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true), - interval.getStart(), interval.getEnd())); - } - } - - // if that didn't work, try parsing file as a GATK interval file - catch (Exception e) { - try { - XReadLines reader = new XReadLines(new File(file_name)); - for(String line: reader) { - if ( line.trim().length() > 0 ) { - ret.add(parseGenomeInterval(line)); - } - } - reader.close(); - } - catch (IOException e2) { - throw new UserException.CouldNotReadInputFile(inputFile, e2); - } - } - } - - if ( ret.isEmpty() && ! allowEmptyIntervalList ) { - throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " + - "that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " + - "not been enabled"); - } - - return ret; + return new GenomeLoc(contig, getContigIndex(contig), start, stop); } /** @@ -425,6 +367,8 @@ public class GenomeLocParser { * * @return the string that represents that contig name */ + @Requires("indexIsInDictionary(contigIndex)") + @Ensures("result != null") private String getSequenceNameFromIndex(int contigIndex) { return contigInfo.getSequence(contigIndex).getSequenceName(); } @@ -439,18 +383,29 @@ public class GenomeLocParser { * @return a new genome loc */ public GenomeLoc createGenomeLoc(String contig, final int start, final int stop) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), start, stop)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), start, stop)); } /** - * create a genome loc, given a read + * create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position, + * then a GenomeLoc is returned for contig:start-start, otherwise and UNMAPPED GenomeLoc is returned. * * @param read * * @return */ public GenomeLoc createGenomeLoc(final SAMRecord read) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd())); + GenomeLoc loc; + + if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == -1 ) + // read is unmapped and not placed anywhere on the genome + loc = GenomeLoc.UNMAPPED; + else { + int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : read.getAlignmentEnd(); + loc = new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end); + } + + return exceptionOnInvalidGenomeLoc(loc); } /** @@ -462,7 +417,7 @@ public class GenomeLocParser { * @return a genome loc representing a single base at the specified postion on the contig */ public GenomeLoc createGenomeLoc(final String contig, final int pos) { - return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), pos, pos)); + return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), pos, pos)); } /** @@ -475,7 +430,7 @@ public class GenomeLocParser { * @return a new GenomeLoc representing the specified location */ public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) { - return new GenomeLoc(contig, getContigIndex(contig, false), start, stop); + return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop); } /** @@ -571,32 +526,6 @@ public class GenomeLocParser { } } - /** - * a method for validating genome locs as valid - * - * @param loc the location to validate - * - * @return true if the passed in GenomeLoc represents a valid location - * - * performs interval-style validation: contig is valid and atart and stop less than the end - */ - public boolean validGenomeLoc(GenomeLoc loc) { - // quick check before we get the contig size, is the contig number valid - if ((loc.getContigIndex() < 0) || // the contig index has to be positive - (loc.getContigIndex() >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs) - return false; - - int contigSize = contigInfo.getSequence(loc.getContigIndex()).getSequenceLength(); - if ((loc.getStart() < 0) || // start must be greater than 0 - ((loc.getStop() != -1) && (loc.getStop() < 0)) || // the stop can be -1, but no other neg number - (loc.getStart() > contigSize) || // the start must be before or equal to the contig end - (loc.getStop() > contigSize)) // the stop must also be before or equal to the contig end - return false; - - // we passed - return true; - } - /** * validate a position or interval on the genome as valid * @@ -609,8 +538,10 @@ public class GenomeLocParser { * performs interval-style validation: contig is valid and atart and stop less than the end */ public boolean validGenomeLoc(String contig, int start, int stop) { - return validGenomeLoc(new GenomeLoc(contig, getContigIndex(contig, false), start, stop)); - + if ( ! contigInfo.hasContig(contig) ) + return false; + else + return validGenomeLoc(getContigIndex(contig), start, stop); } /** @@ -625,8 +556,20 @@ public class GenomeLocParser { * performs interval-style validation: contig is valid and atart and stop less than the end */ public boolean validGenomeLoc(int contigIndex, int start, int stop) { - if (contigIndex < 0 || contigIndex >= contigInfo.getNSequences()) return false; - return validGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contigIndex), contigIndex, start, stop)); + // quick check before we get the contig size, is the contig number valid + if ((contigIndex < 0) || // the contig index has to be positive + (contigIndex >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs) + return false; + + int contigSize = contigInfo.getSequence(contigIndex).getSequenceLength(); + if ((start < 0) || // start must be greater than 0 + (stop < 0 ) || // the stop must be greater than 0 + (start > contigSize) || // the start must be before or equal to the contig end + (stop > contigSize)) // the stop must also be before or equal to the contig end + return false; + + // we passed + return true; } /** @@ -636,6 +579,7 @@ public class GenomeLocParser { * * @param loc the old location * @param start a new start position + * TODO -- move to me GenomeLoc class itself * * @return the newly created genome loc */ @@ -651,6 +595,7 @@ public class GenomeLocParser { * @param loc the old location * @param stop a new stop position * + * TODO -- move to me GenomeLoc class itself * @return */ public GenomeLoc setStop(GenomeLoc loc, int stop) { @@ -662,6 +607,7 @@ public class GenomeLocParser { * * @param loc the old location * + * TODO -- move to me GenomeLoc class itself * @return a new genome loc */ public GenomeLoc incPos(GenomeLoc loc) { @@ -674,6 +620,7 @@ public class GenomeLocParser { * @param loc the old location * @param by how much to move the start and stop by * + * TODO -- move to me GenomeLoc class itself * @return a new genome loc */ public GenomeLoc incPos(GenomeLoc loc, int by) { @@ -685,10 +632,10 @@ public class GenomeLocParser { * @param contigName Name of the contig. * @return A locus spanning the entire contig. */ + @Requires("contigName != null") + @Ensures("result != null") public GenomeLoc createOverEntireContig(String contigName) { SAMSequenceRecord contig = contigInfo.getSequence(contigName); - if(contig == null) - throw new ReviewedStingException("Unable to find contig named " + contigName); return exceptionOnInvalidGenomeLoc(new GenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength())); } } diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index 3376d0b3f..fea7d08d8 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -1,14 +1,18 @@ package org.broadinstitute.sting.utils.interval; +import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.bed.BedParser; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; +import java.io.IOException; import java.util.*; import java.io.File; @@ -54,7 +58,7 @@ public class IntervalUtils { // if it's a file, add items to raw interval list else if (isIntervalFile(fileOrInterval)) { try { - rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList)); + rawIntervals.addAll(intervalFileToList(parser, fileOrInterval, allowEmptyIntervalList)); } catch ( UserException.MalformedGenomeLoc e ) { throw e; @@ -75,6 +79,65 @@ public class IntervalUtils { return rawIntervals; } + /** + * Read a file of genome locations to process. The file may be in BED, Picard, + * or GATK interval format. + * + * @param file_name interval file + * @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals + * @return List List of Genome Locs that have been parsed from file + */ + public static List intervalFileToList(final GenomeLocParser glParser, final String file_name, boolean allowEmptyIntervalList) { + // try to open file + File inputFile = new File(file_name); + List ret = new ArrayList(); + + // case: BED file + if (file_name.toUpperCase().endsWith(".BED")) { + BedParser parser = new BedParser(glParser,inputFile); + ret.addAll(parser.getLocations()); + } + else { + /** + * IF not a BED file: + * first try to read it as a Picard interval file since that's well structured + * we'll fail quickly if it's not a valid file. + */ + try { + // Note: Picard will skip over intervals with contigs not in the sequence dictionary + IntervalList il = IntervalList.fromFile(inputFile); + + for (Interval interval : il.getIntervals()) { + ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd())); + } + } + + // if that didn't work, try parsing file as a GATK interval file + catch (Exception e) { + try { + XReadLines reader = new XReadLines(new File(file_name)); + for(String line: reader) { + if ( line.trim().length() > 0 ) { + ret.add(glParser.parseGenomeInterval(line)); + } + } + reader.close(); + } + catch (IOException e2) { + throw new UserException.CouldNotReadInputFile(inputFile, e2); + } + } + } + + if ( ret.isEmpty() && ! allowEmptyIntervalList ) { + throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " + + "that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " + + "not been enabled"); + } + + return ret; + } + /** * Returns true if the interval string is the "unmapped" interval * @param interval Interval to check @@ -145,7 +208,7 @@ public class IntervalUtils { // sort raw interval list Collections.sort(intervals); // now merge raw interval list - intervals = parser.mergeIntervalLocations(intervals, mergingRule); + intervals = mergeIntervalLocations(intervals, mergingRule); return GenomeLocSortedSet.createSetFromList(parser,intervals); } @@ -331,4 +394,35 @@ public class IntervalUtils { private static net.sf.picard.util.Interval toInterval(GenomeLoc loc, int locIndex) { return new net.sf.picard.util.Interval(loc.getContig(), loc.getStart(), loc.getStop(), false, "interval_" + locIndex); } + + /** + * merge a list of genome locs that may be overlapping, returning the list of unique genomic locations + * + * @param raw the unchecked genome loc list + * @param rule the merging rule we're using + * + * @return the list of merged locations + */ + public static List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { + if (raw.size() <= 1) + return raw; + else { + ArrayList merged = new ArrayList(); + Iterator it = raw.iterator(); + GenomeLoc prev = it.next(); + while (it.hasNext()) { + GenomeLoc curr = it.next(); + if (prev.overlapsP(curr)) { + prev = prev.merge(curr); + } else if (prev.contiguousP(curr) && rule == IntervalMergingRule.ALL) { + prev = prev.merge(curr); + } else { + merged.add(prev); + prev = curr; + } + } + merged.add(prev); + return merged; + } + } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java index 093c871a1..7c47d437c 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupIntegrationTest.java @@ -12,7 +12,7 @@ import java.util.Collections; * @version 0.1 */ public class ValidatingPileupIntegrationTest extends WalkerTest { - @Test + @Test(enabled = false) public void testEcoliThreaded() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T ValidatingPileup" + diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index 71774c7da..99653094d 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -5,6 +5,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; @@ -30,20 +31,39 @@ public class GenomeLocParserUnitTest extends BaseTest { @Test(expectedExceptions=RuntimeException.class) public void testGetContigIndex() { - assertEquals(genomeLocParser.getContigIndex("blah",true), -1); // should not be in the reference + assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference } @Test public void testGetContigIndexValid() { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); - assertEquals(genomeLocParser.getContigIndex("chr1",true), 0); // should be in the reference + assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference } - @Test - public void testGetContigInfoUnknownContig() { - assertEquals(null, genomeLocParser.getContigInfo("blah")); // should be in the reference + @Test(expectedExceptions=UserException.class) + public void testGetContigInfoUnknownContig1() { + assertEquals(null, genomeLocParser.getContigInfo("blah")); // should *not* be in the reference } + @Test(expectedExceptions=UserException.class) + public void testGetContigInfoUnknownContig2() { + assertEquals(null, genomeLocParser.getContigInfo(null)); // should *not* be in the reference + } + + @Test() + public void testHasContigInfoUnknownContig1() { + assertEquals(false, genomeLocParser.contigIsInDictionary("blah")); // should *not* be in the reference + } + + @Test() + public void testHasContigInfoUnknownContig2() { + assertEquals(false, genomeLocParser.contigIsInDictionary(null)); // should *not* be in the reference + } + + @Test() + public void testHasContigInfoKnownContig() { + assertEquals(true, genomeLocParser.contigIsInDictionary("chr1")); // should be in the reference + } @Test public void testGetContigInfoKnownContig() { diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index fd07f4e92..bd2fbb357 100644 --- a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -555,7 +555,7 @@ public class IntervalUtilsUnitTest extends BaseTest { List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); Assert.assertEquals(locs.size(), 2); - List merged = hg18GenomeLocParser.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); Assert.assertEquals(merged.size(), 1); } }