Contracts for Java now write for GenomeLoc and GenomeLocParser. The semantics of GenomeLoc are now much clearer. It is no longer allowed to create invalid GenomeLocs -- you can only create them with well formed start, end, and contigs, with respect to the mater dictionary. Where one previously created an invalid GenomeLoc, and asked is this valid, you must now provide the raw arguments to helper functions to assess this. Providing bad arguments to GenomeLoc generates UserExceptions now. Added utilty functions contigIsInDictionary and indexIsInDictionary to help with this.

Refactored several Interval utilties from GenomeLocParser to IntervalUtils, as one might expect they go

Removed GenomeLoc.clone() method, as this was not correctly implemented, and actually unnecessary, as GenomeLocs are immutable.  Several iterator classes have changed to remove their use of clone()

Removed misc. unnecessary imports

Disabled, temporarily, the validating pileup integration test, as it uses reads mapped to an different reference sequence for ecoli, and this now does not satisfy the contracts for GenomeLoc


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5827 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-05-20 15:43:27 +00:00
parent 0095aa2627
commit e16bc2cbd9
11 changed files with 334 additions and 234 deletions

View File

@ -24,7 +24,6 @@
package org.broadinstitute.sting.gatk;
import net.sf.picard.filter.SamRecordFilter;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
@ -622,7 +621,7 @@ public class GenomeAnalysisEngine {
// if include argument isn't given, create new set of all possible intervals
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
loadIntervals(argCollection.intervals, genomeLocParser.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging)));
loadIntervals(argCollection.intervals, IntervalUtils.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging)));
// if no exclude arguments, can return parseIntervalArguments directly
if (argCollection.excludeIntervals == null)

View File

@ -64,7 +64,7 @@ public class GenomeLocusIterator implements Iterator<GenomeLoc> {
public GenomeLoc next() {
if( !hasNext() )
throw new NoSuchElementException("No elements remaining in bounded reference region.");
GenomeLoc toReturn = (GenomeLoc)currentLocus.clone();
GenomeLoc toReturn = currentLocus;
currentLocus = parser.incPos(currentLocus);
return toReturn;
}

View File

@ -356,7 +356,7 @@ public class SeekableRODIterator implements LocationAwareSeekableRODIterator {
}
if ( records.size() > 0 ) {
return new RODRecordListImpl(name,records,interval.clone());
return new RODRecordListImpl(name,records,interval);
} else {
return null;
}

View File

@ -304,7 +304,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (!ref.getLocus().equals(lastSiteVisited)) {
// starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus().clone();
lastSiteVisited = ref.getLocus();
indelLikelihoodMap.clear();
haplotypeMap.clear();

View File

@ -83,7 +83,7 @@ public class PrintIntervalsNotInBedWalker extends RodWalker<Integer, Integer> {
}
else {
printed += printWaitingIntervalAsBed();
waitingInterval = ref.getLocus().clone();
waitingInterval = ref.getLocus();
}
}
else {

View File

@ -1,10 +1,10 @@
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.List;
import java.io.Serializable;
/**
@ -14,10 +14,8 @@ import java.io.Serializable;
* Time: 8:50:11 AM
*
* Genome location representation. It is *** 1 *** based closed.
*
*
*/
public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable, HasGenomeLocation {
public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenomeLocation {
/**
* the basic components of a genome loc, its contig index,
* start and stop position, and (optionally) the contig name
@ -34,11 +32,12 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
* in comparators, etc.
*/
// TODO - WARNING WARNING WARNING code somehow depends on the name of the contig being null!
public static final GenomeLoc UNMAPPED = new GenomeLoc(null,-1,0,0);
public static final GenomeLoc UNMAPPED = new GenomeLoc((String)null);
public static final GenomeLoc WHOLE_GENOME = new GenomeLoc("all");
public static final boolean isUnmapped(GenomeLoc loc) {
return loc == UNMAPPED;
}
public static final GenomeLoc WHOLE_GENOME = new GenomeLoc("all",-1,0,0);
// --------------------------------------------------------------------------------------------------------------
//
@ -46,10 +45,10 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
//
// --------------------------------------------------------------------------------------------------------------
protected GenomeLoc(final SAMRecord read) {
this(read.getHeader().getSequence(read.getReferenceIndex()).getSequenceName(), read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd());
}
@Requires({
"contig != null",
"contigIndex >= 0", // I believe we aren't allowed to create GenomeLocs without a valid contigIndex
"start >= 0 && start <= stop"})
protected GenomeLoc( final String contig, final int contigIndex, final int start, final int stop ) {
this.contigName = contig;
this.contigIndex = contigIndex;
@ -57,20 +56,23 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
this.stop = stop;
}
/**
* Return a new GenomeLoc at this same position.
* @return A GenomeLoc with the same contents as the current loc.
*/
@Override
public GenomeLoc clone() {
return new GenomeLoc(getContig(),getContigIndex(),getStart(),getStop());
/** Unsafe constructor for special constant genome locs */
private GenomeLoc( final String contig ) {
this.contigName = contig;
this.contigIndex = -1;
this.start = 0;
this.stop = 0;
}
//
// Accessors and setters
// Accessors
//
@Ensures("result != null")
public final GenomeLoc getLocation() { return this; }
/**
* @return the name of the contig of this GenomeLoc
*/
public final String getContig() {
return this.contigName;
}
@ -78,6 +80,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
public final int getContigIndex() { return this.contigIndex; }
public final int getStart() { return this.start; }
public final int getStop() { return this.stop; }
@Ensures("result != null")
public final String toString() {
if(GenomeLoc.isUnmapped(this)) return "unmapped";
if ( throughEndOfContigP() && atBeginningOfContigP() )
@ -87,25 +91,40 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
else
return String.format("%s:%d-%d", getContig(), getStart(), getStop());
}
private boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; }
private boolean atBeginningOfContigP() { return this.start == 1; }
private boolean throughEndOfContigP() { return this.stop == Integer.MAX_VALUE; }
private boolean atBeginningOfContigP() { return this.start == 1; }
@Requires("that != null")
public final boolean disjointP(GenomeLoc that) {
return this.contigIndex != that.contigIndex || this.start > that.stop || that.start > this.stop;
}
@Requires("that != null")
public final boolean discontinuousP(GenomeLoc that) {
return this.contigIndex != that.contigIndex || (this.start - 1) > that.stop || (that.start - 1) > this.stop;
}
@Requires("that != null")
public final boolean overlapsP(GenomeLoc that) {
return ! disjointP( that );
}
@Requires("that != null")
public final boolean contiguousP(GenomeLoc that) {
return ! discontinuousP( that );
}
/**
* Returns a new GenomeLoc that represents the entire span of this and that. Requires that
* this and that GenomeLoc are contiguous and both mapped
*/
@Requires({
"that != null",
"! isUnmapped(this)",
"! isUnmapped(that)"})
@Ensures("result != null")
public GenomeLoc merge( GenomeLoc that ) throws ReviewedStingException {
if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
@ -133,6 +152,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) };
}
@Requires("that != null")
@Ensures("result != null")
public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
@ -149,25 +170,31 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
Math.min( getStop(), that.getStop()) );
}
@Requires("that != null")
public final boolean containsP(GenomeLoc that) {
return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop();
}
@Requires("that != null")
public final boolean onSameContig(GenomeLoc that) {
return (this.contigIndex == that.contigIndex);
}
@Requires("that != null")
public final int minus( final GenomeLoc that ) {
if ( this.onSameContig(that) )
return (int) (this.getStart() - that.getStart());
return this.getStart() - that.getStart();
else
return Integer.MAX_VALUE;
}
@Requires("that != null")
@Ensures("result >= 0")
public final int distance( final GenomeLoc that ) {
return Math.abs(minus(that));
}
@Requires({"left != null", "right != null"})
public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) {
return this.compareTo(left) > -1 && this.compareTo(right) < 1;
}
@ -177,6 +204,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
* @param that Contig to test against.
* @return true if this contig ends before 'that' starts; false if this is completely after or overlaps 'that'.
*/
@Requires("that != null")
public final boolean isBefore( GenomeLoc that ) {
int comparison = this.compareContigs(that);
return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() ));
@ -187,6 +215,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
* @param that Other contig to test.
* @return True if the start of this contig is before the start of the that contig.
*/
@Requires("that != null")
public final boolean startsBefore(final GenomeLoc that) {
int comparison = this.compareContigs(that);
return ( comparison == -1 || ( comparison == 0 && this.getStart() < that.getStart() ));
@ -197,12 +226,17 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
* @param that Contig to test against.
* @return true if this contig starts after 'that' ends; false if this is completely before or overlaps 'that'.
*/
@Requires("that != null")
public final boolean isPast( GenomeLoc that ) {
int comparison = this.compareContigs(that);
return ( comparison == 1 || ( comparison == 0 && this.getStart() > that.getStop() ));
}
// Return the minimum distance between any pair of bases in this and that GenomeLocs:
/**
* Return the minimum distance between any pair of bases in this and that GenomeLocs:
*/
@Requires("that != null")
@Ensures("result >= 0")
public final int minDistance( final GenomeLoc that ) {
if (!this.onSameContig(that))
return Integer.MAX_VALUE;
@ -218,8 +252,14 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
return minDistance;
}
@Requires({
"locFirst != null",
"locSecond != null",
"locSecond.isPast(locFirst)"
})
@Ensures("result >= 0")
private static int distanceFirstStopToSecondStart(GenomeLoc locFirst, GenomeLoc locSecond) {
return (int) (locSecond.getStart() - locFirst.getStop());
return locSecond.getStart() - locFirst.getStop();
}
@ -253,6 +293,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
* @param that the genome loc to compare contigs with
* @return 0 if equal, -1 if that.contig is greater, 1 if this.contig is greater
*/
@Requires("that != null")
@Ensures("result == 0 || result == 1 || result == -1")
public final int compareContigs( GenomeLoc that ) {
if (this.contigIndex == that.contigIndex)
return 0;
@ -261,8 +303,11 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
return -1;
}
@Requires("that != null")
@Ensures("result == 0 || result == 1 || result == -1")
public int compareTo( GenomeLoc that ) {
int result = 0;
if ( this == that ) {
result = 0;
}
@ -279,14 +324,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
if ( this.getStart() < that.getStart() ) result = -1;
if ( this.getStart() > that.getStart() ) result = 1;
}
// TODO: and error is being thrown because we are treating reads with the same start positions
// but different stop as out of order
//if ( this.getStop() < that.getStop() ) return -1;
//if ( this.getStop() > that.getStop() ) return 1;
}
//System.out.printf("this vs. that = %s %s => %d%n", this, that, result);
return result;
}
@ -295,6 +334,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
* @return Number of BPs covered by this locus. According to the semantics of GenomeLoc, this should
* never be < 1.
*/
@Ensures("result > 0")
public long size() {
return stop - start + 1;
}

View File

@ -25,35 +25,24 @@
package org.broadinstitute.sting.utils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.google.java.contract.*;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.util.Interval;
import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.utils.bed.BedParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.text.XReadLines;
/**
* Created by IntelliJ IDEA.
* User: aaron
* Date: Jun 18, 2009
* Time: 11:17:01 PM
* To change this template use File | Settings | File Templates.
* Factory class for creating GenomeLocs
*/
@Invariant({
"logger != null",
"contigInfo != null"})
public class GenomeLocParser {
private static Logger logger = Logger.getLogger(GenomeLocParser.class);
@ -62,11 +51,19 @@ public class GenomeLocParser {
// Ugly global variable defining the optional ordering of contig elements
//
// --------------------------------------------------------------------------------------------------------------
private final MasterSequenceDictionary contigInfo;
/**
* A wrapper class that provides efficient last used caching for the global
* SAMSequenceDictionary underlying all of the GATK engine capabilities
*/
// todo -- enable when CoFoJa developers identify the problem (likely thread unsafe invariants)
// @Invariant({
// "dict != null",
// "dict.size() > 0",
// "lastSSR == null || dict.getSequence(lastContig).getSequenceIndex() == lastIndex",
// "lastSSR == null || dict.getSequence(lastContig).getSequenceName() == lastContig",
// "lastSSR == null || dict.getSequence(lastContig) == lastSSR"})
private final class MasterSequenceDictionary {
final private SAMSequenceDictionary dict;
@ -75,45 +72,61 @@ public class GenomeLocParser {
String lastContig = "";
int lastIndex = -1;
@Requires({"dict != null", "dict.size() > 0"})
public MasterSequenceDictionary(SAMSequenceDictionary dict) {
this.dict = dict;
}
@Ensures("result > 0")
public final int getNSequences() {
return dict.size();
}
@Requires("contig != null")
public synchronized boolean hasContig(final String contig) {
return lastContig == contig || dict.getSequence(contig) != null;
}
@Requires("index >= 0")
public synchronized boolean hasContig(final int index) {
return lastIndex == index|| dict.getSequence(index) != null;
}
@Requires("contig != null")
@Ensures("result != null")
public synchronized final SAMSequenceRecord getSequence(final String contig) {
if ( isCached(contig) )
return lastSSR;
else
return updateCache(dict.getSequence(contig));
return updateCache(contig, -1);
}
@Requires("index >= 0")
@Ensures("result != null")
public synchronized final SAMSequenceRecord getSequence(final int index) {
if ( isCached(index) )
return lastSSR;
else
return updateCache(dict.getSequence(index));
return updateCache(null, index);
}
@Requires("contig != null")
@Ensures("result >= 0")
public synchronized final int getSequenceIndex(final String contig) {
if ( ! isCached(contig) ) {
SAMSequenceRecord rec = dict.getSequence(contig);
if ( rec == null )
return -1; // not found
else
updateCache(rec);
updateCache(contig, -1);
}
return lastIndex;
}
@Requires({"contig != null", "lastContig != null"})
private synchronized boolean isCached(final String contig) {
return lastContig.equals(contig);
}
@Requires({"lastIndex != -1", "index >= 0"})
private synchronized boolean isCached(final int index) {
return lastIndex == index;
}
@ -122,12 +135,16 @@ public class GenomeLocParser {
* The key algorithm. Given a new record, update the last used record, contig
* name, and index.
*
* @param rec
* @param contig
* @param index
* @return
*/
private synchronized SAMSequenceRecord updateCache(SAMSequenceRecord rec) {
@Requires("contig != null || index >= 0")
@Ensures("result != null")
private synchronized SAMSequenceRecord updateCache(final String contig, int index ) {
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
if ( rec == null ) {
return null;
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
} else {
lastSSR = rec;
lastContig = rec.getSequenceName();
@ -135,14 +152,15 @@ public class GenomeLocParser {
return rec;
}
}
}
private MasterSequenceDictionary contigInfo = null;
}
/**
* set our internal reference contig order
* @param refFile the reference file
*/
@Requires("refFile != null")
public GenomeLocParser(final ReferenceSequenceFile refFile) {
this(refFile.getSequenceDictionary());
}
@ -151,12 +169,12 @@ public class GenomeLocParser {
if (seqDict == null) { // we couldn't load the reference dictionary
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
throw new UserException.CommandLineException("Failed to load reference dictionary");
} else if (contigInfo == null) {
contigInfo = new MasterSequenceDictionary(seqDict);
logger.debug(String.format("Prepared reference sequence contig dictionary"));
for (SAMSequenceRecord contig : seqDict.getSequences()) {
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
}
}
contigInfo = new MasterSequenceDictionary(seqDict);
logger.debug(String.format("Prepared reference sequence contig dictionary"));
for (SAMSequenceRecord contig : seqDict.getSequences()) {
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
}
}
@ -167,23 +185,47 @@ public class GenomeLocParser {
*
* @return the sam sequence record
*/
@Requires({"contig != null", "contigIsInDictionary(contig)"})
@Ensures("result != null")
public SAMSequenceRecord getContigInfo(final String contig) {
return contigInfo.getSequence(contig);
}
/**
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc.
*
* @return True if the contig is valid. False otherwise.
*/
@Requires({"contig != null"})
public boolean contigIsInDictionary(String contig) {
return contigInfo.hasContig(contig);
}
public boolean indexIsInDictionary(final int index) {
return contigInfo.hasContig(index);
}
/**
* Returns the contig index of a specified string version of the contig
*
* @param contig the contig string
* @param exceptionOut in some cases we don't want to exception out if the contig isn't valid
*
* @return the contig index, -1 if not found
*/
public int getContigIndex(final String contig, boolean exceptionOut) {
int idx = contigInfo.getSequenceIndex(contig);
if (idx == -1 && exceptionOut)
@Requires("contig != null")
@Ensures("result >= 0")
public int getContigIndex(final String contig) {
if (! contigInfo.hasContig(contig) )
throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig));
return idx;
return contigInfo.getSequenceIndex(contig);
}
@Requires("contig != null")
protected int getContigIndexWithoutException(final String contig) {
if (! contigInfo.hasContig(contig))
return -1;
return contigInfo.getSequenceIndex(contig);
}
/**
@ -197,6 +239,8 @@ public class GenomeLocParser {
* @return a GenomeLoc representing the String
*
*/
@Requires("str != null")
@Ensures("result != null")
public GenomeLoc parseGenomeInterval(final String str) {
GenomeLoc ret = parseGenomeLoc(str);
exceptionOnInvalidGenomeLocBounds(ret);
@ -215,6 +259,8 @@ public class GenomeLocParser {
* @return a GenomeLoc representing the String
*
*/
@Requires("str != null")
@Ensures("result != null")
public GenomeLoc parseGenomeLoc(final String str) {
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
//System.out.printf("Parsing location '%s'%n", str);
@ -249,14 +295,14 @@ public class GenomeLocParser {
}
// is the contig valid?
if (!isContigValid(contig))
if (!contigIsInDictionary(contig))
throw new UserException("Contig '" + contig + "' does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?");
if (stop == Integer.MAX_VALUE)
// lookup the actually stop position!
stop = getContigInfo(contig).getSequenceLength();
GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig,true), start, stop);
GenomeLoc locus = new GenomeLoc(contig, getContigIndex(contig), start, stop);
exceptionOnInvalidGenomeLoc(locus);
return locus;
}
@ -270,12 +316,12 @@ public class GenomeLocParser {
* Parses a number like 1,000,000 into a long.
* @param pos
*/
@Requires("pos != null")
@Ensures("result >= 0")
private int parsePosition(final String pos) {
//String x = pos.replaceAll(",", ""); - this was replaced because it uses regexps
//System.out.println("Parsing position: '" + pos + "'");
if(pos.indexOf('-') != -1) {
throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." );
}
if(pos.indexOf('-') != -1) {
throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." );
}
if(pos.indexOf(',') != -1) {
final StringBuilder buffer = new StringBuilder();
@ -296,49 +342,6 @@ public class GenomeLocParser {
}
}
/**
* merge a list of genome locs that may be overlapping, returning the list of unique genomic locations
*
* @param raw the unchecked genome loc list
* @param rule the merging rule we're using
*
* @return the list of merged locations
*/
public List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, IntervalMergingRule rule) {
if (raw.size() <= 1)
return raw;
else {
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
Iterator<GenomeLoc> it = raw.iterator();
GenomeLoc prev = it.next();
while (it.hasNext()) {
GenomeLoc curr = it.next();
if (prev.overlapsP(curr)) {
prev = prev.merge(curr);
} else if (prev.contiguousP(curr) && rule == IntervalMergingRule.ALL) {
prev = prev.merge(curr);
} else {
merged.add(prev);
prev = curr;
}
}
merged.add(prev);
return merged;
}
}
/**
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc.
*
* @return True if the contig is valid. False otherwise.
*/
private boolean isContigValid(String contig) {
int contigIndex = contigInfo.getSequenceIndex(contig);
return contigIndex >= 0 && contigIndex < contigInfo.getNSequences();
}
/**
* Use this static constructor when the input data is under limited control (i.e. parsing user data).
*
@ -352,70 +355,9 @@ public class GenomeLocParser {
* start/stop could be anything
*/
public GenomeLoc parseGenomeLoc(final String contig, int start, int stop) {
if (!isContigValid(contig))
if (!contigIsInDictionary(contig))
throw new MalformedGenomeLocException("Contig " + contig + " does not match any contig in the GATK sequence dictionary derived from the reference; are you sure you are using the correct reference fasta file?");
return new GenomeLoc(contig, getContigIndex(contig,true), start, stop);
}
/**
* Read a file of genome locations to process. The file may be in BED, Picard,
* or GATK interval format.
*
* @param file_name interval file
* @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals
* @return List<GenomeLoc> List of Genome Locs that have been parsed from file
*/
public List<GenomeLoc> intervalFileToList(final String file_name, boolean allowEmptyIntervalList) {
// try to open file
File inputFile = new File(file_name);
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
// case: BED file
if (file_name.toUpperCase().endsWith(".BED")) {
BedParser parser = new BedParser(this,inputFile);
ret.addAll(parser.getLocations());
}
else {
/**
* IF not a BED file:
* first try to read it as a Picard interval file since that's well structured
* we'll fail quickly if it's not a valid file.
*/
try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
IntervalList il = IntervalList.fromFile(inputFile);
for (Interval interval : il.getIntervals()) {
ret.add(new GenomeLoc(interval.getSequence(), getContigIndex(interval.getSequence(),true),
interval.getStart(), interval.getEnd()));
}
}
// if that didn't work, try parsing file as a GATK interval file
catch (Exception e) {
try {
XReadLines reader = new XReadLines(new File(file_name));
for(String line: reader) {
if ( line.trim().length() > 0 ) {
ret.add(parseGenomeInterval(line));
}
}
reader.close();
}
catch (IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
}
}
}
if ( ret.isEmpty() && ! allowEmptyIntervalList ) {
throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " +
"that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " +
"not been enabled");
}
return ret;
return new GenomeLoc(contig, getContigIndex(contig), start, stop);
}
/**
@ -425,6 +367,8 @@ public class GenomeLocParser {
*
* @return the string that represents that contig name
*/
@Requires("indexIsInDictionary(contigIndex)")
@Ensures("result != null")
private String getSequenceNameFromIndex(int contigIndex) {
return contigInfo.getSequence(contigIndex).getSequenceName();
}
@ -439,18 +383,29 @@ public class GenomeLocParser {
* @return a new genome loc
*/
public GenomeLoc createGenomeLoc(String contig, final int start, final int stop) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), start, stop));
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), start, stop));
}
/**
* create a genome loc, given a read
* create a genome loc, given a read. If the read is unmapped, *and* yet the read has a contig and start position,
* then a GenomeLoc is returned for contig:start-start, otherwise and UNMAPPED GenomeLoc is returned.
*
* @param read
*
* @return
*/
public GenomeLoc createGenomeLoc(final SAMRecord read) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd()));
GenomeLoc loc;
if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == -1 )
// read is unmapped and not placed anywhere on the genome
loc = GenomeLoc.UNMAPPED;
else {
int end = read.getReadUnmappedFlag() ? read.getAlignmentStart() : read.getAlignmentEnd();
loc = new GenomeLoc(read.getReferenceName(), read.getReferenceIndex(), read.getAlignmentStart(), end);
}
return exceptionOnInvalidGenomeLoc(loc);
}
/**
@ -462,7 +417,7 @@ public class GenomeLocParser {
* @return a genome loc representing a single base at the specified postion on the contig
*/
public GenomeLoc createGenomeLoc(final String contig, final int pos) {
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig,true), pos, pos));
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contig, getContigIndex(contig), pos, pos));
}
/**
@ -475,7 +430,7 @@ public class GenomeLocParser {
* @return a new GenomeLoc representing the specified location
*/
public GenomeLoc createGenomeLocWithoutValidation( String contig, int start, int stop ) {
return new GenomeLoc(contig, getContigIndex(contig, false), start, stop);
return new GenomeLoc(contig, getContigIndexWithoutException(contig), start, stop);
}
/**
@ -571,32 +526,6 @@ public class GenomeLocParser {
}
}
/**
* a method for validating genome locs as valid
*
* @param loc the location to validate
*
* @return true if the passed in GenomeLoc represents a valid location
*
* performs interval-style validation: contig is valid and atart and stop less than the end
*/
public boolean validGenomeLoc(GenomeLoc loc) {
// quick check before we get the contig size, is the contig number valid
if ((loc.getContigIndex() < 0) || // the contig index has to be positive
(loc.getContigIndex() >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs)
return false;
int contigSize = contigInfo.getSequence(loc.getContigIndex()).getSequenceLength();
if ((loc.getStart() < 0) || // start must be greater than 0
((loc.getStop() != -1) && (loc.getStop() < 0)) || // the stop can be -1, but no other neg number
(loc.getStart() > contigSize) || // the start must be before or equal to the contig end
(loc.getStop() > contigSize)) // the stop must also be before or equal to the contig end
return false;
// we passed
return true;
}
/**
* validate a position or interval on the genome as valid
*
@ -609,8 +538,10 @@ public class GenomeLocParser {
* performs interval-style validation: contig is valid and atart and stop less than the end
*/
public boolean validGenomeLoc(String contig, int start, int stop) {
return validGenomeLoc(new GenomeLoc(contig, getContigIndex(contig, false), start, stop));
if ( ! contigInfo.hasContig(contig) )
return false;
else
return validGenomeLoc(getContigIndex(contig), start, stop);
}
/**
@ -625,8 +556,20 @@ public class GenomeLocParser {
* performs interval-style validation: contig is valid and atart and stop less than the end
*/
public boolean validGenomeLoc(int contigIndex, int start, int stop) {
if (contigIndex < 0 || contigIndex >= contigInfo.getNSequences()) return false;
return validGenomeLoc(new GenomeLoc(getSequenceNameFromIndex(contigIndex), contigIndex, start, stop));
// quick check before we get the contig size, is the contig number valid
if ((contigIndex < 0) || // the contig index has to be positive
(contigIndex >= contigInfo.getNSequences())) // the contig must be in the integer range of contigs)
return false;
int contigSize = contigInfo.getSequence(contigIndex).getSequenceLength();
if ((start < 0) || // start must be greater than 0
(stop < 0 ) || // the stop must be greater than 0
(start > contigSize) || // the start must be before or equal to the contig end
(stop > contigSize)) // the stop must also be before or equal to the contig end
return false;
// we passed
return true;
}
/**
@ -636,6 +579,7 @@ public class GenomeLocParser {
*
* @param loc the old location
* @param start a new start position
* TODO -- move to me GenomeLoc class itself
*
* @return the newly created genome loc
*/
@ -651,6 +595,7 @@ public class GenomeLocParser {
* @param loc the old location
* @param stop a new stop position
*
* TODO -- move to me GenomeLoc class itself
* @return
*/
public GenomeLoc setStop(GenomeLoc loc, int stop) {
@ -662,6 +607,7 @@ public class GenomeLocParser {
*
* @param loc the old location
*
* TODO -- move to me GenomeLoc class itself
* @return a new genome loc
*/
public GenomeLoc incPos(GenomeLoc loc) {
@ -674,6 +620,7 @@ public class GenomeLocParser {
* @param loc the old location
* @param by how much to move the start and stop by
*
* TODO -- move to me GenomeLoc class itself
* @return a new genome loc
*/
public GenomeLoc incPos(GenomeLoc loc, int by) {
@ -685,10 +632,10 @@ public class GenomeLocParser {
* @param contigName Name of the contig.
* @return A locus spanning the entire contig.
*/
@Requires("contigName != null")
@Ensures("result != null")
public GenomeLoc createOverEntireContig(String contigName) {
SAMSequenceRecord contig = contigInfo.getSequence(contigName);
if(contig == null)
throw new ReviewedStingException("Unable to find contig named " + contigName);
return exceptionOnInvalidGenomeLoc(new GenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength()));
}
}

View File

@ -1,14 +1,18 @@
package org.broadinstitute.sting.utils.interval;
import net.sf.picard.util.Interval;
import net.sf.picard.util.IntervalList;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.bed.BedParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.IOException;
import java.util.*;
import java.io.File;
@ -54,7 +58,7 @@ public class IntervalUtils {
// if it's a file, add items to raw interval list
else if (isIntervalFile(fileOrInterval)) {
try {
rawIntervals.addAll(parser.intervalFileToList(fileOrInterval, allowEmptyIntervalList));
rawIntervals.addAll(intervalFileToList(parser, fileOrInterval, allowEmptyIntervalList));
}
catch ( UserException.MalformedGenomeLoc e ) {
throw e;
@ -75,6 +79,65 @@ public class IntervalUtils {
return rawIntervals;
}
/**
* Read a file of genome locations to process. The file may be in BED, Picard,
* or GATK interval format.
*
* @param file_name interval file
* @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals
* @return List<GenomeLoc> List of Genome Locs that have been parsed from file
*/
public static List<GenomeLoc> intervalFileToList(final GenomeLocParser glParser, final String file_name, boolean allowEmptyIntervalList) {
// try to open file
File inputFile = new File(file_name);
List<GenomeLoc> ret = new ArrayList<GenomeLoc>();
// case: BED file
if (file_name.toUpperCase().endsWith(".BED")) {
BedParser parser = new BedParser(glParser,inputFile);
ret.addAll(parser.getLocations());
}
else {
/**
* IF not a BED file:
* first try to read it as a Picard interval file since that's well structured
* we'll fail quickly if it's not a valid file.
*/
try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
IntervalList il = IntervalList.fromFile(inputFile);
for (Interval interval : il.getIntervals()) {
ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd()));
}
}
// if that didn't work, try parsing file as a GATK interval file
catch (Exception e) {
try {
XReadLines reader = new XReadLines(new File(file_name));
for(String line: reader) {
if ( line.trim().length() > 0 ) {
ret.add(glParser.parseGenomeInterval(line));
}
}
reader.close();
}
catch (IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
}
}
}
if ( ret.isEmpty() && ! allowEmptyIntervalList ) {
throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " +
"that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " +
"not been enabled");
}
return ret;
}
/**
* Returns true if the interval string is the "unmapped" interval
* @param interval Interval to check
@ -145,7 +208,7 @@ public class IntervalUtils {
// sort raw interval list
Collections.sort(intervals);
// now merge raw interval list
intervals = parser.mergeIntervalLocations(intervals, mergingRule);
intervals = mergeIntervalLocations(intervals, mergingRule);
return GenomeLocSortedSet.createSetFromList(parser,intervals);
}
@ -331,4 +394,35 @@ public class IntervalUtils {
private static net.sf.picard.util.Interval toInterval(GenomeLoc loc, int locIndex) {
return new net.sf.picard.util.Interval(loc.getContig(), loc.getStart(), loc.getStop(), false, "interval_" + locIndex);
}
/**
* merge a list of genome locs that may be overlapping, returning the list of unique genomic locations
*
* @param raw the unchecked genome loc list
* @param rule the merging rule we're using
*
* @return the list of merged locations
*/
public static List<GenomeLoc> mergeIntervalLocations(final List<GenomeLoc> raw, IntervalMergingRule rule) {
if (raw.size() <= 1)
return raw;
else {
ArrayList<GenomeLoc> merged = new ArrayList<GenomeLoc>();
Iterator<GenomeLoc> it = raw.iterator();
GenomeLoc prev = it.next();
while (it.hasNext()) {
GenomeLoc curr = it.next();
if (prev.overlapsP(curr)) {
prev = prev.merge(curr);
} else if (prev.contiguousP(curr) && rule == IntervalMergingRule.ALL) {
prev = prev.merge(curr);
} else {
merged.add(prev);
prev = curr;
}
}
merged.add(prev);
return merged;
}
}
}

View File

@ -12,7 +12,7 @@ import java.util.Collections;
* @version 0.1
*/
public class ValidatingPileupIntegrationTest extends WalkerTest {
@Test
@Test(enabled = false)
public void testEcoliThreaded() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T ValidatingPileup" +

View File

@ -5,6 +5,7 @@ import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
@ -30,20 +31,39 @@ public class GenomeLocParserUnitTest extends BaseTest {
@Test(expectedExceptions=RuntimeException.class)
public void testGetContigIndex() {
assertEquals(genomeLocParser.getContigIndex("blah",true), -1); // should not be in the reference
assertEquals(genomeLocParser.getContigIndex("blah"), -1); // should not be in the reference
}
@Test
public void testGetContigIndexValid() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
assertEquals(genomeLocParser.getContigIndex("chr1",true), 0); // should be in the reference
assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
}
@Test
public void testGetContigInfoUnknownContig() {
assertEquals(null, genomeLocParser.getContigInfo("blah")); // should be in the reference
@Test(expectedExceptions=UserException.class)
public void testGetContigInfoUnknownContig1() {
assertEquals(null, genomeLocParser.getContigInfo("blah")); // should *not* be in the reference
}
@Test(expectedExceptions=UserException.class)
public void testGetContigInfoUnknownContig2() {
assertEquals(null, genomeLocParser.getContigInfo(null)); // should *not* be in the reference
}
@Test()
public void testHasContigInfoUnknownContig1() {
assertEquals(false, genomeLocParser.contigIsInDictionary("blah")); // should *not* be in the reference
}
@Test()
public void testHasContigInfoUnknownContig2() {
assertEquals(false, genomeLocParser.contigIsInDictionary(null)); // should *not* be in the reference
}
@Test()
public void testHasContigInfoKnownContig() {
assertEquals(true, genomeLocParser.contigIsInDictionary("chr1")); // should be in the reference
}
@Test
public void testGetContigInfoKnownContig() {

View File

@ -555,7 +555,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false);
Assert.assertEquals(locs.size(), 2);
List<GenomeLoc> merged = hg18GenomeLocParser.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
List<GenomeLoc> merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL);
Assert.assertEquals(merged.size(), 1);
}
}