Refactoring and unit testing GenomeLocParser

-- Moved previously inner class to MRUCachingSAMSequenceDictionary, and unit test to 100% coverage
-- Fully document all functions in GenomeLocParser
-- Unit tests for things like parsePosition (shocking it wasn't tested!)
-- Removed function to specifically create GenomeLocs for VariantContexts.  The fact that you must incorporate END attributes in the context means that createGenomeLoc(Feature) works correctly
-- Depreciated (and moved functionality) of setStart, setStop, and incPos to GenomeLoc
-- Unit test coverage at like 80%, moving to 100% with next commit
This commit is contained in:
Mark DePristo 2013-01-29 16:51:39 -05:00
parent 8562bfaae1
commit 45603f58cd
7 changed files with 627 additions and 224 deletions

View File

@ -193,7 +193,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
private boolean overlapsKnownCNV(VariantContext cnv) {
if ( knownCNVs != null ) {
final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true);
final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv);
IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());
final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop());

View File

@ -77,7 +77,7 @@ public class IntervalStratification extends VariantStratifier {
public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
if (eval != null) {
final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval, true);
final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval);
IntervalTree<GenomeLoc> intervalTree = intervalTreeByContig.get(loc.getContig());
IntervalTree.Node<GenomeLoc> node = intervalTree.minOverlapper(loc.getStart(), loc.getStop());
//logger.info(String.format("Overlap %s found %s", loc, node));

View File

@ -530,4 +530,55 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
final int cmp = this.compareTo(other);
return cmp == -1 ? other : this;
}
/**
* create a new genome loc from an existing loc, with a new start position
* Note that this function will NOT explicitly check the ending offset, in case someone wants to
* set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig.
*
* @param loc the old location
* @param start a new start position
*
* @return a newly allocated GenomeLoc as loc but with start == start
*/
public GenomeLoc setStart(GenomeLoc loc, int start) {
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop());
}
/**
* create a new genome loc from an existing loc, with a new stop position
* Note that this function will NOT explicitly check the ending offset, in case someone wants to
* set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig.
*
* @param loc the old location
* @param stop a new stop position
*
* @return a newly allocated GenomeLoc as loc but with stop == stop
*/
public GenomeLoc setStop(GenomeLoc loc, int stop) {
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop);
}
/**
* return a new genome loc, with an incremented position
*
* @param loc the old location
*
* @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + 1
*/
public GenomeLoc incPos(GenomeLoc loc) {
return incPos(loc, 1);
}
/**
* return a new genome loc, with an incremented position
*
* @param loc the old location
* @param by how much to move the start and stop by
*
* @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + by
*/
public GenomeLoc incPos(GenomeLoc loc, int by) {
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by);
}
}

View File

@ -34,10 +34,8 @@ import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.variant.variantcontext.VariantContext;
/**
* Factory class for creating GenomeLocs
@ -49,7 +47,9 @@ public final class GenomeLocParser {
* How much validation should we do at runtime with this parser?
*/
public enum ValidationLevel {
/** Do the standard amount of validation */
STANDARD,
/** Don't do any real checking at all */
NONE
}
@ -68,11 +68,11 @@ public final class GenomeLocParser {
/**
* A thread-local CachingSequenceDictionary
*/
private final ThreadLocal<CachingSequenceDictionary> contigInfoPerThread =
new ThreadLocal<CachingSequenceDictionary>() {
private final ThreadLocal<MRUCachingSAMSequenceDictionary> contigInfoPerThread =
new ThreadLocal<MRUCachingSAMSequenceDictionary>() {
@Override
protected CachingSequenceDictionary initialValue() {
return new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY);
protected MRUCachingSAMSequenceDictionary initialValue() {
return new MRUCachingSAMSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY);
}
};
@ -84,107 +84,10 @@ public final class GenomeLocParser {
/**
* @return a caching sequence dictionary appropriate for this thread
*/
private CachingSequenceDictionary getContigInfo() {
private MRUCachingSAMSequenceDictionary getContigInfo() {
return contigInfoPerThread.get();
}
/**
* A wrapper class that provides efficient last used caching for the global
* SAMSequenceDictionary underlying all of the GATK engine capabilities.
*/
private final class CachingSequenceDictionary {
final private SAMSequenceDictionary dict;
// cache
SAMSequenceRecord lastSSR = null;
String lastContig = "";
int lastIndex = -1;
@Requires({"dict != null", "dict.size() > 0"})
public CachingSequenceDictionary(SAMSequenceDictionary dict) {
this.dict = dict;
}
public final String internContigName(final String contig) {
return getContigInfo(contig).getSequenceName();
}
@Ensures("result > 0")
public final int getNSequences() {
return dict.size();
}
@Requires("contig != null")
public final boolean hasContig(final String contig) {
return contig.equals(lastContig) || dict.getSequence(contig) != null;
}
@Requires("index >= 0")
public final boolean hasContig(final int index) {
return lastIndex == index || dict.getSequence(index) != null;
}
@Requires("contig != null")
@Ensures("result != null")
public final SAMSequenceRecord getSequence(final String contig) {
if ( isCached(contig) )
return lastSSR;
else
return updateCache(contig, -1);
}
@Requires("index >= 0")
@Ensures("result != null")
public final SAMSequenceRecord getSequence(final int index) {
if ( isCached(index) )
return lastSSR;
else
return updateCache(null, index);
}
@Requires("contig != null")
@Ensures("result >= 0")
public final int getSequenceIndex(final String contig) {
if ( ! isCached(contig) ) {
updateCache(contig, -1);
}
return lastIndex;
}
@Requires({"contig != null", "lastContig != null"})
private boolean isCached(final String contig) {
return lastContig.equals(contig);
}
@Requires({"lastIndex != -1", "index >= 0"})
private boolean isCached(final int index) {
return lastIndex == index;
}
/**
* The key algorithm. Given a new record, update the last used record, contig
* name, and index.
*
* @param contig
* @param index
* @return
*/
@Requires("contig != null || index >= 0")
@Ensures("result != null")
private SAMSequenceRecord updateCache(final String contig, int index ) {
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
if ( rec == null ) {
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
} else {
lastSSR = rec;
lastContig = rec.getSequenceName();
lastIndex = rec.getSequenceIndex();
return rec;
}
}
}
/**
* set our internal reference contig order
* @param refFile the reference file
@ -205,16 +108,18 @@ public final class GenomeLocParser {
/**
* Create a genome loc parser based on seqDict with the specified level of validation
* @param seqDict the sequence dictionary to use when creating genome locs
* @param validationLevel how much validation should we do of the genome locs at runtime?
* @param validationLevel how much validation should we do of the genome locs at runtime? Purely for testing purposes
*/
public GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) {
this.validationLevel = validationLevel;
protected GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) {
if (validationLevel == null)
throw new IllegalArgumentException("validation level cannot be null");
if (seqDict == null) { // we couldn't load the reference dictionary
//logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs");
throw new UserException.CommandLineException("Failed to load reference dictionary");
}
SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict;
this.validationLevel = validationLevel;
this.SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict;
if ( logger.isDebugEnabled() ) {
logger.debug(String.format("Prepared reference sequence contig dictionary"));
for (SAMSequenceRecord contig : seqDict.getSequences()) {
@ -227,17 +132,13 @@ public final class GenomeLocParser {
* Determines whether the given contig is valid with respect to the sequence dictionary
* already installed in the GenomeLoc.
*
* @param contig a potentially null string name for the contig
* @return True if the contig is valid. False otherwise.
*/
public final boolean contigIsInDictionary(String contig) {
public final boolean contigIsInDictionary(final String contig) {
return contig != null && getContigInfo().hasContig(contig);
}
public final boolean indexIsInDictionary(final int index) {
return index >= 0 && getContigInfo().hasContig(index);
}
/**
* get the contig's SAMSequenceRecord
*
@ -278,7 +179,7 @@ public final class GenomeLocParser {
* @return
*/
public final SAMSequenceDictionary getContigs() {
return getContigInfo().dict;
return getContigInfo().getDictionary();
}
// --------------------------------------------------------------------------------------------------------------
@ -286,14 +187,13 @@ public final class GenomeLocParser {
// Low-level creation functions
//
// --------------------------------------------------------------------------------------------------------------
/**
* create a genome loc, given the contig name, start, and stop
* @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation.
*
* @param contig the contig name
* @param start the starting position
* @param stop the stop position
*
* @return a new genome loc
* Note that because this function doesn't take the contig index as an argument for contig, it
* has a slight performance penalty over the version that does take the contig index. Does not
* require the created genome loc on the reference genome
*/
@Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop)"})
@ -301,34 +201,61 @@ public final class GenomeLocParser {
return createGenomeLoc(contig, getContigIndex(contig), start, stop);
}
public GenomeLoc createGenomeLoc(String contig, final int start, final int stop, boolean mustBeOnReference) {
/**
* @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation.
*
* Note that because this function doesn't take the contig index as an argument for contig, it
* has a slight performance penalty over the version that does take the contig index.
*/
public GenomeLoc createGenomeLoc(final String contig, final int start, final int stop, boolean mustBeOnReference) {
return createGenomeLoc(contig, getContigIndex(contig), start, stop, mustBeOnReference);
}
/**
* @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation.
*
* Doesn't require the start and stop to be on the genome
*/
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop, false)"})
public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop) {
return createGenomeLoc(contig, index, start, stop, false);
}
/**
* Create a GenomeLoc on contig, starting at start and ending (inclusive) at stop.
*
* @param contig the contig name
* @param index the index into the GATK's SAMSequencingDictionary of contig (passed for efficiency to avoid the lookup)
* @param start the starting position
* @param stop the stop position of this loc, inclusive
* @param mustBeOnReference if true, this factory will throw a UserException.MalformedGenomeLoc if start or stop isn't on the contig
*
* @return a non-null GenomeLoc
*/
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"})
public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) {
@Ensures("result != null")
public GenomeLoc createGenomeLoc(final String contig, int index, final int start, final int stop, boolean mustBeOnReference) {
// optimization: by interning the string we ensure that future comparisons use == not the full string comp
final String interned = validateGenomeLoc(contig, index, start, stop, mustBeOnReference);
return new GenomeLoc(interned, index, start, stop);
}
/**
* Create a new genome loc, bounding start and stop by the start and end of contig
* @param contig our contig
* @param start our start as an arbitrary integer (may be negative, etc)
* @param stop our stop as an arbitrary integer (may be negative, etc)
* @throws ReviewedStingException if there's no way to create a meaningful genome loc given contig, start, and stop
* @return a valid genome loc over contig
* Create a new GenomeLoc, on contig, including the single position pos.
*
* Pos is not required to be on the reference
*
* @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation.
*
* @param contig the contig name
* @param pos the start and stop of the created genome loc
*
* @return a genome loc representing a single base at the specified postion on the contig
*/
public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) {
final GenomeLoc myLoc = createGenomeLoc(contig, start, stop);
final GenomeLoc contigLoc = createOverEntireContig(contig);
return myLoc.intersect(contigLoc);
@Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"})
public GenomeLoc createGenomeLoc(final String contig, final int pos) {
return createGenomeLoc(contig, getContigIndex(contig), pos, pos);
}
/**
@ -472,7 +399,7 @@ public final class GenomeLocParser {
*/
@Requires("pos != null")
@Ensures("result >= 0")
private int parsePosition(final String pos) {
protected int parsePosition(final String pos) {
if(pos.indexOf('-') != -1) {
throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." );
}
@ -533,89 +460,34 @@ public final class GenomeLocParser {
}
/**
* Creates a GenomeLoc corresponding to the variant context vc. If includeSymbolicEndIfPossible
* is true, and VC is a symbolic allele the end of the created genome loc will be the value
* of the END info field key, if it exists, or vc.getEnd() if not.
*
* @param vc
* @param includeSymbolicEndIfPossible
* @return
* @see GenomeLoc.setStart
*/
public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) {
if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) {
int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd());
return createGenomeLoc(vc.getChr(), vc.getStart(), end);
}
else
return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd());
}
public GenomeLoc createGenomeLoc(final VariantContext vc) {
return createGenomeLoc(vc, false);
}
/**
* create a new genome loc, given the contig name, and a single position. Must be on the reference
*
* @param contig the contig name
* @param pos the postion
*
* @return a genome loc representing a single base at the specified postion on the contig
*/
@Ensures("result != null")
@ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"})
public GenomeLoc createGenomeLoc(final String contig, final int pos) {
return createGenomeLoc(contig, getContigIndex(contig), pos, pos);
}
/**
* create a new genome loc from an existing loc, with a new start position
* Note that this function will NOT explicitly check the ending offset, in case someone wants to
* set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig.
*
* @param loc the old location
* @param start a new start position
*
* @return the newly created genome loc
*/
public GenomeLoc setStart(GenomeLoc loc, int start) {
@Deprecated
public GenomeLoc setStart(final GenomeLoc loc, final int start) {
return createGenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop());
}
/**
* create a new genome loc from an existing loc, with a new stop position
* Note that this function will NOT explicitly check the ending offset, in case someone wants to
* set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig.
*
* @param loc the old location
* @param stop a new stop position
*
* @return
* @see GenomeLoc.setStop
*/
public GenomeLoc setStop(GenomeLoc loc, int stop) {
@Deprecated
public GenomeLoc setStop(final GenomeLoc loc, final int stop) {
return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop);
}
/**
* return a new genome loc, with an incremented position
*
* @param loc the old location
*
* @return a new genome loc
* @see GenomeLoc.incPos
*/
public GenomeLoc incPos(GenomeLoc loc) {
@Deprecated
public GenomeLoc incPos(final GenomeLoc loc) {
return incPos(loc, 1);
}
/**
* return a new genome loc, with an incremented position
*
* @param loc the old location
* @param by how much to move the start and stop by
*
* @return a new genome loc
* @see GenomeLoc.incPos
*/
public GenomeLoc incPos(GenomeLoc loc, int by) {
@Deprecated
public GenomeLoc incPos(final GenomeLoc loc, final int by) {
return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by);
}
@ -626,7 +498,7 @@ public final class GenomeLocParser {
*/
@Requires("contigName != null")
@Ensures("result != null")
public GenomeLoc createOverEntireContig(String contigName) {
public GenomeLoc createOverEntireContig(final String contigName) {
SAMSequenceRecord contig = getContigInfo().getSequence(contigName);
return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true);
}
@ -638,12 +510,12 @@ public final class GenomeLocParser {
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig.
*/
@Requires({"loc != null", "maxBasePairs > 0"})
public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) {
public GenomeLoc createGenomeLocAtStart(final GenomeLoc loc, final int maxBasePairs) {
if (GenomeLoc.isUnmapped(loc))
return null;
String contigName = loc.getContig();
SAMSequenceRecord contig = getContigInfo().getSequence(contigName);
int contigIndex = contig.getSequenceIndex();
final String contigName = loc.getContig();
final SAMSequenceRecord contig = getContigInfo().getSequence(contigName);
final int contigIndex = contig.getSequenceIndex();
int start = loc.getStart() - maxBasePairs;
int stop = loc.getStart() - 1;
@ -662,19 +534,12 @@ public final class GenomeLocParser {
* @param padding The number of base pairs to pad on either end
* @return The contiguous loc of length up to the original length + 2*padding (depending on the start/end of the contig).
*/
@Requires({"loc != null", "padding > 0"})
@Requires({"loc != null", "padding >= 0"})
public GenomeLoc createPaddedGenomeLoc(final GenomeLoc loc, final int padding) {
if (GenomeLoc.isUnmapped(loc))
if (GenomeLoc.isUnmapped(loc) || padding == 0)
return loc;
final String contigName = loc.getContig();
final SAMSequenceRecord contig = getContigInfo().getSequence(contigName);
final int contigIndex = contig.getSequenceIndex();
final int contigLength = contig.getSequenceLength();
final int start = Math.max(1, loc.getStart() - padding);
final int stop = Math.min(contigLength, loc.getStop() + padding);
return createGenomeLoc(contigName, contigIndex, start, stop, true);
else
return createGenomeLocOnContig(loc.getContig(), loc.getContigIndex(), loc.getStart() - padding, loc.getStop() + padding);
}
/**
@ -684,7 +549,7 @@ public final class GenomeLocParser {
* @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig.
*/
@Requires({"loc != null", "maxBasePairs > 0"})
public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) {
public GenomeLoc createGenomeLocAtStop(final GenomeLoc loc, final int maxBasePairs) {
if (GenomeLoc.isUnmapped(loc))
return null;
String contigName = loc.getContig();
@ -702,4 +567,35 @@ public final class GenomeLocParser {
return createGenomeLoc(contigName, contigIndex, start, stop, true);
}
/**
* @see #createGenomeLocOnContig(String, int, int, int) with the contig index looked up from contig
*/
public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) {
return createGenomeLocOnContig(contig, getContigIndex(contig), start, stop);
}
/**
* Create a new genome loc, bounding start and stop by the start and end of contig
*
* This function will return null if start and stop cannot be adjusted in any reasonable way
* to be on the contig. For example, if start and stop are both past the end of the contig,
* there's no way to fix this, and null will be returned.
*
* @param contig our contig
* @param start our start as an arbitrary integer (may be negative, etc)
* @param stop our stop as an arbitrary integer (may be negative, etc)
* @return a valid genome loc over contig, or null if a meaningful genome loc cannot be created
*/
public GenomeLoc createGenomeLocOnContig(final String contig, final int contigIndex, final int start, final int stop) {
final int contigLength = getContigInfo().getSequence(contigIndex).getSequenceLength();
final int boundedStart = Math.max(1, start);
final int boundedStop = Math.min(contigLength, stop);
if ( boundedStart > contigLength || boundedStop < 1 )
// there's no meaningful way to create this genome loc, as the start and stop are off the contig
return null;
else
return createGenomeLoc(contig, contigIndex, boundedStart, boundedStop);
}
}

View File

@ -0,0 +1,186 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* A wrapper class that provides efficient most recently used caching for the global
* SAMSequenceDictionary underlying all of the GATK engine capabilities. It is essential
* that these class be as efficient as possible. It doesn't need to be thread-safe, as
* GenomeLocParser uses a thread-local variable to ensure that each thread gets its own MRU
* cache.
*
* The MRU elements are the SAMSequenceRecord, the lastContig, and the lastIndex. The
* cached value is the actual SAMSequenceRecord of the most recently accessed value from
* getSequence, along with local variables for the contig index and contig string.
*/
final class MRUCachingSAMSequenceDictionary {
/**
* Our sequence dictionary
*/
private final SAMSequenceDictionary dict;
SAMSequenceRecord lastSSR = null;
String lastContig = "";
int lastIndex = -1;
/**
* Create a new MRUCachingSAMSequenceDictionary that provides information about sequences in dict
* @param dict a non-null, non-empty sequencing dictionary
*/
@Ensures("lastSSR == null")
public MRUCachingSAMSequenceDictionary(final SAMSequenceDictionary dict) {
if ( dict == null ) throw new IllegalArgumentException("Dictionary cannot be null");
if ( dict.size() == 0 ) throw new IllegalArgumentException("Dictionary cannot have size zero");
this.dict = dict;
}
/**
* Get our sequence dictionary
* @return a non-null SAMSequenceDictionary
*/
@Ensures("result != null")
public SAMSequenceDictionary getDictionary() {
return dict;
}
/**
* Is contig present in the dictionary? Efficiently caching.
* @param contig a non-null contig we want to test
* @return true if contig is in dictionary, false otherwise
*/
@Requires("contig != null")
public final boolean hasContig(final String contig) {
return contig.equals(lastContig) || dict.getSequence(contig) != null;
}
/**
* Is contig index present in the dictionary? Efficiently caching.
* @param contigIndex an integer offset that might map to a contig in this dictionary
* @return true if contigIndex is in dictionary, false otherwise
*/
@Requires("contigIndex >= 0")
public final boolean hasContigIndex(final int contigIndex) {
return lastIndex == contigIndex || dict.getSequence(contigIndex) != null;
}
/**
* Same as SAMSequenceDictionary.getSequence but uses a MRU cache for efficiency
*
* @param contig the contig name we want to get the sequence record of
* @throws ReviewedStingException if contig isn't present in the dictionary
* @return the sequence record for contig
*/
@Requires("contig != null")
@Ensures("result != null")
public final SAMSequenceRecord getSequence(final String contig) {
if ( isCached(contig) )
return lastSSR;
else
return updateCache(contig, -1);
}
/**
* Same as SAMSequenceDictionary.getSequence but uses a MRU cache for efficiency
*
* @param index the contig index we want to get the sequence record of
* @throws ReviewedStingException if contig isn't present in the dictionary
* @return the sequence record for contig
*/
@Requires("index >= 0")
@Ensures("result != null")
public final SAMSequenceRecord getSequence(final int index) {
if ( isCached(index) )
return lastSSR;
else
return updateCache(null, index);
}
/**
* Same as SAMSequenceDictionary.getSequenceIndex but uses a MRU cache for efficiency
*
* @param contig the contig we want to get the sequence record of
* @throws ReviewedStingException if index isn't present in the dictionary
* @return the sequence record index for contig
*/
@Requires("contig != null")
@Ensures("result >= 0")
public final int getSequenceIndex(final String contig) {
if ( ! isCached(contig) ) {
updateCache(contig, -1);
}
return lastIndex;
}
/**
* Is contig the MRU cached contig?
* @param contig the contig to test
* @return true if contig is the currently cached contig, false otherwise
*/
@Requires({"contig != null"})
protected boolean isCached(final String contig) {
return contig.equals(lastContig);
}
/**
* Is the contig index index the MRU cached index?
* @param index the contig index to test
* @return true if contig index is the currently cached contig index, false otherwise
*/
protected boolean isCached(final int index) {
return lastIndex == index;
}
/**
* The key algorithm. Given a new record, update the last used record, contig
* name, and index.
*
* @param contig the contig we want to look up. If null, index is used instead
* @param index the contig index we want to look up. Only used if contig is null
* @throws ReviewedStingException if index isn't present in the dictionary
* @return the SAMSequenceRecord for contig / index
*/
@Requires("contig != null || index >= 0")
@Ensures("result != null")
private SAMSequenceRecord updateCache(final String contig, int index ) {
SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig);
if ( rec == null ) {
throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index);
} else {
lastSSR = rec;
lastContig = rec.getSequenceName();
lastIndex = rec.getSequenceIndex();
return rec;
}
}
}

View File

@ -29,17 +29,31 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.BasicFeature;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
/**
* @author aaron
* <p/>
@ -49,10 +63,11 @@ import org.testng.annotations.Test;
*/
public class GenomeLocParserUnitTest extends BaseTest {
private GenomeLocParser genomeLocParser;
private SAMFileHeader header;
@BeforeClass
public void init() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
@ -231,7 +246,16 @@ public class GenomeLocParserUnitTest extends BaseTest {
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop
assertTrue( genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end
assertTrue( genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end
assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start
}
@Test(expectedExceptions = ReviewedStingException.class)
public void testValidateGenomeLoc() {
// bad contig index
genomeLocParser.validateGenomeLoc("chr1", 1, 1, 2, false);
}
private static class FlankingGenomeLocTestData extends TestDataProvider {
@ -333,4 +357,153 @@ public class GenomeLocParserUnitTest extends BaseTest {
data.toString(), data.original, actual, data.flankStop);
assertEquals(actual, data.flankStop, description);
}
@DataProvider(name = "parseGenomeLoc")
public Object[][] makeParsingTest() {
final List<Object[]> tests = new LinkedList<Object[]>();
tests.add(new Object[]{ "chr1:10", "chr1", 10 });
tests.add(new Object[]{ "chr1:100", "chr1", 100 });
tests.add(new Object[]{ "chr1:1000", "chr1", 1000 });
tests.add(new Object[]{ "chr1:1,000", "chr1", 1000 });
tests.add(new Object[]{ "chr1:10000", "chr1", 10000 });
tests.add(new Object[]{ "chr1:10,000", "chr1", 10000 });
tests.add(new Object[]{ "chr1:100000", "chr1", 100000 });
tests.add(new Object[]{ "chr1:100,000", "chr1", 100000 });
tests.add(new Object[]{ "chr1:1000000", "chr1", 1000000 });
tests.add(new Object[]{ "chr1:1,000,000", "chr1", 1000000 });
tests.add(new Object[]{ "chr1:1000,000", "chr1", 1000000 });
tests.add(new Object[]{ "chr1:1,000000", "chr1", 1000000 });
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "parseGenomeLoc")
public void testParsingPositions(final String string, final String contig, final int start) {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10000000);
GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
final GenomeLoc loc = genomeLocParser.parseGenomeLoc(string);
Assert.assertEquals(loc.getContig(), contig);
Assert.assertEquals(loc.getStart(), start);
Assert.assertEquals(loc.getStop(), start);
}
@Test( )
public void testCreationFromSAMRecord() {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5);
final GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
Assert.assertEquals(loc.getContig(), read.getReferenceName());
Assert.assertEquals(loc.getContigIndex(), (int)read.getReferenceIndex());
Assert.assertEquals(loc.getStart(), read.getAlignmentStart());
Assert.assertEquals(loc.getStop(), read.getAlignmentEnd());
}
@Test( )
public void testCreationFromSAMRecordUnmapped() {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5);
read.setReadUnmappedFlag(true);
read.setReferenceIndex(-1);
final GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
Assert.assertTrue(loc.isUnmapped());
}
@Test( )
public void testCreationFromSAMRecordUnmappedButOnGenome() {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5);
read.setReadUnmappedFlag(true);
read.setCigarString("*");
final GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
Assert.assertEquals(loc.getContig(), read.getReferenceName());
Assert.assertEquals(loc.getContigIndex(), (int)read.getReferenceIndex());
Assert.assertEquals(loc.getStart(), read.getAlignmentStart());
Assert.assertEquals(loc.getStop(), read.getAlignmentStart());
}
@Test
public void testCreationFromFeature() {
final Feature feature = new BasicFeature("chr1", 1, 5);
final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature);
Assert.assertEquals(loc.getContig(), feature.getChr());
Assert.assertEquals(loc.getStart(), feature.getStart());
Assert.assertEquals(loc.getStop(), feature.getEnd());
}
@Test
public void testCreationFromVariantContext() {
final VariantContext feature = new VariantContextBuilder("x", "chr1", 1, 5, Arrays.asList(Allele.create("AAAAA", true))).make();
final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature);
Assert.assertEquals(loc.getContig(), feature.getChr());
Assert.assertEquals(loc.getStart(), feature.getStart());
Assert.assertEquals(loc.getStop(), feature.getEnd());
}
@Test
public void testcreateGenomeLocOnContig() throws FileNotFoundException {
final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final SAMSequenceDictionary dict = seq.getSequenceDictionary();
final GenomeLocParser genomeLocParser = new GenomeLocParser(dict);
for ( final SAMSequenceRecord rec : dict.getSequences() ) {
final GenomeLoc loc = genomeLocParser.createOverEntireContig(rec.getSequenceName());
Assert.assertEquals(loc.getContig(), rec.getSequenceName());
Assert.assertEquals(loc.getStart(), 1);
Assert.assertEquals(loc.getStop(), rec.getSequenceLength());
}
}
@DataProvider(name = "GenomeLocOnContig")
public Object[][] makeGenomeLocOnContig() {
final List<Object[]> tests = new LinkedList<Object[]>();
final int contigLength = header.getSequence(0).getSequenceLength();
for ( int start = -10; start < contigLength + 10; start++ ) {
for ( final int len : Arrays.asList(1, 10, 20) ) {
tests.add(new Object[]{ "chr1", start, start + len });
}
}
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "GenomeLocOnContig")
public void testGenomeLocOnContig(final String contig, final int start, final int stop) {
final int contigLength = header.getSequence(0).getSequenceLength();
final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, stop);
if ( stop < 1 || start > contigLength )
Assert.assertNull(loc, "GenomeLoc should be null if the start/stops are not meaningful");
else {
Assert.assertNotNull(loc);
Assert.assertEquals(loc.getContig(), contig);
Assert.assertEquals(loc.getStart(), Math.max(start, 1));
Assert.assertEquals(loc.getStop(), Math.min(stop, contigLength));
}
}
@DataProvider(name = "GenomeLocPadding")
public Object[][] makeGenomeLocPadding() {
final List<Object[]> tests = new LinkedList<Object[]>();
final int contigLength = header.getSequence(0).getSequenceLength();
for ( int pad = 0; pad < contigLength + 1; pad++) {
for ( int start = 1; start < contigLength; start++ ) {
for ( int stop = start; stop < contigLength; stop++ ) {
tests.add(new Object[]{ genomeLocParser.createGenomeLoc("chr1", start, stop), pad});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "GenomeLocPadding")
public void testGenomeLocPadding(final GenomeLoc input, final int pad) {
final int contigLength = header.getSequence(0).getSequenceLength();
final GenomeLoc padded = genomeLocParser.createPaddedGenomeLoc(input, pad);
Assert.assertNotNull(padded);
Assert.assertEquals(padded.getContig(), input.getContig());
Assert.assertEquals(padded.getStart(), Math.max(input.getStart() - pad, 1));
Assert.assertEquals(padded.getStop(), Math.min(input.getStop() + pad, contigLength));
}
}

View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.LinkedList;
import java.util.List;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
public class MRUCachingSAMSequencingDictionaryUnitTest extends BaseTest {
private static ReferenceSequenceFile seq;
private static SAMSequenceDictionary dict;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
dict = seq.getSequenceDictionary();
}
@Test
public void testBasic() {
final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict);
Assert.assertEquals(caching.getDictionary(), dict, "Dictionary not the one I expected");
for ( final SAMSequenceRecord rec : dict.getSequences() ) {
Assert.assertFalse(caching.isCached(rec.getSequenceIndex()), "Expected index to not be cached");
Assert.assertFalse(caching.isCached(rec.getSequenceName()), "Expected contig to not be cached");
Assert.assertEquals(caching.getSequence(rec.getSequenceName()), rec, "Couldn't query for sequence");
Assert.assertEquals(caching.getSequence(rec.getSequenceIndex()), rec, "Couldn't query for sequence index");
Assert.assertEquals(caching.hasContig(rec.getSequenceName()), true, "hasContig query for sequence");
Assert.assertEquals(caching.hasContigIndex(rec.getSequenceIndex()), true, "hasContigIndex query for sequence");
Assert.assertEquals(caching.getSequenceIndex(rec.getSequenceName()), rec.getSequenceIndex(), "Couldn't query for sequence");
Assert.assertEquals(caching.hasContig(rec.getSequenceName() + "asdfadsfa"), false, "hasContig query for unknown sequence");
Assert.assertEquals(caching.hasContigIndex(dict.getSequences().size()), false, "hasContigIndex query for unknown index");
Assert.assertTrue(caching.isCached(rec.getSequenceIndex()), "Expected index to be cached");
Assert.assertTrue(caching.isCached(rec.getSequenceName()), "Expected contig to be cached");
}
}
@Test(expectedExceptions = ReviewedStingException.class)
public void testBadGetSequence() {
final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict);
caching.getSequence("notInDictionary");
}
@Test(expectedExceptions = ReviewedStingException.class)
public void testBadGetSequenceIndex() {
final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict);
caching.getSequence(dict.getSequences().size());
}
}