diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java index a3b703ad3..2a1dbd277 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -193,7 +193,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { private boolean overlapsKnownCNV(VariantContext cnv) { if ( knownCNVs != null ) { - final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv, true); + final GenomeLoc loc = getWalker().getToolkit().getGenomeLocParser().createGenomeLoc(cnv); IntervalTree intervalTree = knownCNVs.get(loc.getContig()); final Iterator> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java index be689fe55..312e506a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -77,7 +77,7 @@ public class IntervalStratification extends VariantStratifier { public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { if (eval != null) { - final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval, true); + final GenomeLoc loc = getVariantEvalWalker().getToolkit().getGenomeLocParser().createGenomeLoc(eval); IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); //logger.info(String.format("Overlap %s found %s", loc, node)); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index c81e8e853..4f1b35f62 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -530,4 +530,55 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome final int cmp = this.compareTo(other); return cmp == -1 ? other : this; } + + /** + * create a new genome loc from an existing loc, with a new start position + * Note that this function will NOT explicitly check the ending offset, in case someone wants to + * set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig. + * + * @param loc the old location + * @param start a new start position + * + * @return a newly allocated GenomeLoc as loc but with start == start + */ + public GenomeLoc setStart(GenomeLoc loc, int start) { + return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); + } + + /** + * create a new genome loc from an existing loc, with a new stop position + * Note that this function will NOT explicitly check the ending offset, in case someone wants to + * set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig. + * + * @param loc the old location + * @param stop a new stop position + * + * @return a newly allocated GenomeLoc as loc but with stop == stop + */ + public GenomeLoc setStop(GenomeLoc loc, int stop) { + return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); + } + + /** + * return a new genome loc, with an incremented position + * + * @param loc the old location + * + * @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + 1 + */ + public GenomeLoc incPos(GenomeLoc loc) { + return incPos(loc, 1); + } + + /** + * return a new genome loc, with an incremented position + * + * @param loc the old location + * @param by how much to move the start and stop by + * + * @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + by + */ + public GenomeLoc incPos(GenomeLoc loc, int by) { + return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 36b23ae14..61478744d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -34,10 +34,8 @@ import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; -import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContext; /** * Factory class for creating GenomeLocs @@ -49,7 +47,9 @@ public final class GenomeLocParser { * How much validation should we do at runtime with this parser? */ public enum ValidationLevel { + /** Do the standard amount of validation */ STANDARD, + /** Don't do any real checking at all */ NONE } @@ -68,11 +68,11 @@ public final class GenomeLocParser { /** * A thread-local CachingSequenceDictionary */ - private final ThreadLocal contigInfoPerThread = - new ThreadLocal() { + private final ThreadLocal contigInfoPerThread = + new ThreadLocal() { @Override - protected CachingSequenceDictionary initialValue() { - return new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY); + protected MRUCachingSAMSequenceDictionary initialValue() { + return new MRUCachingSAMSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY); } }; @@ -84,107 +84,10 @@ public final class GenomeLocParser { /** * @return a caching sequence dictionary appropriate for this thread */ - private CachingSequenceDictionary getContigInfo() { + private MRUCachingSAMSequenceDictionary getContigInfo() { return contigInfoPerThread.get(); } - /** - * A wrapper class that provides efficient last used caching for the global - * SAMSequenceDictionary underlying all of the GATK engine capabilities. - */ - private final class CachingSequenceDictionary { - final private SAMSequenceDictionary dict; - - // cache - SAMSequenceRecord lastSSR = null; - String lastContig = ""; - int lastIndex = -1; - - @Requires({"dict != null", "dict.size() > 0"}) - public CachingSequenceDictionary(SAMSequenceDictionary dict) { - this.dict = dict; - } - - public final String internContigName(final String contig) { - return getContigInfo(contig).getSequenceName(); - } - - @Ensures("result > 0") - public final int getNSequences() { - return dict.size(); - } - - @Requires("contig != null") - public final boolean hasContig(final String contig) { - return contig.equals(lastContig) || dict.getSequence(contig) != null; - } - - @Requires("index >= 0") - public final boolean hasContig(final int index) { - return lastIndex == index || dict.getSequence(index) != null; - } - - @Requires("contig != null") - @Ensures("result != null") - public final SAMSequenceRecord getSequence(final String contig) { - if ( isCached(contig) ) - return lastSSR; - else - return updateCache(contig, -1); - } - - @Requires("index >= 0") - @Ensures("result != null") - public final SAMSequenceRecord getSequence(final int index) { - if ( isCached(index) ) - return lastSSR; - else - return updateCache(null, index); - } - - @Requires("contig != null") - @Ensures("result >= 0") - public final int getSequenceIndex(final String contig) { - if ( ! isCached(contig) ) { - updateCache(contig, -1); - } - - return lastIndex; - } - - @Requires({"contig != null", "lastContig != null"}) - private boolean isCached(final String contig) { - return lastContig.equals(contig); - } - - @Requires({"lastIndex != -1", "index >= 0"}) - private boolean isCached(final int index) { - return lastIndex == index; - } - - /** - * The key algorithm. Given a new record, update the last used record, contig - * name, and index. - * - * @param contig - * @param index - * @return - */ - @Requires("contig != null || index >= 0") - @Ensures("result != null") - private SAMSequenceRecord updateCache(final String contig, int index ) { - SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); - if ( rec == null ) { - throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); - } else { - lastSSR = rec; - lastContig = rec.getSequenceName(); - lastIndex = rec.getSequenceIndex(); - return rec; - } - } - } - /** * set our internal reference contig order * @param refFile the reference file @@ -205,16 +108,18 @@ public final class GenomeLocParser { /** * Create a genome loc parser based on seqDict with the specified level of validation * @param seqDict the sequence dictionary to use when creating genome locs - * @param validationLevel how much validation should we do of the genome locs at runtime? + * @param validationLevel how much validation should we do of the genome locs at runtime? Purely for testing purposes */ - public GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) { - this.validationLevel = validationLevel; + protected GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) { + if (validationLevel == null) + throw new IllegalArgumentException("validation level cannot be null"); if (seqDict == null) { // we couldn't load the reference dictionary //logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs"); throw new UserException.CommandLineException("Failed to load reference dictionary"); } - SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict; + this.validationLevel = validationLevel; + this.SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict; if ( logger.isDebugEnabled() ) { logger.debug(String.format("Prepared reference sequence contig dictionary")); for (SAMSequenceRecord contig : seqDict.getSequences()) { @@ -227,17 +132,13 @@ public final class GenomeLocParser { * Determines whether the given contig is valid with respect to the sequence dictionary * already installed in the GenomeLoc. * + * @param contig a potentially null string name for the contig * @return True if the contig is valid. False otherwise. */ - public final boolean contigIsInDictionary(String contig) { + public final boolean contigIsInDictionary(final String contig) { return contig != null && getContigInfo().hasContig(contig); } - public final boolean indexIsInDictionary(final int index) { - return index >= 0 && getContigInfo().hasContig(index); - } - - /** * get the contig's SAMSequenceRecord * @@ -278,7 +179,7 @@ public final class GenomeLocParser { * @return */ public final SAMSequenceDictionary getContigs() { - return getContigInfo().dict; + return getContigInfo().getDictionary(); } // -------------------------------------------------------------------------------------------------------------- @@ -286,14 +187,13 @@ public final class GenomeLocParser { // Low-level creation functions // // -------------------------------------------------------------------------------------------------------------- + /** - * create a genome loc, given the contig name, start, and stop + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. * - * @param contig the contig name - * @param start the starting position - * @param stop the stop position - * - * @return a new genome loc + * Note that because this function doesn't take the contig index as an argument for contig, it + * has a slight performance penalty over the version that does take the contig index. Does not + * require the created genome loc on the reference genome */ @Ensures("result != null") @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop)"}) @@ -301,34 +201,61 @@ public final class GenomeLocParser { return createGenomeLoc(contig, getContigIndex(contig), start, stop); } - public GenomeLoc createGenomeLoc(String contig, final int start, final int stop, boolean mustBeOnReference) { + /** + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. + * + * Note that because this function doesn't take the contig index as an argument for contig, it + * has a slight performance penalty over the version that does take the contig index. + */ + public GenomeLoc createGenomeLoc(final String contig, final int start, final int stop, boolean mustBeOnReference) { return createGenomeLoc(contig, getContigIndex(contig), start, stop, mustBeOnReference); } + /** + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. + * + * Doesn't require the start and stop to be on the genome + */ @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop, false)"}) public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop) { return createGenomeLoc(contig, index, start, stop, false); } + /** + * Create a GenomeLoc on contig, starting at start and ending (inclusive) at stop. + * + * @param contig the contig name + * @param index the index into the GATK's SAMSequencingDictionary of contig (passed for efficiency to avoid the lookup) + * @param start the starting position + * @param stop the stop position of this loc, inclusive + * @param mustBeOnReference if true, this factory will throw a UserException.MalformedGenomeLoc if start or stop isn't on the contig + * + * @return a non-null GenomeLoc + */ @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"}) - public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) { + @Ensures("result != null") + public GenomeLoc createGenomeLoc(final String contig, int index, final int start, final int stop, boolean mustBeOnReference) { // optimization: by interning the string we ensure that future comparisons use == not the full string comp final String interned = validateGenomeLoc(contig, index, start, stop, mustBeOnReference); return new GenomeLoc(interned, index, start, stop); } /** - * Create a new genome loc, bounding start and stop by the start and end of contig - * @param contig our contig - * @param start our start as an arbitrary integer (may be negative, etc) - * @param stop our stop as an arbitrary integer (may be negative, etc) - * @throws ReviewedStingException if there's no way to create a meaningful genome loc given contig, start, and stop - * @return a valid genome loc over contig + * Create a new GenomeLoc, on contig, including the single position pos. + * + * Pos is not required to be on the reference + * + * @see #createGenomeLoc(String, int, int, int, boolean) for exact details of the creation. + * + * @param contig the contig name + * @param pos the start and stop of the created genome loc + * + * @return a genome loc representing a single base at the specified postion on the contig */ - public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { - final GenomeLoc myLoc = createGenomeLoc(contig, start, stop); - final GenomeLoc contigLoc = createOverEntireContig(contig); - return myLoc.intersect(contigLoc); + @Ensures("result != null") + @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"}) + public GenomeLoc createGenomeLoc(final String contig, final int pos) { + return createGenomeLoc(contig, getContigIndex(contig), pos, pos); } /** @@ -472,7 +399,7 @@ public final class GenomeLocParser { */ @Requires("pos != null") @Ensures("result >= 0") - private int parsePosition(final String pos) { + protected int parsePosition(final String pos) { if(pos.indexOf('-') != -1) { throw new NumberFormatException("Position: '" + pos + "' can't contain '-'." ); } @@ -533,89 +460,34 @@ public final class GenomeLocParser { } /** - * Creates a GenomeLoc corresponding to the variant context vc. If includeSymbolicEndIfPossible - * is true, and VC is a symbolic allele the end of the created genome loc will be the value - * of the END info field key, if it exists, or vc.getEnd() if not. - * - * @param vc - * @param includeSymbolicEndIfPossible - * @return + * @see GenomeLoc.setStart */ - public GenomeLoc createGenomeLoc(final VariantContext vc, boolean includeSymbolicEndIfPossible) { - if ( includeSymbolicEndIfPossible && vc.isSymbolic() ) { - int end = vc.getAttributeAsInt(VCFConstants.END_KEY, vc.getEnd()); - return createGenomeLoc(vc.getChr(), vc.getStart(), end); - } - else - return createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd()); - } - - public GenomeLoc createGenomeLoc(final VariantContext vc) { - return createGenomeLoc(vc, false); - } - - /** - * create a new genome loc, given the contig name, and a single position. Must be on the reference - * - * @param contig the contig name - * @param pos the postion - * - * @return a genome loc representing a single base at the specified postion on the contig - */ - @Ensures("result != null") - @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, pos, pos, true)"}) - public GenomeLoc createGenomeLoc(final String contig, final int pos) { - return createGenomeLoc(contig, getContigIndex(contig), pos, pos); - } - - /** - * create a new genome loc from an existing loc, with a new start position - * Note that this function will NOT explicitly check the ending offset, in case someone wants to - * set the start of a new GenomeLoc pertaining to a read that goes off the end of the contig. - * - * @param loc the old location - * @param start a new start position - * - * @return the newly created genome loc - */ - public GenomeLoc setStart(GenomeLoc loc, int start) { + @Deprecated + public GenomeLoc setStart(final GenomeLoc loc, final int start) { return createGenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); } /** - * create a new genome loc from an existing loc, with a new stop position - * Note that this function will NOT explicitly check the ending offset, in case someone wants to - * set the stop of a new GenomeLoc pertaining to a read that goes off the end of the contig. - * - * @param loc the old location - * @param stop a new stop position - * - * @return + * @see GenomeLoc.setStop */ - public GenomeLoc setStop(GenomeLoc loc, int stop) { + @Deprecated + public GenomeLoc setStop(final GenomeLoc loc, final int stop) { return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); } /** - * return a new genome loc, with an incremented position - * - * @param loc the old location - * - * @return a new genome loc + * @see GenomeLoc.incPos */ - public GenomeLoc incPos(GenomeLoc loc) { + @Deprecated + public GenomeLoc incPos(final GenomeLoc loc) { return incPos(loc, 1); } /** - * return a new genome loc, with an incremented position - * - * @param loc the old location - * @param by how much to move the start and stop by - * - * @return a new genome loc + * @see GenomeLoc.incPos */ - public GenomeLoc incPos(GenomeLoc loc, int by) { + @Deprecated + public GenomeLoc incPos(final GenomeLoc loc, final int by) { return createGenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by); } @@ -626,7 +498,7 @@ public final class GenomeLocParser { */ @Requires("contigName != null") @Ensures("result != null") - public GenomeLoc createOverEntireContig(String contigName) { + public GenomeLoc createOverEntireContig(final String contigName) { SAMSequenceRecord contig = getContigInfo().getSequence(contigName); return createGenomeLoc(contigName,contig.getSequenceIndex(),1,contig.getSequenceLength(), true); } @@ -638,12 +510,12 @@ public final class GenomeLocParser { * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the start of the contig. */ @Requires({"loc != null", "maxBasePairs > 0"}) - public GenomeLoc createGenomeLocAtStart(GenomeLoc loc, int maxBasePairs) { + public GenomeLoc createGenomeLocAtStart(final GenomeLoc loc, final int maxBasePairs) { if (GenomeLoc.isUnmapped(loc)) return null; - String contigName = loc.getContig(); - SAMSequenceRecord contig = getContigInfo().getSequence(contigName); - int contigIndex = contig.getSequenceIndex(); + final String contigName = loc.getContig(); + final SAMSequenceRecord contig = getContigInfo().getSequence(contigName); + final int contigIndex = contig.getSequenceIndex(); int start = loc.getStart() - maxBasePairs; int stop = loc.getStart() - 1; @@ -662,19 +534,12 @@ public final class GenomeLocParser { * @param padding The number of base pairs to pad on either end * @return The contiguous loc of length up to the original length + 2*padding (depending on the start/end of the contig). */ - @Requires({"loc != null", "padding > 0"}) + @Requires({"loc != null", "padding >= 0"}) public GenomeLoc createPaddedGenomeLoc(final GenomeLoc loc, final int padding) { - if (GenomeLoc.isUnmapped(loc)) + if (GenomeLoc.isUnmapped(loc) || padding == 0) return loc; - final String contigName = loc.getContig(); - final SAMSequenceRecord contig = getContigInfo().getSequence(contigName); - final int contigIndex = contig.getSequenceIndex(); - final int contigLength = contig.getSequenceLength(); - - final int start = Math.max(1, loc.getStart() - padding); - final int stop = Math.min(contigLength, loc.getStop() + padding); - - return createGenomeLoc(contigName, contigIndex, start, stop, true); + else + return createGenomeLocOnContig(loc.getContig(), loc.getContigIndex(), loc.getStart() - padding, loc.getStop() + padding); } /** @@ -684,7 +549,7 @@ public final class GenomeLocParser { * @return The contiguous loc of up to maxBasePairs length or null if the loc is already at the end of the contig. */ @Requires({"loc != null", "maxBasePairs > 0"}) - public GenomeLoc createGenomeLocAtStop(GenomeLoc loc, int maxBasePairs) { + public GenomeLoc createGenomeLocAtStop(final GenomeLoc loc, final int maxBasePairs) { if (GenomeLoc.isUnmapped(loc)) return null; String contigName = loc.getContig(); @@ -702,4 +567,35 @@ public final class GenomeLocParser { return createGenomeLoc(contigName, contigIndex, start, stop, true); } + + /** + * @see #createGenomeLocOnContig(String, int, int, int) with the contig index looked up from contig + */ + public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { + return createGenomeLocOnContig(contig, getContigIndex(contig), start, stop); + } + + /** + * Create a new genome loc, bounding start and stop by the start and end of contig + * + * This function will return null if start and stop cannot be adjusted in any reasonable way + * to be on the contig. For example, if start and stop are both past the end of the contig, + * there's no way to fix this, and null will be returned. + * + * @param contig our contig + * @param start our start as an arbitrary integer (may be negative, etc) + * @param stop our stop as an arbitrary integer (may be negative, etc) + * @return a valid genome loc over contig, or null if a meaningful genome loc cannot be created + */ + public GenomeLoc createGenomeLocOnContig(final String contig, final int contigIndex, final int start, final int stop) { + final int contigLength = getContigInfo().getSequence(contigIndex).getSequenceLength(); + final int boundedStart = Math.max(1, start); + final int boundedStop = Math.min(contigLength, stop); + + if ( boundedStart > contigLength || boundedStop < 1 ) + // there's no meaningful way to create this genome loc, as the start and stop are off the contig + return null; + else + return createGenomeLoc(contig, contigIndex, boundedStart, boundedStop); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java b/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java new file mode 100644 index 000000000..c11aeb730 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/MRUCachingSAMSequenceDictionary.java @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * A wrapper class that provides efficient most recently used caching for the global + * SAMSequenceDictionary underlying all of the GATK engine capabilities. It is essential + * that these class be as efficient as possible. It doesn't need to be thread-safe, as + * GenomeLocParser uses a thread-local variable to ensure that each thread gets its own MRU + * cache. + * + * The MRU elements are the SAMSequenceRecord, the lastContig, and the lastIndex. The + * cached value is the actual SAMSequenceRecord of the most recently accessed value from + * getSequence, along with local variables for the contig index and contig string. + */ +final class MRUCachingSAMSequenceDictionary { + /** + * Our sequence dictionary + */ + private final SAMSequenceDictionary dict; + + SAMSequenceRecord lastSSR = null; + String lastContig = ""; + int lastIndex = -1; + + /** + * Create a new MRUCachingSAMSequenceDictionary that provides information about sequences in dict + * @param dict a non-null, non-empty sequencing dictionary + */ + @Ensures("lastSSR == null") + public MRUCachingSAMSequenceDictionary(final SAMSequenceDictionary dict) { + if ( dict == null ) throw new IllegalArgumentException("Dictionary cannot be null"); + if ( dict.size() == 0 ) throw new IllegalArgumentException("Dictionary cannot have size zero"); + + this.dict = dict; + } + + /** + * Get our sequence dictionary + * @return a non-null SAMSequenceDictionary + */ + @Ensures("result != null") + public SAMSequenceDictionary getDictionary() { + return dict; + } + + /** + * Is contig present in the dictionary? Efficiently caching. + * @param contig a non-null contig we want to test + * @return true if contig is in dictionary, false otherwise + */ + @Requires("contig != null") + public final boolean hasContig(final String contig) { + return contig.equals(lastContig) || dict.getSequence(contig) != null; + } + + /** + * Is contig index present in the dictionary? Efficiently caching. + * @param contigIndex an integer offset that might map to a contig in this dictionary + * @return true if contigIndex is in dictionary, false otherwise + */ + @Requires("contigIndex >= 0") + public final boolean hasContigIndex(final int contigIndex) { + return lastIndex == contigIndex || dict.getSequence(contigIndex) != null; + } + + /** + * Same as SAMSequenceDictionary.getSequence but uses a MRU cache for efficiency + * + * @param contig the contig name we want to get the sequence record of + * @throws ReviewedStingException if contig isn't present in the dictionary + * @return the sequence record for contig + */ + @Requires("contig != null") + @Ensures("result != null") + public final SAMSequenceRecord getSequence(final String contig) { + if ( isCached(contig) ) + return lastSSR; + else + return updateCache(contig, -1); + } + + /** + * Same as SAMSequenceDictionary.getSequence but uses a MRU cache for efficiency + * + * @param index the contig index we want to get the sequence record of + * @throws ReviewedStingException if contig isn't present in the dictionary + * @return the sequence record for contig + */ + @Requires("index >= 0") + @Ensures("result != null") + public final SAMSequenceRecord getSequence(final int index) { + if ( isCached(index) ) + return lastSSR; + else + return updateCache(null, index); + } + + /** + * Same as SAMSequenceDictionary.getSequenceIndex but uses a MRU cache for efficiency + * + * @param contig the contig we want to get the sequence record of + * @throws ReviewedStingException if index isn't present in the dictionary + * @return the sequence record index for contig + */ + @Requires("contig != null") + @Ensures("result >= 0") + public final int getSequenceIndex(final String contig) { + if ( ! isCached(contig) ) { + updateCache(contig, -1); + } + + return lastIndex; + } + + /** + * Is contig the MRU cached contig? + * @param contig the contig to test + * @return true if contig is the currently cached contig, false otherwise + */ + @Requires({"contig != null"}) + protected boolean isCached(final String contig) { + return contig.equals(lastContig); + } + + /** + * Is the contig index index the MRU cached index? + * @param index the contig index to test + * @return true if contig index is the currently cached contig index, false otherwise + */ + protected boolean isCached(final int index) { + return lastIndex == index; + } + + /** + * The key algorithm. Given a new record, update the last used record, contig + * name, and index. + * + * @param contig the contig we want to look up. If null, index is used instead + * @param index the contig index we want to look up. Only used if contig is null + * @throws ReviewedStingException if index isn't present in the dictionary + * @return the SAMSequenceRecord for contig / index + */ + @Requires("contig != null || index >= 0") + @Ensures("result != null") + private SAMSequenceRecord updateCache(final String contig, int index ) { + SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); + if ( rec == null ) { + throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); + } else { + lastSSR = rec; + lastContig = rec.getSequenceName(); + lastIndex = rec.getSequenceIndex(); + return rec; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index 4a989b984..9621aecda 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -29,17 +29,31 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.BasicFeature; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + /** * @author aaron *

@@ -49,10 +63,11 @@ import org.testng.annotations.Test; */ public class GenomeLocParserUnitTest extends BaseTest { private GenomeLocParser genomeLocParser; + private SAMFileHeader header; @BeforeClass public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } @@ -231,7 +246,16 @@ public class GenomeLocParserUnitTest extends BaseTest { assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,11)); // past the end of the contig assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",-1,10)); // bad start assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",1,-2)); // bad stop + assertTrue( genomeLocParser.isValidGenomeLoc("chr1",-1,2, false)); // bad stop assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",10,11)); // bad start, past end + assertTrue( genomeLocParser.isValidGenomeLoc("chr1",10,11, false)); // bad start, past end + assertTrue(!genomeLocParser.isValidGenomeLoc("chr1",2,1)); // stop < start + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testValidateGenomeLoc() { + // bad contig index + genomeLocParser.validateGenomeLoc("chr1", 1, 1, 2, false); } private static class FlankingGenomeLocTestData extends TestDataProvider { @@ -333,4 +357,153 @@ public class GenomeLocParserUnitTest extends BaseTest { data.toString(), data.original, actual, data.flankStop); assertEquals(actual, data.flankStop, description); } + + @DataProvider(name = "parseGenomeLoc") + public Object[][] makeParsingTest() { + final List tests = new LinkedList(); + + tests.add(new Object[]{ "chr1:10", "chr1", 10 }); + tests.add(new Object[]{ "chr1:100", "chr1", 100 }); + tests.add(new Object[]{ "chr1:1000", "chr1", 1000 }); + tests.add(new Object[]{ "chr1:1,000", "chr1", 1000 }); + tests.add(new Object[]{ "chr1:10000", "chr1", 10000 }); + tests.add(new Object[]{ "chr1:10,000", "chr1", 10000 }); + tests.add(new Object[]{ "chr1:100000", "chr1", 100000 }); + tests.add(new Object[]{ "chr1:100,000", "chr1", 100000 }); + tests.add(new Object[]{ "chr1:1000000", "chr1", 1000000 }); + tests.add(new Object[]{ "chr1:1,000,000", "chr1", 1000000 }); + tests.add(new Object[]{ "chr1:1000,000", "chr1", 1000000 }); + tests.add(new Object[]{ "chr1:1,000000", "chr1", 1000000 }); + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "parseGenomeLoc") + public void testParsingPositions(final String string, final String contig, final int start) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10000000); + GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final GenomeLoc loc = genomeLocParser.parseGenomeLoc(string); + Assert.assertEquals(loc.getContig(), contig); + Assert.assertEquals(loc.getStart(), start); + Assert.assertEquals(loc.getStop(), start); + } + + @Test( ) + public void testCreationFromSAMRecord() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + Assert.assertEquals(loc.getContig(), read.getReferenceName()); + Assert.assertEquals(loc.getContigIndex(), (int)read.getReferenceIndex()); + Assert.assertEquals(loc.getStart(), read.getAlignmentStart()); + Assert.assertEquals(loc.getStop(), read.getAlignmentEnd()); + } + + @Test( ) + public void testCreationFromSAMRecordUnmapped() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5); + read.setReadUnmappedFlag(true); + read.setReferenceIndex(-1); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + Assert.assertTrue(loc.isUnmapped()); + } + + @Test( ) + public void testCreationFromSAMRecordUnmappedButOnGenome() { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5); + read.setReadUnmappedFlag(true); + read.setCigarString("*"); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + Assert.assertEquals(loc.getContig(), read.getReferenceName()); + Assert.assertEquals(loc.getContigIndex(), (int)read.getReferenceIndex()); + Assert.assertEquals(loc.getStart(), read.getAlignmentStart()); + Assert.assertEquals(loc.getStop(), read.getAlignmentStart()); + } + + @Test + public void testCreationFromFeature() { + final Feature feature = new BasicFeature("chr1", 1, 5); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature); + Assert.assertEquals(loc.getContig(), feature.getChr()); + Assert.assertEquals(loc.getStart(), feature.getStart()); + Assert.assertEquals(loc.getStop(), feature.getEnd()); + } + + @Test + public void testCreationFromVariantContext() { + final VariantContext feature = new VariantContextBuilder("x", "chr1", 1, 5, Arrays.asList(Allele.create("AAAAA", true))).make(); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature); + Assert.assertEquals(loc.getContig(), feature.getChr()); + Assert.assertEquals(loc.getStart(), feature.getStart()); + Assert.assertEquals(loc.getStop(), feature.getEnd()); + } + + @Test + public void testcreateGenomeLocOnContig() throws FileNotFoundException { + final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMSequenceDictionary dict = seq.getSequenceDictionary(); + final GenomeLocParser genomeLocParser = new GenomeLocParser(dict); + + for ( final SAMSequenceRecord rec : dict.getSequences() ) { + final GenomeLoc loc = genomeLocParser.createOverEntireContig(rec.getSequenceName()); + Assert.assertEquals(loc.getContig(), rec.getSequenceName()); + Assert.assertEquals(loc.getStart(), 1); + Assert.assertEquals(loc.getStop(), rec.getSequenceLength()); + } + } + + @DataProvider(name = "GenomeLocOnContig") + public Object[][] makeGenomeLocOnContig() { + final List tests = new LinkedList(); + + final int contigLength = header.getSequence(0).getSequenceLength(); + for ( int start = -10; start < contigLength + 10; start++ ) { + for ( final int len : Arrays.asList(1, 10, 20) ) { + tests.add(new Object[]{ "chr1", start, start + len }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "GenomeLocOnContig") + public void testGenomeLocOnContig(final String contig, final int start, final int stop) { + final int contigLength = header.getSequence(0).getSequenceLength(); + final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, stop); + + if ( stop < 1 || start > contigLength ) + Assert.assertNull(loc, "GenomeLoc should be null if the start/stops are not meaningful"); + else { + Assert.assertNotNull(loc); + Assert.assertEquals(loc.getContig(), contig); + Assert.assertEquals(loc.getStart(), Math.max(start, 1)); + Assert.assertEquals(loc.getStop(), Math.min(stop, contigLength)); + } + } + + @DataProvider(name = "GenomeLocPadding") + public Object[][] makeGenomeLocPadding() { + final List tests = new LinkedList(); + + final int contigLength = header.getSequence(0).getSequenceLength(); + for ( int pad = 0; pad < contigLength + 1; pad++) { + for ( int start = 1; start < contigLength; start++ ) { + for ( int stop = start; stop < contigLength; stop++ ) { + tests.add(new Object[]{ genomeLocParser.createGenomeLoc("chr1", start, stop), pad}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "GenomeLocPadding") + public void testGenomeLocPadding(final GenomeLoc input, final int pad) { + final int contigLength = header.getSequence(0).getSequenceLength(); + final GenomeLoc padded = genomeLocParser.createPaddedGenomeLoc(input, pad); + + Assert.assertNotNull(padded); + Assert.assertEquals(padded.getContig(), input.getContig()); + Assert.assertEquals(padded.getStart(), Math.max(input.getStart() - pad, 1)); + Assert.assertEquals(padded.getStop(), Math.min(input.getStop() + pad, contigLength)); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java new file mode 100644 index 000000000..7a5fcf0c2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/MRUCachingSAMSequencingDictionaryUnitTest.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.LinkedList; +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class MRUCachingSAMSequencingDictionaryUnitTest extends BaseTest { + private static ReferenceSequenceFile seq; + private static SAMSequenceDictionary dict; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + dict = seq.getSequenceDictionary(); + } + + @Test + public void testBasic() { + final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict); + + Assert.assertEquals(caching.getDictionary(), dict, "Dictionary not the one I expected"); + + for ( final SAMSequenceRecord rec : dict.getSequences() ) { + Assert.assertFalse(caching.isCached(rec.getSequenceIndex()), "Expected index to not be cached"); + Assert.assertFalse(caching.isCached(rec.getSequenceName()), "Expected contig to not be cached"); + + Assert.assertEquals(caching.getSequence(rec.getSequenceName()), rec, "Couldn't query for sequence"); + Assert.assertEquals(caching.getSequence(rec.getSequenceIndex()), rec, "Couldn't query for sequence index"); + Assert.assertEquals(caching.hasContig(rec.getSequenceName()), true, "hasContig query for sequence"); + Assert.assertEquals(caching.hasContigIndex(rec.getSequenceIndex()), true, "hasContigIndex query for sequence"); + Assert.assertEquals(caching.getSequenceIndex(rec.getSequenceName()), rec.getSequenceIndex(), "Couldn't query for sequence"); + + Assert.assertEquals(caching.hasContig(rec.getSequenceName() + "asdfadsfa"), false, "hasContig query for unknown sequence"); + Assert.assertEquals(caching.hasContigIndex(dict.getSequences().size()), false, "hasContigIndex query for unknown index"); + + Assert.assertTrue(caching.isCached(rec.getSequenceIndex()), "Expected index to be cached"); + Assert.assertTrue(caching.isCached(rec.getSequenceName()), "Expected contig to be cached"); + } + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testBadGetSequence() { + final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict); + caching.getSequence("notInDictionary"); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testBadGetSequenceIndex() { + final MRUCachingSAMSequenceDictionary caching = new MRUCachingSAMSequenceDictionary(dict); + caching.getSequence(dict.getSequences().size()); + } +} \ No newline at end of file