From 8562bfaae1a028a2966d5cbd3aff78a88b7ac4dc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 29 Jan 2013 08:10:56 -0500 Subject: [PATCH] Optimize GenomeLocParser.createGenomeLoc -- The new version is roughly 2x faster than the previous version. The key here was to cleanup the workflow for validateGenomeLoc and remove the now unnecessary synchronization blocks from the CachingSequencingDictionary, since these are now thread local variables -- #resolves https://jira.broadinstitute.org/browse/GSA-724 --- .../sting/utils/GenomeLocParser.java | 169 ++++++++++++------ .../sting/utils/GenomeLocParserBenchmark.java | 81 +++++++++ 2 files changed, 191 insertions(+), 59 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index e70182acf..36b23ae14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -45,6 +45,14 @@ import org.broadinstitute.variant.variantcontext.VariantContext; public final class GenomeLocParser { private static Logger logger = Logger.getLogger(GenomeLocParser.class); + /** + * How much validation should we do at runtime with this parser? + */ + public enum ValidationLevel { + STANDARD, + NONE + } + // -------------------------------------------------------------------------------------------------------------- // // Ugly global variable defining the optional ordering of contig elements @@ -58,22 +66,25 @@ public final class GenomeLocParser { final private SAMSequenceDictionary SINGLE_MASTER_SEQUENCE_DICTIONARY; /** - * A thread-local caching contig info + * A thread-local CachingSequenceDictionary */ private final ThreadLocal contigInfoPerThread = - new ThreadLocal(); + new ThreadLocal() { + @Override + protected CachingSequenceDictionary initialValue() { + return new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY); + } + }; + + /** + * How much validation are we doing at runtime with this GenomeLocParser? + */ + private final ValidationLevel validationLevel; /** * @return a caching sequence dictionary appropriate for this thread */ private CachingSequenceDictionary getContigInfo() { - if ( contigInfoPerThread.get() == null ) { - // initialize for this thread - contigInfoPerThread.set(new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY)); - } - - assert contigInfoPerThread.get() != null; - return contigInfoPerThread.get(); } @@ -94,24 +105,28 @@ public final class GenomeLocParser { this.dict = dict; } + public final String internContigName(final String contig) { + return getContigInfo(contig).getSequenceName(); + } + @Ensures("result > 0") public final int getNSequences() { return dict.size(); } @Requires("contig != null") - public final synchronized boolean hasContig(final String contig) { + public final boolean hasContig(final String contig) { return contig.equals(lastContig) || dict.getSequence(contig) != null; } @Requires("index >= 0") - public final synchronized boolean hasContig(final int index) { + public final boolean hasContig(final int index) { return lastIndex == index || dict.getSequence(index) != null; } @Requires("contig != null") @Ensures("result != null") - public synchronized final SAMSequenceRecord getSequence(final String contig) { + public final SAMSequenceRecord getSequence(final String contig) { if ( isCached(contig) ) return lastSSR; else @@ -120,7 +135,7 @@ public final class GenomeLocParser { @Requires("index >= 0") @Ensures("result != null") - public synchronized final SAMSequenceRecord getSequence(final int index) { + public final SAMSequenceRecord getSequence(final int index) { if ( isCached(index) ) return lastSSR; else @@ -129,7 +144,7 @@ public final class GenomeLocParser { @Requires("contig != null") @Ensures("result >= 0") - public synchronized final int getSequenceIndex(final String contig) { + public final int getSequenceIndex(final String contig) { if ( ! isCached(contig) ) { updateCache(contig, -1); } @@ -138,12 +153,12 @@ public final class GenomeLocParser { } @Requires({"contig != null", "lastContig != null"}) - private synchronized boolean isCached(final String contig) { + private boolean isCached(final String contig) { return lastContig.equals(contig); } @Requires({"lastIndex != -1", "index >= 0"}) - private synchronized boolean isCached(final int index) { + private boolean isCached(final int index) { return lastIndex == index; } @@ -157,7 +172,7 @@ public final class GenomeLocParser { */ @Requires("contig != null || index >= 0") @Ensures("result != null") - private synchronized SAMSequenceRecord updateCache(final String contig, int index ) { + private SAMSequenceRecord updateCache(final String contig, int index ) { SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); if ( rec == null ) { throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); @@ -168,8 +183,6 @@ public final class GenomeLocParser { return rec; } } - - } /** @@ -181,16 +194,32 @@ public final class GenomeLocParser { this(refFile.getSequenceDictionary()); } + /** + * Create a new GenomeLocParser based on seqDictionary with the standard validation level + * @param seqDict a non-null sequence dictionary + */ public GenomeLocParser(SAMSequenceDictionary seqDict) { + this(seqDict, ValidationLevel.STANDARD); + } + + /** + * Create a genome loc parser based on seqDict with the specified level of validation + * @param seqDict the sequence dictionary to use when creating genome locs + * @param validationLevel how much validation should we do of the genome locs at runtime? + */ + public GenomeLocParser(SAMSequenceDictionary seqDict, final ValidationLevel validationLevel) { + this.validationLevel = validationLevel; if (seqDict == null) { // we couldn't load the reference dictionary //logger.info("Failed to load reference dictionary, falling back to lexicographic order for contigs"); throw new UserException.CommandLineException("Failed to load reference dictionary"); } SINGLE_MASTER_SEQUENCE_DICTIONARY = seqDict; - logger.debug(String.format("Prepared reference sequence contig dictionary")); - for (SAMSequenceRecord contig : seqDict.getSequences()) { - logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + if ( logger.isDebugEnabled() ) { + logger.debug(String.format("Prepared reference sequence contig dictionary")); + for (SAMSequenceRecord contig : seqDict.getSequences()) { + logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + } } } @@ -283,13 +312,23 @@ public final class GenomeLocParser { @ThrowEnsures({"UserException.MalformedGenomeLoc", "!isValidGenomeLoc(contig, start, stop,mustBeOnReference)"}) public GenomeLoc createGenomeLoc(String contig, int index, final int start, final int stop, boolean mustBeOnReference) { - validateGenomeLoc(contig, index, start, stop, mustBeOnReference, true); - return new GenomeLoc(contig, index, start, stop); + // optimization: by interning the string we ensure that future comparisons use == not the full string comp + final String interned = validateGenomeLoc(contig, index, start, stop, mustBeOnReference); + return new GenomeLoc(interned, index, start, stop); } + /** + * Create a new genome loc, bounding start and stop by the start and end of contig + * @param contig our contig + * @param start our start as an arbitrary integer (may be negative, etc) + * @param stop our stop as an arbitrary integer (may be negative, etc) + * @throws ReviewedStingException if there's no way to create a meaningful genome loc given contig, start, and stop + * @return a valid genome loc over contig + */ public GenomeLoc createGenomeLocOnContig(final String contig, final int start, final int stop) { - GenomeLoc contigLoc = createOverEntireContig(contig); - return new GenomeLoc(contig, getContigIndex(contig), start, stop).intersect(contigLoc); + final GenomeLoc myLoc = createGenomeLoc(contig, start, stop); + final GenomeLoc contigLoc = createOverEntireContig(contig); + return myLoc.intersect(contigLoc); } /** @@ -306,50 +345,62 @@ public final class GenomeLocParser { * @param start the start position * @param stop the stop position * - * @return true if it's valid, false otherwise. If exceptOnError, then throws a UserException if invalid + * @return the interned contig name, an optimization that ensures that contig == the string in the sequence dictionary */ - private boolean validateGenomeLoc(String contig, int contigIndex, int start, int stop, boolean mustBeOnReference, boolean exceptOnError) { - if ( ! getContigInfo().hasContig(contig) ) - return vglHelper(exceptOnError, String.format("Unknown contig %s", contig)); + protected String validateGenomeLoc(final String contig, final int contigIndex, final int start, final int stop, final boolean mustBeOnReference) { + if ( validationLevel == ValidationLevel.NONE ) + return contig; + else { + if (stop < start) + vglHelper(String.format("The stop position %d is less than start %d in contig %s", stop, start, contig)); - if (stop < start) - return vglHelper(exceptOnError, String.format("The stop position %d is less than start %d in contig %s", stop, start, contig)); + final SAMSequenceRecord contigInfo = getContigInfo().getSequence(contig); + if ( contigInfo.getSequenceIndex() != contigIndex ) + vglHelper(String.format("The contig index %d is bad, doesn't equal the contig index %d of the contig from a string %s", + contigIndex, contigInfo.getSequenceIndex(), contig)); - if (contigIndex < 0) - return vglHelper(exceptOnError, String.format("The contig index %d is less than 0", contigIndex)); + if ( mustBeOnReference ) { + if (start < 1) + vglHelper(String.format("The start position %d is less than 1", start)); - if (contigIndex >= getContigInfo().getNSequences()) - return vglHelper(exceptOnError, String.format("The contig index %d is greater than the stored sequence count (%d)", contigIndex, getContigInfo().getNSequences())); + if (stop < 1) + vglHelper(String.format("The stop position %d is less than 1", stop)); - if ( mustBeOnReference ) { - if (start < 1) - return vglHelper(exceptOnError, String.format("The start position %d is less than 1", start)); + final int contigSize = contigInfo.getSequenceLength(); + if (start > contigSize || stop > contigSize) + vglHelper(String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize)); + } - if (stop < 1) - return vglHelper(exceptOnError, String.format("The stop position %d is less than 1", stop)); - - int contigSize = getContigInfo().getSequence(contigIndex).getSequenceLength(); - if (start > contigSize || stop > contigSize) - return vglHelper(exceptOnError, String.format("The genome loc coordinates %d-%d exceed the contig size (%d)", start, stop, contigSize)); + return contigInfo.getSequenceName(); } - - // we passed - return true; } + /** + * Would a genome loc created with the given parameters be valid w.r.t. the master sequence dictionary? + * @param contig the contig we'd use + * @param start the start position + * @param stop the stop + * @param mustBeOnReference should we require the resulting genome loc to be completely on the reference genome? + * @return true if this would produce a valid genome loc, false otherwise + */ public boolean isValidGenomeLoc(String contig, int start, int stop, boolean mustBeOnReference ) { - return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, mustBeOnReference, false); - } - - public boolean isValidGenomeLoc(String contig, int start, int stop ) { - return validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, true, false); - } - - private boolean vglHelper(boolean exceptOnError, String msg) { - if ( exceptOnError ) - throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect:" + msg); - else + try { + validateGenomeLoc(contig, getContigIndexWithoutException(contig), start, stop, mustBeOnReference); + return true; + } catch ( ReviewedStingException e) { return false; + } + } + + /** + * @see #isValidGenomeLoc(String, int, int) with mustBeOnReference == true + */ + public boolean isValidGenomeLoc(String contig, int start, int stop ) { + return isValidGenomeLoc(contig, start, stop, true); + } + + private void vglHelper(final String msg) { + throw new UserException.MalformedGenomeLoc("Parameters to GenomeLocParser are incorrect:" + msg); } // -------------------------------------------------------------------------------------------------------------- diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java new file mode 100644 index 000000000..478f02530 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserBenchmark.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.File; + +/** + * Caliper microbenchmark of genome loc parser + */ +public class GenomeLocParserBenchmark extends SimpleBenchmark { + private IndexedFastaSequenceFile seq; + private final int ITERATIONS = 1000000; + + @Param({"NEW", "NONE"}) + GenomeLocParser.ValidationLevel validationLevel; // set automatically by framework + + @Param({"true", "false"}) + boolean useContigIndex; // set automatically by framework + + @Override protected void setUp() throws Exception { + seq = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); + } +// +// public void timeSequentialCreationFromGenomeLoc(int rep) { +// final GenomeLocParser genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary(), validationLevel); +// GenomeLoc last = genomeLocParser.createGenomeLoc("1", 1, 1); +// for ( int i = 0; i < rep; i++ ) { +// for ( int j = 1; j < ITERATIONS; j++ ) { +// if ( useContigIndex ) +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getContigIndex(), last.getStart() + 1); +// else +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getStart() + 1); +// } +// } +// } +// +// public void timeSequentialCreationFromGenomeLocOriginal(int rep) { +// final GenomeLocParserOriginal genomeLocParser = new GenomeLocParserOriginal(seq.getSequenceDictionary()); +// GenomeLoc last = genomeLocParser.createGenomeLoc("1", 1, 1); +// for ( int i = 0; i < rep; i++ ) { +// for ( int j = 1; j < ITERATIONS; j++ ) { +// if ( useContigIndex ) +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getContigIndex(), last.getStart() + 1); +// else +// last = genomeLocParser.createGenomeLoc(last.getContig(), last.getStart() + 1); +// } +// } +// } + + public static void main(String[] args) { + com.google.caliper.Runner.main(GenomeLocParserBenchmark.class, args); + } +}