From 3bbc6a05406496734c67595ae5f85afde0e059ff Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 17 Dec 2010 21:05:17 +0000 Subject: [PATCH] Slightly more thread safe CachingIndexedFastaSequenceFile.java. Likely passes parallel testing git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4869 348d0f76-0448-11de-a6fe-93d51630548a --- .../CachingIndexedFastaSequenceFile.java | 56 +++++++++++-------- ...chingIndexedFastaSequenceFileUnitTest.java | 7 ++- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 15c4f5d4f..ebafc4cb6 100644 --- a/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -39,32 +39,37 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; * Thread-safe! Uses a lock object to protect write and access to the cache. */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { + /** global enable flag */ private static final boolean USE_CACHE = true; + + /** do we want to print debugging information about cache efficiency? */ private static final boolean PRINT_EFFICIENCY = false; + + /** If we are printing efficiency info, what frequency should we do it at? */ private static final int PRINT_FREQUENCY = 10000; + + /** The default cache size in bp */ private static final long DEFAULT_CACHE_SIZE = 1000000; - private static long ourStaticCacheSize = DEFAULT_CACHE_SIZE; - + // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; - private static class Cache { - long cacheStart = -1; - long cacheStop = -1; - long cacheSize = DEFAULT_CACHE_SIZE; - long cacheMissBackup = 100; - ReferenceSequence cachedSequence = null; + /** The cache size of this CachingIndexedFastaSequenceFile */ + long cacheSize = DEFAULT_CACHE_SIZE; - public Cache(long cacheSize) { - this.cacheSize = cacheSize; - this.cacheMissBackup = Math.max(cacheSize / 100, 1); - } + /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ + long cacheMissBackup = 100; + + /** Represents a specific cached sequence, with a specific start and stop, as well as the bases */ + private static class Cache { + long start = -1, stop = -1; + ReferenceSequence seq = null; } private static ThreadLocal cache = new ThreadLocal () { @Override protected Cache initialValue() { - return new Cache(ourStaticCacheSize); + return new Cache(); } }; @@ -76,7 +81,12 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { */ public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) { super(file, index); - ourStaticCacheSize = cacheSize; // BUG: class only supports one univeral cache size + setCacheSize(cacheSize); + } + + private void setCacheSize(long cacheSize) { + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 1000, 1); } /** @@ -99,7 +109,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) { super(file); - ourStaticCacheSize = cacheSize; // BUG: class only supports one univeral cache size + setCacheSize(cacheSize); } public void printEfficiency() { @@ -134,7 +144,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { Cache myCache = cache.get(); //System.out.printf("getSubsequentAt cache=%s%n", myCache); - if ( ! USE_CACHE || (stop - start) >= myCache.cacheSize ) { + if ( ! USE_CACHE || (stop - start) >= cacheSize ) { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); } else { @@ -144,25 +154,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { if (stop > contigInfo.getSequenceLength()) throw new PicardException("Query asks for data past end of contig"); - if ( start < myCache.cacheStart || stop > myCache.cacheStop || myCache.cachedSequence == null || myCache.cachedSequence.getContigIndex() != contigInfo.getSequenceIndex() ) { + if ( start < myCache.start || stop > myCache.stop || myCache.seq == null || myCache.seq.getContigIndex() != contigInfo.getSequenceIndex() ) { cacheMisses++; - myCache.cacheStart = Math.max(start - myCache.cacheMissBackup, 0); - myCache.cacheStop = Math.min(myCache.cacheStart + myCache.cacheSize, contigInfo.getSequenceLength()); - myCache.cachedSequence = super.getSubsequenceAt(contig, myCache.cacheStart, myCache.cacheStop); + myCache.start = Math.max(start - cacheMissBackup, 0); + myCache.stop = Math.min(myCache.start + cacheSize, contigInfo.getSequenceLength()); + myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); } else { cacheHits++; } // at this point we determine where in the cache we want to extract the requested subsequence - int cacheOffsetStart = (int)(start - myCache.cacheStart); + int cacheOffsetStart = (int)(start - myCache.start); int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); try { - result = new ReferenceSequence(myCache.cachedSequence.getName(), myCache.cachedSequence.getContigIndex(), Arrays.copyOfRange(myCache.cachedSequence.getBases(), cacheOffsetStart, cacheOffsetStop)); + result = new ReferenceSequence(myCache.seq.getName(), myCache.seq.getContigIndex(), Arrays.copyOfRange(myCache.seq.getBases(), cacheOffsetStart, cacheOffsetStop)); } catch ( ArrayIndexOutOfBoundsException e ) { throw new ReviewedStingException(String.format("BUG: bad array indexing. Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", - myCache.cacheStart, myCache.cacheStop, start, stop, cacheOffsetStart, cacheOffsetStop, myCache.cachedSequence.getBases().length), e); + myCache.start, myCache.stop, start, stop, cacheOffsetStart, cacheOffsetStop, myCache.seq.getBases().length), e); } } diff --git a/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index b5514c30c..aa720e521 100644 --- a/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -50,8 +50,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); - //logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", - // contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize)); + logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", + contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize)); for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) { int start = i; int stop = start + querySize; @@ -77,6 +77,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { int middleStart = (contig.getSequenceLength() - querySize) / 2; int middleStop = middleStart + querySize; + logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d with intermediate query", + contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize)); + for ( int i = 0; i < contig.getSequenceLength(); i += 10 ) { int start = i; int stop = start + querySize;