diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java index 90b7420c5..c5e3ea98d 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; @@ -177,7 +178,7 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener logger.info("Treating existing index file as complete."); } - index = new IndexedFastaSequenceFile(fastaFile); + index = new CachingIndexedFastaSequenceFile(fastaFile); } catch (Exception e) { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java index e7d12cd52..5bb65f9a2 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileOutputStream; @@ -76,7 +77,7 @@ public class RMDIndexer extends CommandLineProgram { // --------------------------------------------------------------------------------- // setup the reference - ref = new IndexedFastaSequenceFile(referenceFile); + ref = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(ref); // get a track builder diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 8c7c38a9e..aa1f7a480 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -132,7 +133,7 @@ public class UnifiedGenotyperEngine { filter.add(LOW_QUAL_FILTER_NAME); - referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile); + referenceReader = new CachingIndexedFastaSequenceFile(toolkit.getArguments().referenceFile); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 3b2e50cd2..c10c45373 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator; import org.broadinstitute.sting.utils.text.TextFormattingUtils; @@ -212,7 +213,7 @@ public class IndelRealigner extends ReadWalker { if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); - referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); if ( !TARGET_NOT_SORTED ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index ac64ceac0..981b89355 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -35,6 +35,7 @@ import org.broad.tribble.vcf.VCFWriter; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; @@ -72,7 +73,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { this.innerWriter = innerWriter; this.genomeLocParser = genomeLocParser; - this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile); + this.referenceFileForMNPmerging = new CachingIndexedFastaSequenceFile(referenceFile); this.vcMergeRule = vcMergeRule; this.alleleMergeRule = alleleMergeRule; this.useSingleSample = singleSample; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java index d00dae153..7486eda1b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java @@ -82,7 +82,7 @@ public class ValidateBAQWalker extends ReadWalker { out.printf(" read bases : %s%n", new String(read.getReadBases())); out.printf(" ref length : %d%n", baq.refBases.length); out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); - printQuals(" BQ deltas : ", BAQ.getBAQDeltas(read), true); + printQuals(" BQ deltas : ", getBAQDeltas(read), true); printQuals(" original quals: ", read.getBaseQualities(), true); printQuals(" original quals: ", read.getBaseQualities()); printQuals(" tag quals: ", baqFromTag); @@ -133,6 +133,22 @@ public class ValidateBAQWalker extends ReadWalker { out.println(); } + /** + * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. + * @param read + * @return + */ + public static byte[] getBAQDeltas(SAMRecord read) { + byte[] baq = BAQ.getBAQTag(read); + if ( baq != null ) { + byte[] deltas = new byte[baq.length]; + for ( int i = 0; i < deltas.length; i++) + deltas[i] = (byte)(-1 * (baq[i] - 64)); + return deltas; + } else + return null; + } + public Integer reduceInit() { return 0; } public Integer reduce(Integer value, Integer sum) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java index 40adde697..9a820c4e3 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; import net.sf.picard.reference.ReferenceSequence; @@ -82,7 +83,7 @@ public class TestReadFishingWalker extends ReadWalker { IndexedFastaSequenceFile referenceReader; FileInputStream indelCallInputStream; try { - referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); indelCallInputStream = new FileInputStream(indelCalls); } catch(IOException ex) { diff --git a/java/src/org/broadinstitute/sting/utils/BAQ.java b/java/src/org/broadinstitute/sting/utils/BAQ.java index 7b9a46ba9..fc102f061 100644 --- a/java/src/org/broadinstitute/sting/utils/BAQ.java +++ b/java/src/org/broadinstitute/sting/utils/BAQ.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMRecord; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMSequenceRecord; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -38,6 +39,7 @@ public class BAQ { public enum Mode { NONE, // don't apply a BAQ at all, the default USE_TAG_ONLY, // only use the TAG, if present, in the reads + ADD_TAG_ONLY, // calculate the BAQ, but write it into the reads as the BAQ tag, leaving QUAL field alone CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present } @@ -266,6 +268,22 @@ public class BAQ { return 0; } + // --------------------------------------------------------------------------------------------------------------- + // + // Helper routines + // + // --------------------------------------------------------------------------------------------------------------- + + /** decode the bit encoded state array values */ + private static boolean stateIsIndel(int state) { + return (state & 3) != 0; + } + + /** decode the bit encoded state array values */ + private static int stateAlignedPosition(int state) { + return state >> 2; + } + /** * helper routine for hmm_glocal * @@ -296,6 +314,13 @@ public class BAQ { return out; } + + // --------------------------------------------------------------------------------------------------------------- + // + // Actually working with the BAQ tag now + // + // --------------------------------------------------------------------------------------------------------------- + /** * Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present. * @param read @@ -306,22 +331,21 @@ public class BAQ { return s != null ? s.getBytes() : null; } - /** - * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. - * @param read - * @return - */ - public static byte[] getBAQDeltas(SAMRecord read) { - byte[] baq = getBAQTag(read); - if ( baq != null ) { - byte[] deltas = new byte[baq.length]; - for ( int i = 0; i < deltas.length; i++) - deltas[i] = (byte)(-1 * (baq[i] - 64)); - return deltas; - } else - return null; + public static String encodeBQTag(SAMRecord read, byte[] baq) { + // Offset to base alignment quality (BAQ), of the same length as the read sequence. + // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality. + // so BQi = Qi - BAQi + 64 + byte[] bqTag = new byte[baq.length]; + for ( int i = 0; i < bqTag.length; i++) + bqTag[i] = (byte)(((int)read.getBaseQualities()[i] + 64) - baq[i]); + return new String(bqTag); } + public static void addBAQTag(SAMRecord read, byte[] baq) { + read.setAttribute(BAQ_TAG, encodeBQTag(read, baq)); + } + + /** * Returns true if the read has a BAQ tag, or false otherwise * @param read @@ -346,7 +370,7 @@ public class BAQ { if ( baq != null ) { // Offset to base alignment quality (BAQ), of the same length as the read sequence. - // At the i-th read base, BAQi = Qi ? (BQi ? 64) where Qi is the i-th base quality. + // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality. newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length]; for ( int i = 0; i < rawQuals.length; i++) { int val = rawQuals[i] - (baq[i] - 64); @@ -397,13 +421,18 @@ public class BAQ { public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { // start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar int offset = getBandWidth() / 2; - int start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read); - int stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read); + long start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read); + long stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read); - // now that we have the start and stop, get the reference sequence covering it - ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop); + if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) { + return null; + } else { - return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read))); + // now that we have the start and stop, get the reference sequence covering it + ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop); + + return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read))); + } } // we need to bad ref by at least the bandwidth / 2 on either side @@ -463,15 +492,21 @@ public class BAQ { if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName()); if ( calculationType == Mode.NONE ) { // we don't want to do anything ; // just fall though - } else if ( read.getReadUnmappedFlag() ) { + } else if ( excludeReadFromBAQ(read) ) { ; // just fall through } else if ( calculationType == Mode.USE_TAG_ONLY ) { calcBAQFromTag(read, true, true); } else { - if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) ) { + if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) || calculationType == Mode.ADD_TAG_ONLY ) { if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n"); BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader); - System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length); + if ( hmmResult != null ) { + if ( calculationType == Mode.ADD_TAG_ONLY ) + addBAQTag(read, hmmResult.bq); + else + // modify QUALs directly + System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length); + } } else { if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n"); // this overwrites the original qualities @@ -482,11 +517,15 @@ public class BAQ { return read; } - private static boolean stateIsIndel(int state) { - return (state & 3) != 0; - } - - private static int stateAlignedPosition(int state) { - return state >> 2; + /** + * Returns true if we don't think this read is eligable for the BAQ calculation. Examples include non-PF reads, + * duplicates, or unmapped reads. Used by baqRead to determine if a read should fall through the calculation. + * + * @param read + * @return + */ + public boolean excludeReadFromBAQ(SAMRecord read) { + // keeping mapped reads, regardless of pairing status, or primary alignment status. + return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag(); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java new file mode 100644 index 000000000..1de11264f --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -0,0 +1,165 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.fasta; + +import net.sf.picard.PicardException; +import net.sf.picard.reference.*; +import net.sf.samtools.SAMSequenceRecord; + +import java.io.File; +import java.util.Arrays; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. + * + * Thread-safe! Uses a lock object to protect write and access to the cache. + */ +public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { + private static final boolean USE_CACHE = true; + private static final boolean PRINT_EFFICIENCY = false; + private static final int PRINT_FREQUENCY = 10000; + + private Object lock = new Object(); + + long cacheHits = 0; + long cacheMisses = 0; + + long cacheStart = -1; + long cacheStop = -1; + long cacheSize = 100000; + long cacheMissBackup = 100; + ReferenceSequence cache = null; + + /** + * Same as general constructor but allows one to override the default cacheSize + * @param file + * @param index + * @param cacheSize + */ + public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) { + this(file, index); + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 100, 1); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * @param file The file to open. + * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. + * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. + */ + public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) { + super(file, index); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * @param file The file to open. + */ + public CachingIndexedFastaSequenceFile(final File file) { + super(file); + } + + public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) { + super(file); + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 100, 1); + } + + public void printEfficiency() { + // comment out to disable tracking + if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) { + System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency()); + } + } + + public double calcEfficiency() { + return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0); + } + + public long getCacheHits() { + return cacheHits; + } + + public long getCacheMisses() { + return cacheMisses; + } + + + /** + * Gets the subsequence of the contig in the range [start,stop] + * @param contig Contig whose subsequence to retrieve. + * @param start inclusive, 1-based start of region. + * @param stop inclusive, 1-based stop of region. + * @return The partial reference sequence associated with this range. + */ + public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { + ReferenceSequence result; + + if ( ! USE_CACHE || (stop - start) >= cacheSize ) { + cacheMisses++; + result = super.getSubsequenceAt(contig, start, stop); + } else { + SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); + + if (stop > contigInfo.getSequenceLength()) + throw new PicardException("Query asks for data past end of contig"); + + synchronized (lock) { // access to shared cache information must be protected + if ( start < cacheStart || stop > cacheStop || cache == null || cache.getContigIndex() != contigInfo.getSequenceIndex() ) { + cacheMisses++; + + cacheStart = Math.max(start - cacheMissBackup, 0); + cacheStop = Math.min(cacheStart + cacheSize, contigInfo.getSequenceLength()); + cache = super.getSubsequenceAt(contig, cacheStart, cacheStop); + //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); + } else { + cacheHits++; + } + + // at this point we determine where in the cache we want to extract the requested subsequence + int cacheOffsetStart = (int)(start - cacheStart); + int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); + + try { + result = new ReferenceSequence(cache.getName(), cache.getContigIndex(), Arrays.copyOfRange(cache.getBases(), cacheOffsetStart, cacheOffsetStop)); + } catch ( ArrayIndexOutOfBoundsException e ) { + throw new ReviewedStingException(String.format("BUG: bad array indexing. Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", + cacheStart, cacheStop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.getBases().length), e); + } + } + } + +// // comment out to disable testing +// ReferenceSequence verify = super.getSubsequenceAt(contig, start, stop); +// if ( ! Arrays.equals(verify.getBases(), result.getBases()) ) +// throw new ReviewedStingException(String.format("BUG: cached reference sequence not the same as clean fetched version at %s %d %d", contig, start, stop)); + + if ( PRINT_EFFICIENCY ) printEfficiency(); + return result; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index a7dbf6e7c..e3e955a7c 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -53,7 +54,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { @BeforeClass public void init() throws FileNotFoundException { // sequence - seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java index c8bcf559b..2f6165ac1 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java @@ -5,6 +5,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -43,7 +44,7 @@ public abstract class ReferenceViewTemplate extends BaseTest { */ @BeforeClass public void initialize() throws FileNotFoundException { - sequenceFile = new IndexedFastaSequenceFile( new File(hg18Reference) ); + sequenceFile = new CachingIndexedFastaSequenceFile( new File(hg18Reference) ); genomeLocParser = new GenomeLocParser(sequenceFile); } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java index 85f82c839..6caf64bd8 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeMethod; @@ -49,7 +50,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { @BeforeClass public void init() throws FileNotFoundException { - seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); testSite1 = genomeLocParser.createGenomeLoc("chrM",10); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java index 373cb6634..d1d7658d4 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; @@ -64,7 +65,7 @@ public class SAMBAMDataSourceUnitTest extends BaseTest { readers = new ArrayList(); // sequence - seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java index da1763b76..ce9b36fef 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.testng.annotations.BeforeMethod; @@ -58,7 +59,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @BeforeMethod public void setup() { - seq = new IndexedFastaSequenceFile(new File(b36KGReference)); + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); genomeLocParser = new GenomeLocParser(seq); builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); } diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 9d642519b..a56fc146a 100755 --- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import static org.testng.Assert.fail; @@ -73,7 +74,7 @@ public class TraverseReadsUnitTest extends BaseTest { @BeforeClass public void doOnce() { - ref = new IndexedFastaSequenceFile(refFile); + ref = new CachingIndexedFastaSequenceFile(refFile); genomeLocParser = new GenomeLocParser(ref); engine = new GenomeAnalysisEngine(); diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 81d6446bf..60062583f 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -9,6 +9,7 @@ import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; @@ -26,7 +27,7 @@ public class GenomeLocUnitTest extends BaseTest { @BeforeClass public void init() throws FileNotFoundException { // sequence - seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); } diff --git a/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java b/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java index 8a470303d..3167e41c7 100644 --- a/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java @@ -4,6 +4,7 @@ import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -22,7 +23,7 @@ public class BedParserUnitTest extends BaseTest { @BeforeClass public void beforeTests() { - seq = new IndexedFastaSequenceFile(new File(b36KGReference)); + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); genomeLocParser = new GenomeLocParser(seq); } diff --git a/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java new file mode 100644 index 000000000..780b5130a --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -0,0 +1,94 @@ +// our package +package org.broadinstitute.sting.utils.fasta; + + +// the imports for unit testing. + + +import org.testng.Assert; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.broadinstitute.sting.BaseTest; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.ReferenceSequence; +import net.sf.samtools.SAMSequenceRecord; + +/** + * Basic unit test for GenomeLoc + */ +public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { + private File simpleFasta = new File(testDir + "/exampleFASTA.fasta"); + private static final int STEP_SIZE = 1; + + //private static final List QUERY_SIZES = Arrays.asList(1); + private static final List QUERY_SIZES = Arrays.asList(1, 10, 100, 1000); + private static final List CACHE_SIZES = Arrays.asList(10, 1000); + + @DataProvider(name = "fastas") + public Object[][] createData1() { + List params = new ArrayList(); + for ( File fasta : Arrays.asList(simpleFasta) ) { + for ( int cacheSize : CACHE_SIZES ) { + for ( int querySize : QUERY_SIZES ) { + params.add(new Object[]{fasta, cacheSize, querySize}); + } + } + } + + return params.toArray(new Object[][]{}); + } + + @Test(dataProvider = "fastas", enabled = true) + public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) { + IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize); + + SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); + //logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", + // contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize)); + for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) { + int start = i; + int stop = start + querySize; + if ( stop <= contig.getSequenceLength() ) { + ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop); + ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop); + + Assert.assertEquals(cachedVal.getName(), uncachedVal.getName()); + Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex()); + Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases()); + } + } + } + + // Tests grabbing sequences around a middle cached value. + @Test(dataProvider = "fastas", enabled = true) + public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) { + IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize); + + SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); + + int middleStart = (contig.getSequenceLength() - querySize) / 2; + int middleStop = middleStart + querySize; + + for ( int i = 0; i < contig.getSequenceLength(); i += 10 ) { + int start = i; + int stop = start + querySize; + if ( stop <= contig.getSequenceLength() ) { + ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart, middleStop); + ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop); + ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop); + + Assert.assertEquals(cachedVal.getName(), uncachedVal.getName()); + Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex()); + Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases()); + } + } + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java index c2b622738..36c0336d8 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java @@ -4,6 +4,7 @@ import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import org.testng.annotations.BeforeMethod; @@ -68,7 +69,7 @@ public class GLFWriterUnitTest extends BaseTest { @BeforeClass public void beforeTests() { - seq = new IndexedFastaSequenceFile(new File(b36KGReference)); + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); genomeLocParser = new GenomeLocParser(seq); } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 4f3a43e4d..61e31825c 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.Test; import org.testng.annotations.BeforeClass; @@ -39,7 +40,7 @@ public class VCFWriterUnitTest extends BaseTest { @BeforeClass public void beforeTests() { - IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); } diff --git a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index da1913e9a..dc170a4d7 100644 --- a/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -7,6 +7,7 @@ import org.testng.Assert; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -27,7 +28,7 @@ public class IntervalUtilsUnitTest extends BaseTest { @BeforeClass public void init() { - ReferenceSequenceFile seq = new IndexedFastaSequenceFile(reference); + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference); genomeLocParser = new GenomeLocParser(seq); }