Improved BAQ implementation. Now supports adding BAQ tags to reads on the fly with ADD_TAG_ONLY option. Caching fasta reader implementation, and changes throughout the system to enable this. Many performance improvements throughout the system due to better reference access patterns.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4792 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8901e63879
commit
44feb4a362
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
||||
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
|
|
@ -177,7 +178,7 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener
|
|||
logger.info("Treating existing index file as complete.");
|
||||
}
|
||||
|
||||
index = new IndexedFastaSequenceFile(fastaFile);
|
||||
index = new CachingIndexedFastaSequenceFile(fastaFile);
|
||||
|
||||
}
|
||||
catch (Exception e) {
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
|||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
|
|
@ -76,7 +77,7 @@ public class RMDIndexer extends CommandLineProgram {
|
|||
// ---------------------------------------------------------------------------------
|
||||
|
||||
// setup the reference
|
||||
ref = new IndexedFastaSequenceFile(referenceFile);
|
||||
ref = new CachingIndexedFastaSequenceFile(referenceFile);
|
||||
genomeLocParser = new GenomeLocParser(ref);
|
||||
|
||||
// get a track builder
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -132,7 +133,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
|
||||
referenceReader = new CachingIndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
|||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
|
||||
import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
|
@ -212,7 +213,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 )
|
||||
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
|
||||
|
||||
referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
|
||||
if ( !TARGET_NOT_SORTED ) {
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import org.broad.tribble.vcf.VCFWriter;
|
|||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -72,7 +73,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
|
|||
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
|
||||
this.innerWriter = innerWriter;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
|
||||
this.referenceFileForMNPmerging = new CachingIndexedFastaSequenceFile(referenceFile);
|
||||
this.vcMergeRule = vcMergeRule;
|
||||
this.alleleMergeRule = alleleMergeRule;
|
||||
this.useSingleSample = singleSample;
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
|||
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||
out.printf(" ref length : %d%n", baq.refBases.length);
|
||||
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
|
||||
printQuals(" BQ deltas : ", BAQ.getBAQDeltas(read), true);
|
||||
printQuals(" BQ deltas : ", getBAQDeltas(read), true);
|
||||
printQuals(" original quals: ", read.getBaseQualities(), true);
|
||||
printQuals(" original quals: ", read.getBaseQualities());
|
||||
printQuals(" tag quals: ", baqFromTag);
|
||||
|
|
@ -133,6 +133,22 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
|||
out.println();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
|
||||
* @param read
|
||||
* @return
|
||||
*/
|
||||
public static byte[] getBAQDeltas(SAMRecord read) {
|
||||
byte[] baq = BAQ.getBAQTag(read);
|
||||
if ( baq != null ) {
|
||||
byte[] deltas = new byte[baq.length];
|
||||
for ( int i = 0; i < deltas.length; i++)
|
||||
deltas[i] = (byte)(-1 * (baq[i] - 64));
|
||||
return deltas;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
|
|
@ -82,7 +83,7 @@ public class TestReadFishingWalker extends ReadWalker<Integer,Long> {
|
|||
IndexedFastaSequenceFile referenceReader;
|
||||
FileInputStream indelCallInputStream;
|
||||
try {
|
||||
referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
indelCallInputStream = new FileInputStream(indelCalls);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -38,6 +39,7 @@ public class BAQ {
|
|||
public enum Mode {
|
||||
NONE, // don't apply a BAQ at all, the default
|
||||
USE_TAG_ONLY, // only use the TAG, if present, in the reads
|
||||
ADD_TAG_ONLY, // calculate the BAQ, but write it into the reads as the BAQ tag, leaving QUAL field alone
|
||||
CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag
|
||||
RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present
|
||||
}
|
||||
|
|
@ -266,6 +268,22 @@ public class BAQ {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Helper routines
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/** decode the bit encoded state array values */
|
||||
private static boolean stateIsIndel(int state) {
|
||||
return (state & 3) != 0;
|
||||
}
|
||||
|
||||
/** decode the bit encoded state array values */
|
||||
private static int stateAlignedPosition(int state) {
|
||||
return state >> 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* helper routine for hmm_glocal
|
||||
*
|
||||
|
|
@ -296,6 +314,13 @@ public class BAQ {
|
|||
return out;
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Actually working with the BAQ tag now
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present.
|
||||
* @param read
|
||||
|
|
@ -306,22 +331,21 @@ public class BAQ {
|
|||
return s != null ? s.getBytes() : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
|
||||
* @param read
|
||||
* @return
|
||||
*/
|
||||
public static byte[] getBAQDeltas(SAMRecord read) {
|
||||
byte[] baq = getBAQTag(read);
|
||||
if ( baq != null ) {
|
||||
byte[] deltas = new byte[baq.length];
|
||||
for ( int i = 0; i < deltas.length; i++)
|
||||
deltas[i] = (byte)(-1 * (baq[i] - 64));
|
||||
return deltas;
|
||||
} else
|
||||
return null;
|
||||
public static String encodeBQTag(SAMRecord read, byte[] baq) {
|
||||
// Offset to base alignment quality (BAQ), of the same length as the read sequence.
|
||||
// At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
|
||||
// so BQi = Qi - BAQi + 64
|
||||
byte[] bqTag = new byte[baq.length];
|
||||
for ( int i = 0; i < bqTag.length; i++)
|
||||
bqTag[i] = (byte)(((int)read.getBaseQualities()[i] + 64) - baq[i]);
|
||||
return new String(bqTag);
|
||||
}
|
||||
|
||||
public static void addBAQTag(SAMRecord read, byte[] baq) {
|
||||
read.setAttribute(BAQ_TAG, encodeBQTag(read, baq));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if the read has a BAQ tag, or false otherwise
|
||||
* @param read
|
||||
|
|
@ -346,7 +370,7 @@ public class BAQ {
|
|||
|
||||
if ( baq != null ) {
|
||||
// Offset to base alignment quality (BAQ), of the same length as the read sequence.
|
||||
// At the i-th read base, BAQi = Qi ? (BQi ? 64) where Qi is the i-th base quality.
|
||||
// At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
|
||||
newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
|
||||
for ( int i = 0; i < rawQuals.length; i++) {
|
||||
int val = rawQuals[i] - (baq[i] - 64);
|
||||
|
|
@ -397,13 +421,18 @@ public class BAQ {
|
|||
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
|
||||
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
|
||||
int offset = getBandWidth() / 2;
|
||||
int start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read);
|
||||
int stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read);
|
||||
long start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read);
|
||||
long stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read);
|
||||
|
||||
// now that we have the start and stop, get the reference sequence covering it
|
||||
ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
|
||||
if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) {
|
||||
return null;
|
||||
} else {
|
||||
|
||||
return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read)));
|
||||
// now that we have the start and stop, get the reference sequence covering it
|
||||
ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
|
||||
|
||||
return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read)));
|
||||
}
|
||||
}
|
||||
|
||||
// we need to bad ref by at least the bandwidth / 2 on either side
|
||||
|
|
@ -463,15 +492,21 @@ public class BAQ {
|
|||
if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());
|
||||
if ( calculationType == Mode.NONE ) { // we don't want to do anything
|
||||
; // just fall though
|
||||
} else if ( read.getReadUnmappedFlag() ) {
|
||||
} else if ( excludeReadFromBAQ(read) ) {
|
||||
; // just fall through
|
||||
} else if ( calculationType == Mode.USE_TAG_ONLY ) {
|
||||
calcBAQFromTag(read, true, true);
|
||||
} else {
|
||||
if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) ) {
|
||||
if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) || calculationType == Mode.ADD_TAG_ONLY ) {
|
||||
if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n");
|
||||
BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
|
||||
System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
|
||||
if ( hmmResult != null ) {
|
||||
if ( calculationType == Mode.ADD_TAG_ONLY )
|
||||
addBAQTag(read, hmmResult.bq);
|
||||
else
|
||||
// modify QUALs directly
|
||||
System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
|
||||
}
|
||||
} else {
|
||||
if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n");
|
||||
// this overwrites the original qualities
|
||||
|
|
@ -482,11 +517,15 @@ public class BAQ {
|
|||
return read;
|
||||
}
|
||||
|
||||
private static boolean stateIsIndel(int state) {
|
||||
return (state & 3) != 0;
|
||||
}
|
||||
|
||||
private static int stateAlignedPosition(int state) {
|
||||
return state >> 2;
|
||||
/**
|
||||
* Returns true if we don't think this read is eligable for the BAQ calculation. Examples include non-PF reads,
|
||||
* duplicates, or unmapped reads. Used by baqRead to determine if a read should fall through the calculation.
|
||||
*
|
||||
* @param read
|
||||
* @return
|
||||
*/
|
||||
public boolean excludeReadFromBAQ(SAMRecord read) {
|
||||
// keeping mapped reads, regardless of pairing status, or primary alignment status.
|
||||
return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
/*
|
||||
* The MIT License
|
||||
*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.picard.reference.*;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer.
|
||||
*
|
||||
* Thread-safe! Uses a lock object to protect write and access to the cache.
|
||||
*/
|
||||
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
||||
private static final boolean USE_CACHE = true;
|
||||
private static final boolean PRINT_EFFICIENCY = false;
|
||||
private static final int PRINT_FREQUENCY = 10000;
|
||||
|
||||
private Object lock = new Object();
|
||||
|
||||
long cacheHits = 0;
|
||||
long cacheMisses = 0;
|
||||
|
||||
long cacheStart = -1;
|
||||
long cacheStop = -1;
|
||||
long cacheSize = 100000;
|
||||
long cacheMissBackup = 100;
|
||||
ReferenceSequence cache = null;
|
||||
|
||||
/**
|
||||
* Same as general constructor but allows one to override the default cacheSize
|
||||
* @param file
|
||||
* @param index
|
||||
* @param cacheSize
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) {
|
||||
this(file, index);
|
||||
this.cacheSize = cacheSize;
|
||||
this.cacheMissBackup = Math.max(cacheSize / 100, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
* @param file The file to open.
|
||||
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
|
||||
* @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) {
|
||||
super(file, index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
* @param file The file to open.
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File file) {
|
||||
super(file);
|
||||
}
|
||||
|
||||
public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) {
|
||||
super(file);
|
||||
this.cacheSize = cacheSize;
|
||||
this.cacheMissBackup = Math.max(cacheSize / 100, 1);
|
||||
}
|
||||
|
||||
public void printEfficiency() {
|
||||
// comment out to disable tracking
|
||||
if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) {
|
||||
System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency());
|
||||
}
|
||||
}
|
||||
|
||||
public double calcEfficiency() {
|
||||
return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0);
|
||||
}
|
||||
|
||||
public long getCacheHits() {
|
||||
return cacheHits;
|
||||
}
|
||||
|
||||
public long getCacheMisses() {
|
||||
return cacheMisses;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the subsequence of the contig in the range [start,stop]
|
||||
* @param contig Contig whose subsequence to retrieve.
|
||||
* @param start inclusive, 1-based start of region.
|
||||
* @param stop inclusive, 1-based stop of region.
|
||||
* @return The partial reference sequence associated with this range.
|
||||
*/
|
||||
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
|
||||
ReferenceSequence result;
|
||||
|
||||
if ( ! USE_CACHE || (stop - start) >= cacheSize ) {
|
||||
cacheMisses++;
|
||||
result = super.getSubsequenceAt(contig, start, stop);
|
||||
} else {
|
||||
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
|
||||
|
||||
if (stop > contigInfo.getSequenceLength())
|
||||
throw new PicardException("Query asks for data past end of contig");
|
||||
|
||||
synchronized (lock) { // access to shared cache information must be protected
|
||||
if ( start < cacheStart || stop > cacheStop || cache == null || cache.getContigIndex() != contigInfo.getSequenceIndex() ) {
|
||||
cacheMisses++;
|
||||
|
||||
cacheStart = Math.max(start - cacheMissBackup, 0);
|
||||
cacheStop = Math.min(cacheStart + cacheSize, contigInfo.getSequenceLength());
|
||||
cache = super.getSubsequenceAt(contig, cacheStart, cacheStop);
|
||||
//System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop);
|
||||
} else {
|
||||
cacheHits++;
|
||||
}
|
||||
|
||||
// at this point we determine where in the cache we want to extract the requested subsequence
|
||||
int cacheOffsetStart = (int)(start - cacheStart);
|
||||
int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1);
|
||||
|
||||
try {
|
||||
result = new ReferenceSequence(cache.getName(), cache.getContigIndex(), Arrays.copyOfRange(cache.getBases(), cacheOffsetStart, cacheOffsetStop));
|
||||
} catch ( ArrayIndexOutOfBoundsException e ) {
|
||||
throw new ReviewedStingException(String.format("BUG: bad array indexing. Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d",
|
||||
cacheStart, cacheStop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.getBases().length), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// // comment out to disable testing
|
||||
// ReferenceSequence verify = super.getSubsequenceAt(contig, start, stop);
|
||||
// if ( ! Arrays.equals(verify.getBases(), result.getBases()) )
|
||||
// throw new ReviewedStingException(String.format("BUG: cached reference sequence not the same as clean fetched version at %s %d %d", contig, start, stop));
|
||||
|
||||
if ( PRINT_EFFICIENCY ) printEfficiency();
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
|
|||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -53,7 +54,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
|
|||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -43,7 +44,7 @@ public abstract class ReferenceViewTemplate extends BaseTest {
|
|||
*/
|
||||
@BeforeClass
|
||||
public void initialize() throws FileNotFoundException {
|
||||
sequenceFile = new IndexedFastaSequenceFile( new File(hg18Reference) );
|
||||
sequenceFile = new CachingIndexedFastaSequenceFile( new File(hg18Reference) );
|
||||
genomeLocParser = new GenomeLocParser(sequenceFile);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import static org.testng.Assert.assertTrue;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -49,7 +50,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
|
|||
|
||||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
|
||||
testSite1 = genomeLocParser.createGenomeLoc("chrM",10);
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
|||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.AfterMethod;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -64,7 +65,7 @@ public class SAMBAMDataSourceUnitTest extends BaseTest {
|
|||
readers = new ArrayList<SAMReaderID>();
|
||||
|
||||
// sequence
|
||||
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.testng.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.file.FSLockWithShared;
|
||||
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -58,7 +59,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
|
|||
|
||||
@BeforeMethod
|
||||
public void setup() {
|
||||
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
|||
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import static org.testng.Assert.fail;
|
||||
|
||||
|
|
@ -73,7 +74,7 @@ public class TraverseReadsUnitTest extends BaseTest {
|
|||
|
||||
@BeforeClass
|
||||
public void doOnce() {
|
||||
ref = new IndexedFastaSequenceFile(refFile);
|
||||
ref = new CachingIndexedFastaSequenceFile(refFile);
|
||||
genomeLocParser = new GenomeLocParser(ref);
|
||||
|
||||
engine = new GenomeAnalysisEngine();
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.testng.Assert;
|
|||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -26,7 +27,7 @@ public class GenomeLocUnitTest extends BaseTest {
|
|||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import org.testng.Assert;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -22,7 +23,7 @@ public class BedParserUnitTest extends BaseTest {
|
|||
|
||||
@BeforeClass
|
||||
public void beforeTests() {
|
||||
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,94 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
/**
|
||||
* Basic unit test for GenomeLoc
|
||||
*/
|
||||
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
||||
private File simpleFasta = new File(testDir + "/exampleFASTA.fasta");
|
||||
private static final int STEP_SIZE = 1;
|
||||
|
||||
//private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
|
||||
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100, 1000);
|
||||
private static final List<Integer> CACHE_SIZES = Arrays.asList(10, 1000);
|
||||
|
||||
@DataProvider(name = "fastas")
|
||||
public Object[][] createData1() {
|
||||
List<Object[]> params = new ArrayList<Object[]>();
|
||||
for ( File fasta : Arrays.asList(simpleFasta) ) {
|
||||
for ( int cacheSize : CACHE_SIZES ) {
|
||||
for ( int querySize : QUERY_SIZES ) {
|
||||
params.add(new Object[]{fasta, cacheSize, querySize});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return params.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "fastas", enabled = true)
|
||||
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) {
|
||||
IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
|
||||
IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize);
|
||||
|
||||
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
|
||||
//logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
|
||||
// contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));
|
||||
for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) {
|
||||
int start = i;
|
||||
int stop = start + querySize;
|
||||
if ( stop <= contig.getSequenceLength() ) {
|
||||
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
|
||||
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
|
||||
|
||||
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
|
||||
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
|
||||
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tests grabbing sequences around a middle cached value.
|
||||
@Test(dataProvider = "fastas", enabled = true)
|
||||
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) {
|
||||
IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
|
||||
IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize);
|
||||
|
||||
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
|
||||
|
||||
int middleStart = (contig.getSequenceLength() - querySize) / 2;
|
||||
int middleStop = middleStart + querySize;
|
||||
|
||||
for ( int i = 0; i < contig.getSequenceLength(); i += 10 ) {
|
||||
int start = i;
|
||||
int stop = start + querySize;
|
||||
if ( stop <= contig.getSequenceLength() ) {
|
||||
ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart, middleStop);
|
||||
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
|
||||
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
|
||||
|
||||
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
|
||||
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
|
||||
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -4,6 +4,7 @@ import org.testng.Assert;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
|
|
@ -68,7 +69,7 @@ public class GLFWriterUnitTest extends BaseTest {
|
|||
|
||||
@BeforeClass
|
||||
public void beforeTests() {
|
||||
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
@ -39,7 +40,7 @@ public class VCFWriterUnitTest extends BaseTest {
|
|||
|
||||
@BeforeClass
|
||||
public void beforeTests() {
|
||||
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference));
|
||||
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import org.testng.Assert;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -27,7 +28,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
ReferenceSequenceFile seq = new IndexedFastaSequenceFile(reference);
|
||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference);
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue