Improved BAQ implementation. Now supports adding BAQ tags to reads on the fly with ADD_TAG_ONLY option. Caching fasta reader implementation, and changes throughout the system to enable this. Many performance improvements throughout the system due to better reference access patterns.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4792 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-12-05 18:29:39 +00:00
parent 8901e63879
commit 44feb4a362
21 changed files with 378 additions and 47 deletions

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File; import java.io.File;
@ -177,7 +178,7 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener
logger.info("Treating existing index file as complete."); logger.info("Treating existing index file as complete.");
} }
index = new IndexedFastaSequenceFile(fastaFile); index = new CachingIndexedFastaSequenceFile(fastaFile);
} }
catch (Exception e) { catch (Exception e) {

View File

@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
@ -76,7 +77,7 @@ public class RMDIndexer extends CommandLineProgram {
// --------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------
// setup the reference // setup the reference
ref = new IndexedFastaSequenceFile(referenceFile); ref = new CachingIndexedFastaSequenceFile(referenceFile);
genomeLocParser = new GenomeLocParser(ref); genomeLocParser = new GenomeLocParser(ref);
// get a track builder // get a track builder

View File

@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter; import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -132,7 +133,7 @@ public class UnifiedGenotyperEngine {
filter.add(LOW_QUAL_FILTER_NAME); filter.add(LOW_QUAL_FILTER_NAME);
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile); referenceReader = new CachingIndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
} }
/** /**

View File

@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator; import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator;
import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.text.TextFormattingUtils;
@ -212,7 +213,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 )
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
if ( !TARGET_NOT_SORTED ) { if ( !TARGET_NOT_SORTED ) {

View File

@ -35,6 +35,7 @@ import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File; import java.io.File;
@ -72,7 +73,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) { public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
this.innerWriter = innerWriter; this.innerWriter = innerWriter;
this.genomeLocParser = genomeLocParser; this.genomeLocParser = genomeLocParser;
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile); this.referenceFileForMNPmerging = new CachingIndexedFastaSequenceFile(referenceFile);
this.vcMergeRule = vcMergeRule; this.vcMergeRule = vcMergeRule;
this.alleleMergeRule = alleleMergeRule; this.alleleMergeRule = alleleMergeRule;
this.useSingleSample = singleSample; this.useSingleSample = singleSample;

View File

@ -82,7 +82,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
out.printf(" read bases : %s%n", new String(read.getReadBases())); out.printf(" read bases : %s%n", new String(read.getReadBases()));
out.printf(" ref length : %d%n", baq.refBases.length); out.printf(" ref length : %d%n", baq.refBases.length);
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
printQuals(" BQ deltas : ", BAQ.getBAQDeltas(read), true); printQuals(" BQ deltas : ", getBAQDeltas(read), true);
printQuals(" original quals: ", read.getBaseQualities(), true); printQuals(" original quals: ", read.getBaseQualities(), true);
printQuals(" original quals: ", read.getBaseQualities()); printQuals(" original quals: ", read.getBaseQualities());
printQuals(" tag quals: ", baqFromTag); printQuals(" tag quals: ", baqFromTag);
@ -133,6 +133,22 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
out.println(); out.println();
} }
/**
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
* @param read
* @return
*/
public static byte[] getBAQDeltas(SAMRecord read) {
byte[] baq = BAQ.getBAQTag(read);
if ( baq != null ) {
byte[] deltas = new byte[baq.length];
for ( int i = 0; i < deltas.length; i++)
deltas[i] = (byte)(-1 * (baq[i] - 64));
return deltas;
} else
return null;
}
public Integer reduceInit() { return 0; } public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) { public Integer reduce(Integer value, Integer sum) {

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil; import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.reference.ReferenceSequence;
@ -82,7 +83,7 @@ public class TestReadFishingWalker extends ReadWalker<Integer,Long> {
IndexedFastaSequenceFile referenceReader; IndexedFastaSequenceFile referenceReader;
FileInputStream indelCallInputStream; FileInputStream indelCallInputStream;
try { try {
referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
indelCallInputStream = new FileInputStream(indelCalls); indelCallInputStream = new FileInputStream(indelCalls);
} }
catch(IOException ex) { catch(IOException ex) {

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement; import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator; import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -38,6 +39,7 @@ public class BAQ {
public enum Mode { public enum Mode {
NONE, // don't apply a BAQ at all, the default NONE, // don't apply a BAQ at all, the default
USE_TAG_ONLY, // only use the TAG, if present, in the reads USE_TAG_ONLY, // only use the TAG, if present, in the reads
ADD_TAG_ONLY, // calculate the BAQ, but write it into the reads as the BAQ tag, leaving QUAL field alone
CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag
RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present
} }
@ -266,6 +268,22 @@ public class BAQ {
return 0; return 0;
} }
// ---------------------------------------------------------------------------------------------------------------
//
// Helper routines
//
// ---------------------------------------------------------------------------------------------------------------
/** decode the bit encoded state array values */
private static boolean stateIsIndel(int state) {
return (state & 3) != 0;
}
/** decode the bit encoded state array values */
private static int stateAlignedPosition(int state) {
return state >> 2;
}
/** /**
* helper routine for hmm_glocal * helper routine for hmm_glocal
* *
@ -296,6 +314,13 @@ public class BAQ {
return out; return out;
} }
// ---------------------------------------------------------------------------------------------------------------
//
// Actually working with the BAQ tag now
//
// ---------------------------------------------------------------------------------------------------------------
/** /**
* Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present. * Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present.
* @param read * @param read
@ -306,22 +331,21 @@ public class BAQ {
return s != null ? s.getBytes() : null; return s != null ? s.getBytes() : null;
} }
/** public static String encodeBQTag(SAMRecord read, byte[] baq) {
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. // Offset to base alignment quality (BAQ), of the same length as the read sequence.
* @param read // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
* @return // so BQi = Qi - BAQi + 64
*/ byte[] bqTag = new byte[baq.length];
public static byte[] getBAQDeltas(SAMRecord read) { for ( int i = 0; i < bqTag.length; i++)
byte[] baq = getBAQTag(read); bqTag[i] = (byte)(((int)read.getBaseQualities()[i] + 64) - baq[i]);
if ( baq != null ) { return new String(bqTag);
byte[] deltas = new byte[baq.length];
for ( int i = 0; i < deltas.length; i++)
deltas[i] = (byte)(-1 * (baq[i] - 64));
return deltas;
} else
return null;
} }
public static void addBAQTag(SAMRecord read, byte[] baq) {
read.setAttribute(BAQ_TAG, encodeBQTag(read, baq));
}
/** /**
* Returns true if the read has a BAQ tag, or false otherwise * Returns true if the read has a BAQ tag, or false otherwise
* @param read * @param read
@ -346,7 +370,7 @@ public class BAQ {
if ( baq != null ) { if ( baq != null ) {
// Offset to base alignment quality (BAQ), of the same length as the read sequence. // Offset to base alignment quality (BAQ), of the same length as the read sequence.
// At the i-th read base, BAQi = Qi ? (BQi ? 64) where Qi is the i-th base quality. // At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length]; newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
for ( int i = 0; i < rawQuals.length; i++) { for ( int i = 0; i < rawQuals.length; i++) {
int val = rawQuals[i] - (baq[i] - 64); int val = rawQuals[i] - (baq[i] - 64);
@ -397,14 +421,19 @@ public class BAQ {
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) { public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar // start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
int offset = getBandWidth() / 2; int offset = getBandWidth() / 2;
int start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read); long start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read);
int stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read); long stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read);
if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) {
return null;
} else {
// now that we have the start and stop, get the reference sequence covering it // now that we have the start and stop, get the reference sequence covering it
ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop); ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read))); return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read)));
} }
}
// we need to bad ref by at least the bandwidth / 2 on either side // we need to bad ref by at least the bandwidth / 2 on either side
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) { public BAQCalculationResult calcBAQFromHMM(SAMRecord read, byte[] ref, int refOffset) {
@ -463,15 +492,21 @@ public class BAQ {
if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName()); if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());
if ( calculationType == Mode.NONE ) { // we don't want to do anything if ( calculationType == Mode.NONE ) { // we don't want to do anything
; // just fall though ; // just fall though
} else if ( read.getReadUnmappedFlag() ) { } else if ( excludeReadFromBAQ(read) ) {
; // just fall through ; // just fall through
} else if ( calculationType == Mode.USE_TAG_ONLY ) { } else if ( calculationType == Mode.USE_TAG_ONLY ) {
calcBAQFromTag(read, true, true); calcBAQFromTag(read, true, true);
} else { } else {
if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) ) { if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) || calculationType == Mode.ADD_TAG_ONLY ) {
if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n"); if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n");
BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader); BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
if ( hmmResult != null ) {
if ( calculationType == Mode.ADD_TAG_ONLY )
addBAQTag(read, hmmResult.bq);
else
// modify QUALs directly
System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length); System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
}
} else { } else {
if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n"); if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n");
// this overwrites the original qualities // this overwrites the original qualities
@ -482,11 +517,15 @@ public class BAQ {
return read; return read;
} }
private static boolean stateIsIndel(int state) { /**
return (state & 3) != 0; * Returns true if we don't think this read is eligable for the BAQ calculation. Examples include non-PF reads,
} * duplicates, or unmapped reads. Used by baqRead to determine if a read should fall through the calculation.
*
private static int stateAlignedPosition(int state) { * @param read
return state >> 2; * @return
*/
public boolean excludeReadFromBAQ(SAMRecord read) {
// keeping mapped reads, regardless of pairing status, or primary alignment status.
return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag();
} }
} }

View File

@ -0,0 +1,165 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.fasta;
import net.sf.picard.PicardException;
import net.sf.picard.reference.*;
import net.sf.samtools.SAMSequenceRecord;
import java.io.File;
import java.util.Arrays;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer.
*
* Thread-safe! Uses a lock object to protect write and access to the cache.
*/
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
private static final boolean USE_CACHE = true;
private static final boolean PRINT_EFFICIENCY = false;
private static final int PRINT_FREQUENCY = 10000;
private Object lock = new Object();
long cacheHits = 0;
long cacheMisses = 0;
long cacheStart = -1;
long cacheStop = -1;
long cacheSize = 100000;
long cacheMissBackup = 100;
ReferenceSequence cache = null;
/**
* Same as general constructor but allows one to override the default cacheSize
* @param file
* @param index
* @param cacheSize
*/
public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) {
this(file, index);
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 100, 1);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
* @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
*/
public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) {
super(file, index);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
*/
public CachingIndexedFastaSequenceFile(final File file) {
super(file);
}
public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) {
super(file);
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 100, 1);
}
public void printEfficiency() {
// comment out to disable tracking
if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) {
System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency());
}
}
public double calcEfficiency() {
return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0);
}
public long getCacheHits() {
return cacheHits;
}
public long getCacheMisses() {
return cacheMisses;
}
/**
* Gets the subsequence of the contig in the range [start,stop]
* @param contig Contig whose subsequence to retrieve.
* @param start inclusive, 1-based start of region.
* @param stop inclusive, 1-based stop of region.
* @return The partial reference sequence associated with this range.
*/
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
ReferenceSequence result;
if ( ! USE_CACHE || (stop - start) >= cacheSize ) {
cacheMisses++;
result = super.getSubsequenceAt(contig, start, stop);
} else {
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
if (stop > contigInfo.getSequenceLength())
throw new PicardException("Query asks for data past end of contig");
synchronized (lock) { // access to shared cache information must be protected
if ( start < cacheStart || stop > cacheStop || cache == null || cache.getContigIndex() != contigInfo.getSequenceIndex() ) {
cacheMisses++;
cacheStart = Math.max(start - cacheMissBackup, 0);
cacheStop = Math.min(cacheStart + cacheSize, contigInfo.getSequenceLength());
cache = super.getSubsequenceAt(contig, cacheStart, cacheStop);
//System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop);
} else {
cacheHits++;
}
// at this point we determine where in the cache we want to extract the requested subsequence
int cacheOffsetStart = (int)(start - cacheStart);
int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1);
try {
result = new ReferenceSequence(cache.getName(), cache.getContigIndex(), Arrays.copyOfRange(cache.getBases(), cacheOffsetStart, cacheOffsetStop));
} catch ( ArrayIndexOutOfBoundsException e ) {
throw new ReviewedStingException(String.format("BUG: bad array indexing. Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d",
cacheStart, cacheStop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.getBases().length), e);
}
}
}
// // comment out to disable testing
// ReferenceSequence verify = super.getSubsequenceAt(contig, start, stop);
// if ( ! Arrays.equals(verify.getBases(), result.getBases()) )
// throw new ReviewedStingException(String.format("BUG: cached reference sequence not the same as clean fetched version at %s %d %d", contig, start, stop));
if ( PRINT_EFFICIENCY ) printEfficiency();
return result;
}
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -53,7 +54,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void init() throws FileNotFoundException { public void init() throws FileNotFoundException {
// sequence // sequence
seq = new IndexedFastaSequenceFile(new File(hg18Reference)); seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
} }

View File

@ -5,6 +5,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -43,7 +44,7 @@ public abstract class ReferenceViewTemplate extends BaseTest {
*/ */
@BeforeClass @BeforeClass
public void initialize() throws FileNotFoundException { public void initialize() throws FileNotFoundException {
sequenceFile = new IndexedFastaSequenceFile( new File(hg18Reference) ); sequenceFile = new CachingIndexedFastaSequenceFile( new File(hg18Reference) );
genomeLocParser = new GenomeLocParser(sequenceFile); genomeLocParser = new GenomeLocParser(sequenceFile);
} }

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
@ -49,7 +50,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void init() throws FileNotFoundException { public void init() throws FileNotFoundException {
seq = new IndexedFastaSequenceFile(new File(hg18Reference)); seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
testSite1 = genomeLocParser.createGenomeLoc("chrM",10); testSite1 = genomeLocParser.createGenomeLoc("chrM",10);

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.AfterMethod; import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
@ -64,7 +65,7 @@ public class SAMBAMDataSourceUnitTest extends BaseTest {
readers = new ArrayList<SAMReaderID>(); readers = new ArrayList<SAMReaderID>();
// sequence // sequence
seq = new IndexedFastaSequenceFile(new File(hg18Reference)); seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
} }

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.testng.Assert; import org.testng.Assert;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.file.FSLockWithShared; import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
@ -58,7 +59,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@BeforeMethod @BeforeMethod
public void setup() { public void setup() {
seq = new IndexedFastaSequenceFile(new File(b36KGReference)); seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null); builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
} }

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import static org.testng.Assert.fail; import static org.testng.Assert.fail;
@ -73,7 +74,7 @@ public class TraverseReadsUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void doOnce() { public void doOnce() {
ref = new IndexedFastaSequenceFile(refFile); ref = new CachingIndexedFastaSequenceFile(refFile);
genomeLocParser = new GenomeLocParser(ref); genomeLocParser = new GenomeLocParser(ref);
engine = new GenomeAnalysisEngine(); engine = new GenomeAnalysisEngine();

View File

@ -9,6 +9,7 @@ import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
@ -26,7 +27,7 @@ public class GenomeLocUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void init() throws FileNotFoundException { public void init() throws FileNotFoundException {
// sequence // sequence
seq = new IndexedFastaSequenceFile(new File(hg18Reference)); seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
} }

View File

@ -4,6 +4,7 @@ import org.testng.Assert;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -22,7 +23,7 @@ public class BedParserUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void beforeTests() { public void beforeTests() {
seq = new IndexedFastaSequenceFile(new File(b36KGReference)); seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
} }

View File

@ -0,0 +1,94 @@
// our package
package org.broadinstitute.sting.utils.fasta;
// the imports for unit testing.
import org.testng.Assert;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import org.broadinstitute.sting.BaseTest;
import java.io.File;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord;
/**
* Basic unit test for GenomeLoc
*/
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private File simpleFasta = new File(testDir + "/exampleFASTA.fasta");
private static final int STEP_SIZE = 1;
//private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100, 1000);
private static final List<Integer> CACHE_SIZES = Arrays.asList(10, 1000);
@DataProvider(name = "fastas")
public Object[][] createData1() {
List<Object[]> params = new ArrayList<Object[]>();
for ( File fasta : Arrays.asList(simpleFasta) ) {
for ( int cacheSize : CACHE_SIZES ) {
for ( int querySize : QUERY_SIZES ) {
params.add(new Object[]{fasta, cacheSize, querySize});
}
}
}
return params.toArray(new Object[][]{});
}
@Test(dataProvider = "fastas", enabled = true)
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) {
IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
//logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
// contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));
for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) {
int start = i;
int stop = start + querySize;
if ( stop <= contig.getSequenceLength() ) {
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
}
}
}
// Tests grabbing sequences around a middle cached value.
@Test(dataProvider = "fastas", enabled = true)
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) {
IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
int middleStart = (contig.getSequenceLength() - querySize) / 2;
int middleStop = middleStart + querySize;
for ( int i = 0; i < contig.getSequenceLength(); i += 10 ) {
int start = i;
int stop = start + querySize;
if ( stop <= contig.getSequenceLength() ) {
ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart, middleStop);
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
}
}
}
}

View File

@ -4,6 +4,7 @@ import org.testng.Assert;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
@ -68,7 +69,7 @@ public class GLFWriterUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void beforeTests() { public void beforeTests() {
seq = new IndexedFastaSequenceFile(new File(b36KGReference)); seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
} }

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
@ -39,7 +40,7 @@ public class VCFWriterUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void beforeTests() { public void beforeTests() {
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference)); IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
} }

View File

@ -7,6 +7,7 @@ import org.testng.Assert;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -27,7 +28,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
@BeforeClass @BeforeClass
public void init() { public void init() {
ReferenceSequenceFile seq = new IndexedFastaSequenceFile(reference); ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference);
genomeLocParser = new GenomeLocParser(seq); genomeLocParser = new GenomeLocParser(seq);
} }