Improved BAQ implementation. Now supports adding BAQ tags to reads on the fly with ADD_TAG_ONLY option. Caching fasta reader implementation, and changes throughout the system to enable this. Many performance improvements throughout the system due to better reference access patterns.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4792 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-12-05 18:29:39 +00:00
parent 8901e63879
commit 44feb4a362
21 changed files with 378 additions and 47 deletions

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File;
@ -177,7 +178,7 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener
logger.info("Treating existing index file as complete.");
}
index = new IndexedFastaSequenceFile(fastaFile);
index = new CachingIndexedFastaSequenceFile(fastaFile);
}
catch (Exception e) {

View File

@ -16,6 +16,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File;
import java.io.FileOutputStream;
@ -76,7 +77,7 @@ public class RMDIndexer extends CommandLineProgram {
// ---------------------------------------------------------------------------------
// setup the reference
ref = new IndexedFastaSequenceFile(referenceFile);
ref = new CachingIndexedFastaSequenceFile(referenceFile);
genomeLocParser = new GenomeLocParser(ref);
// get a track builder

View File

@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -132,7 +133,7 @@ public class UnifiedGenotyperEngine {
filter.add(LOW_QUAL_FILTER_NAME);
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
referenceReader = new CachingIndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
}
/**

View File

@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
@ -212,7 +213,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 )
throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1");
referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
if ( !TARGET_NOT_SORTED ) {

View File

@ -35,6 +35,7 @@ import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
@ -72,7 +73,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter {
public MergeSegregatingAlternateAllelesVCFWriter(VCFWriter innerWriter, GenomeLocParser genomeLocParser, File referenceFile, VariantContextMergeRule vcMergeRule, VariantContextUtils.AlleleMergeRule alleleMergeRule, String singleSample, boolean emitOnlyMergedRecords, Logger logger, boolean takeOwnershipOfInner, boolean trackAltAlleleStats) {
this.innerWriter = innerWriter;
this.genomeLocParser = genomeLocParser;
this.referenceFileForMNPmerging = new IndexedFastaSequenceFile(referenceFile);
this.referenceFileForMNPmerging = new CachingIndexedFastaSequenceFile(referenceFile);
this.vcMergeRule = vcMergeRule;
this.alleleMergeRule = alleleMergeRule;
this.useSingleSample = singleSample;

View File

@ -82,7 +82,7 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
out.printf(" read bases : %s%n", new String(read.getReadBases()));
out.printf(" ref length : %d%n", baq.refBases.length);
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
printQuals(" BQ deltas : ", BAQ.getBAQDeltas(read), true);
printQuals(" BQ deltas : ", getBAQDeltas(read), true);
printQuals(" original quals: ", read.getBaseQualities(), true);
printQuals(" original quals: ", read.getBaseQualities());
printQuals(" tag quals: ", baqFromTag);
@ -133,6 +133,22 @@ public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
out.println();
}
/**
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
* @param read
* @return
*/
public static byte[] getBAQDeltas(SAMRecord read) {
byte[] baq = BAQ.getBAQTag(read);
if ( baq != null ) {
byte[] deltas = new byte[baq.length];
for ( int i = 0; i < deltas.length; i++)
deltas[i] = (byte)(-1 * (baq[i] - 64));
return deltas;
} else
return null;
}
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.ReferenceSequence;
@ -82,7 +83,7 @@ public class TestReadFishingWalker extends ReadWalker<Integer,Long> {
IndexedFastaSequenceFile referenceReader;
FileInputStream indelCallInputStream;
try {
referenceReader = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
indelCallInputStream = new FileInputStream(indelCalls);
}
catch(IOException ex) {

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -38,6 +39,7 @@ public class BAQ {
public enum Mode {
NONE, // don't apply a BAQ at all, the default
USE_TAG_ONLY, // only use the TAG, if present, in the reads
ADD_TAG_ONLY, // calculate the BAQ, but write it into the reads as the BAQ tag, leaving QUAL field alone
CALCULATE_AS_NECESSARY, // do HMM BAQ calculation on the fly, as necessary, if there's no tag
RECALCULATE_ALWAYS // do HMM BAQ calculation on the fly, regardless of whether there's a tag present
}
@ -266,6 +268,22 @@ public class BAQ {
return 0;
}
// ---------------------------------------------------------------------------------------------------------------
//
// Helper routines
//
// ---------------------------------------------------------------------------------------------------------------
/** decode the bit encoded state array values */
private static boolean stateIsIndel(int state) {
return (state & 3) != 0;
}
/** decode the bit encoded state array values */
private static int stateAlignedPosition(int state) {
return state >> 2;
}
/**
* helper routine for hmm_glocal
*
@ -296,6 +314,13 @@ public class BAQ {
return out;
}
// ---------------------------------------------------------------------------------------------------------------
//
// Actually working with the BAQ tag now
//
// ---------------------------------------------------------------------------------------------------------------
/**
* Get the BAQ attribute from the tag in read. Returns null if no BAQ tag is present.
* @param read
@ -306,22 +331,21 @@ public class BAQ {
return s != null ? s.getBytes() : null;
}
/**
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
* @param read
* @return
*/
public static byte[] getBAQDeltas(SAMRecord read) {
byte[] baq = getBAQTag(read);
if ( baq != null ) {
byte[] deltas = new byte[baq.length];
for ( int i = 0; i < deltas.length; i++)
deltas[i] = (byte)(-1 * (baq[i] - 64));
return deltas;
} else
return null;
public static String encodeBQTag(SAMRecord read, byte[] baq) {
// Offset to base alignment quality (BAQ), of the same length as the read sequence.
// At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
// so BQi = Qi - BAQi + 64
byte[] bqTag = new byte[baq.length];
for ( int i = 0; i < bqTag.length; i++)
bqTag[i] = (byte)(((int)read.getBaseQualities()[i] + 64) - baq[i]);
return new String(bqTag);
}
public static void addBAQTag(SAMRecord read, byte[] baq) {
read.setAttribute(BAQ_TAG, encodeBQTag(read, baq));
}
/**
* Returns true if the read has a BAQ tag, or false otherwise
* @param read
@ -346,7 +370,7 @@ public class BAQ {
if ( baq != null ) {
// Offset to base alignment quality (BAQ), of the same length as the read sequence.
// At the i-th read base, BAQi = Qi ? (BQi ? 64) where Qi is the i-th base quality.
// At the i-th read base, BAQi = Qi - (BQi - 64) where Qi is the i-th base quality.
newQuals = overwriteOriginalQuals ? rawQuals : new byte[rawQuals.length];
for ( int i = 0; i < rawQuals.length; i++) {
int val = rawQuals[i] - (baq[i] - 64);
@ -397,13 +421,18 @@ public class BAQ {
public BAQCalculationResult calcBAQFromHMM(SAMRecord read, IndexedFastaSequenceFile refReader) {
// start is alignment start - band width / 2 - size of first I element, if there is one. Stop is similar
int offset = getBandWidth() / 2;
int start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read);
int stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read);
long start = read.getAlignmentStart() - offset - getFirstInsertionOffset(read);
long stop = read.getAlignmentEnd() + offset + getLastInsertionOffset(read);
// now that we have the start and stop, get the reference sequence covering it
ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
if ( stop > refReader.getSequenceDictionary().getSequence(read.getReferenceName()).getSequenceLength() ) {
return null;
} else {
return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read)));
// now that we have the start and stop, get the reference sequence covering it
ReferenceSequence refSeq = refReader.getSubsequenceAt(read.getReferenceName(), start, stop);
return calcBAQFromHMM(read, refSeq.getBases(), - (offset + getFirstInsertionOffset(read)));
}
}
// we need to bad ref by at least the bandwidth / 2 on either side
@ -463,15 +492,21 @@ public class BAQ {
if ( DEBUG ) System.out.printf("BAQ %s read %s%n", calculationType, read.getReadName());
if ( calculationType == Mode.NONE ) { // we don't want to do anything
; // just fall though
} else if ( read.getReadUnmappedFlag() ) {
} else if ( excludeReadFromBAQ(read) ) {
; // just fall through
} else if ( calculationType == Mode.USE_TAG_ONLY ) {
calcBAQFromTag(read, true, true);
} else {
if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) ) {
if ( calculationType == Mode.RECALCULATE_ALWAYS || ! hasBAQTag(read) || calculationType == Mode.ADD_TAG_ONLY ) {
if ( DEBUG ) System.out.printf(" Calculating BAQ on the fly%n");
BAQCalculationResult hmmResult = calcBAQFromHMM(read, refReader);
System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
if ( hmmResult != null ) {
if ( calculationType == Mode.ADD_TAG_ONLY )
addBAQTag(read, hmmResult.bq);
else
// modify QUALs directly
System.arraycopy(hmmResult.bq, 0, read.getBaseQualities(), 0, hmmResult.bq.length);
}
} else {
if ( DEBUG ) System.out.printf(" Taking BAQ from tag%n");
// this overwrites the original qualities
@ -482,11 +517,15 @@ public class BAQ {
return read;
}
private static boolean stateIsIndel(int state) {
return (state & 3) != 0;
}
private static int stateAlignedPosition(int state) {
return state >> 2;
/**
* Returns true if we don't think this read is eligable for the BAQ calculation. Examples include non-PF reads,
* duplicates, or unmapped reads. Used by baqRead to determine if a read should fall through the calculation.
*
* @param read
* @return
*/
public boolean excludeReadFromBAQ(SAMRecord read) {
// keeping mapped reads, regardless of pairing status, or primary alignment status.
return read.getReadUnmappedFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getDuplicateReadFlag();
}
}

View File

@ -0,0 +1,165 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.fasta;
import net.sf.picard.PicardException;
import net.sf.picard.reference.*;
import net.sf.samtools.SAMSequenceRecord;
import java.io.File;
import java.util.Arrays;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer.
*
* Thread-safe! Uses a lock object to protect write and access to the cache.
*/
public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
private static final boolean USE_CACHE = true;
private static final boolean PRINT_EFFICIENCY = false;
private static final int PRINT_FREQUENCY = 10000;
private Object lock = new Object();
long cacheHits = 0;
long cacheMisses = 0;
long cacheStart = -1;
long cacheStop = -1;
long cacheSize = 100000;
long cacheMissBackup = 100;
ReferenceSequence cache = null;
/**
* Same as general constructor but allows one to override the default cacheSize
* @param file
* @param index
* @param cacheSize
*/
public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) {
this(file, index);
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 100, 1);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
* @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found.
*/
public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) {
super(file, index);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
*/
public CachingIndexedFastaSequenceFile(final File file) {
super(file);
}
public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) {
super(file);
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 100, 1);
}
public void printEfficiency() {
// comment out to disable tracking
if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) {
System.out.printf("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency());
}
}
public double calcEfficiency() {
return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0);
}
public long getCacheHits() {
return cacheHits;
}
public long getCacheMisses() {
return cacheMisses;
}
/**
* Gets the subsequence of the contig in the range [start,stop]
* @param contig Contig whose subsequence to retrieve.
* @param start inclusive, 1-based start of region.
* @param stop inclusive, 1-based stop of region.
* @return The partial reference sequence associated with this range.
*/
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
ReferenceSequence result;
if ( ! USE_CACHE || (stop - start) >= cacheSize ) {
cacheMisses++;
result = super.getSubsequenceAt(contig, start, stop);
} else {
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
if (stop > contigInfo.getSequenceLength())
throw new PicardException("Query asks for data past end of contig");
synchronized (lock) { // access to shared cache information must be protected
if ( start < cacheStart || stop > cacheStop || cache == null || cache.getContigIndex() != contigInfo.getSequenceIndex() ) {
cacheMisses++;
cacheStart = Math.max(start - cacheMissBackup, 0);
cacheStop = Math.min(cacheStart + cacheSize, contigInfo.getSequenceLength());
cache = super.getSubsequenceAt(contig, cacheStart, cacheStop);
//System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop);
} else {
cacheHits++;
}
// at this point we determine where in the cache we want to extract the requested subsequence
int cacheOffsetStart = (int)(start - cacheStart);
int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1);
try {
result = new ReferenceSequence(cache.getName(), cache.getContigIndex(), Arrays.copyOfRange(cache.getBases(), cacheOffsetStart, cacheOffsetStop));
} catch ( ArrayIndexOutOfBoundsException e ) {
throw new ReviewedStingException(String.format("BUG: bad array indexing. Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d",
cacheStart, cacheStop, start, stop, cacheOffsetStart, cacheOffsetStop, cache.getBases().length), e);
}
}
}
// // comment out to disable testing
// ReferenceSequence verify = super.getSubsequenceAt(contig, start, stop);
// if ( ! Arrays.equals(verify.getBases(), result.getBases()) )
// throw new ReviewedStingException(String.format("BUG: cached reference sequence not the same as clean fetched version at %s %d %d", contig, start, stop));
if ( PRINT_EFFICIENCY ) printEfficiency();
return result;
}
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@ -53,7 +54,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
}

View File

@ -5,6 +5,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@ -43,7 +44,7 @@ public abstract class ReferenceViewTemplate extends BaseTest {
*/
@BeforeClass
public void initialize() throws FileNotFoundException {
sequenceFile = new IndexedFastaSequenceFile( new File(hg18Reference) );
sequenceFile = new CachingIndexedFastaSequenceFile( new File(hg18Reference) );
genomeLocParser = new GenomeLocParser(sequenceFile);
}

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeMethod;
@ -49,7 +50,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
@BeforeClass
public void init() throws FileNotFoundException {
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
testSite1 = genomeLocParser.createGenomeLoc("chrM",10);

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
@ -64,7 +65,7 @@ public class SAMBAMDataSourceUnitTest extends BaseTest {
readers = new ArrayList<SAMReaderID>();
// sequence
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary());
}

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.testng.annotations.BeforeMethod;
@ -58,7 +59,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@BeforeMethod
public void setup() {
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
genomeLocParser = new GenomeLocParser(seq);
builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null);
}

View File

@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import static org.testng.Assert.fail;
@ -73,7 +74,7 @@ public class TraverseReadsUnitTest extends BaseTest {
@BeforeClass
public void doOnce() {
ref = new IndexedFastaSequenceFile(refFile);
ref = new CachingIndexedFastaSequenceFile(refFile);
genomeLocParser = new GenomeLocParser(ref);
engine = new GenomeAnalysisEngine();

View File

@ -9,6 +9,7 @@ import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import java.io.File;
import java.io.FileNotFoundException;
@ -26,7 +27,7 @@ public class GenomeLocUnitTest extends BaseTest {
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
}

View File

@ -4,6 +4,7 @@ import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@ -22,7 +23,7 @@ public class BedParserUnitTest extends BaseTest {
@BeforeClass
public void beforeTests() {
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
genomeLocParser = new GenomeLocParser(seq);
}

View File

@ -0,0 +1,94 @@
// our package
package org.broadinstitute.sting.utils.fasta;
// the imports for unit testing.
import org.testng.Assert;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import org.broadinstitute.sting.BaseTest;
import java.io.File;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord;
/**
* Basic unit test for GenomeLoc
*/
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private File simpleFasta = new File(testDir + "/exampleFASTA.fasta");
private static final int STEP_SIZE = 1;
//private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100, 1000);
private static final List<Integer> CACHE_SIZES = Arrays.asList(10, 1000);
@DataProvider(name = "fastas")
public Object[][] createData1() {
List<Object[]> params = new ArrayList<Object[]>();
for ( File fasta : Arrays.asList(simpleFasta) ) {
for ( int cacheSize : CACHE_SIZES ) {
for ( int querySize : QUERY_SIZES ) {
params.add(new Object[]{fasta, cacheSize, querySize});
}
}
}
return params.toArray(new Object[][]{});
}
@Test(dataProvider = "fastas", enabled = true)
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) {
IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
//logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
// contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));
for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) {
int start = i;
int stop = start + querySize;
if ( stop <= contig.getSequenceLength() ) {
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
}
}
}
// Tests grabbing sequences around a middle cached value.
@Test(dataProvider = "fastas", enabled = true)
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) {
IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
IndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
int middleStart = (contig.getSequenceLength() - querySize) / 2;
int middleStop = middleStart + querySize;
for ( int i = 0; i < contig.getSequenceLength(); i += 10 ) {
int start = i;
int stop = start + querySize;
if ( stop <= contig.getSequenceLength() ) {
ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart, middleStop);
ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);
Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
}
}
}
}

View File

@ -4,6 +4,7 @@ import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.testng.annotations.BeforeMethod;
@ -68,7 +69,7 @@ public class GLFWriterUnitTest extends BaseTest {
@BeforeClass
public void beforeTests() {
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference));
genomeLocParser = new GenomeLocParser(seq);
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.Test;
import org.testng.annotations.BeforeClass;
@ -39,7 +40,7 @@ public class VCFWriterUnitTest extends BaseTest {
@BeforeClass
public void beforeTests() {
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference));
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
}

View File

@ -7,6 +7,7 @@ import org.testng.Assert;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@ -27,7 +28,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
@BeforeClass
public void init() {
ReferenceSequenceFile seq = new IndexedFastaSequenceFile(reference);
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(reference);
genomeLocParser = new GenomeLocParser(seq);
}