diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index af330bba9..c8bf1e3e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.contexts; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -39,10 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; * @author hanna * @version 0.1 */ - public class ReferenceContext { - final public static boolean UPPERCASE_REFERENCE = true; - /** * Facilitates creation of new GenomeLocs. */ @@ -59,7 +55,8 @@ public class ReferenceContext { final private GenomeLoc window; /** - * The bases in the window around the current locus. If null, then bases haven't been fetched yet + * The bases in the window around the current locus. If null, then bases haven't been fetched yet. + * Bases are always upper cased */ private byte[] basesCache = null; @@ -81,7 +78,7 @@ public class ReferenceContext { * * @return */ - @Ensures("result != null") + @Ensures({"result != null", "BaseUtils.isUpperCase(result)"}) public byte[] getBases(); } @@ -146,7 +143,6 @@ public class ReferenceContext { private void fetchBasesFromProvider() { if ( basesCache == null ) { basesCache = basesProvider.getBases(); - if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache); } } @@ -176,6 +172,7 @@ public class ReferenceContext { * Get the base at the given locus. * @return The base at the given locus from the reference. 
*/ + @Ensures("BaseUtils.isUpperCase(result)") public byte getBase() { return getBases()[(locus.getStart() - window.getStart())]; } @@ -185,7 +182,7 @@ public class ReferenceContext { * @return All bases available. If the window is of size [0,0], the array will * contain only the base at the given locus. */ - @Ensures({"result != null", "result.length > 0"}) + @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) public byte[] getBases() { fetchBasesFromProvider(); return basesCache; @@ -194,6 +191,7 @@ public class ReferenceContext { /** * All the bases in the window from the current base forward to the end of the window. */ + @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) public byte[] getForwardBases() { final byte[] bases = getBases(); final int mid = locus.getStart() - window.getStart(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 8f2528e23..345f79b2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; -import net.sf.picard.reference.IndexedFastaSequenceFile; +import com.google.java.contract.Requires; import net.sf.samtools.*; import net.sf.samtools.util.RuntimeIOException; import net.sf.samtools.util.SequenceUtil; @@ -276,7 +276,7 @@ public class IndelRealigner extends ReadWalker { protected String OUT_SNPS = null; // fasta reference reader to supplement the edges of the reference sequence - private IndexedFastaSequenceFile referenceReader; + private CachingIndexedFastaSequenceFile referenceReader; // the intervals input by the user private Iterator intervals = null; @@ -1603,7 +1603,8 @@ public class IndelRealigner extends ReadWalker { public List getReads() { 
return reads; } - public byte[] getReference(IndexedFastaSequenceFile referenceReader) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getReference(CachingIndexedFastaSequenceFile referenceReader) { // set up the reference if we haven't done so yet if ( reference == null ) { // first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read) @@ -1611,7 +1612,6 @@ public class IndelRealigner extends ReadWalker { int padRight = Math.min(loc.getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(loc.getContig()).getSequenceLength()); loc = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), padLeft, padRight); reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); - StringUtil.toUpperCase(reference); } return reference; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 69920ece4..53a49d8b2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -24,33 +24,6 @@ public class BaseUtils { public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; - public enum Base { - A('A', 0), - C('C', 1), - G('G', 2), - T('T', 3); - - byte b; - int index; - - private Base(char base, int index) { - this.b = (byte) base; - this.index = index; - } - - public byte getBase() { return b; } - - public char getBaseAsChar() { return (char) b; } - - public int getIndex() { return index; } - - public boolean sameBase(byte o) { return b == o; } - - public boolean sameBase(char o) { return b == (byte) o; } - - public boolean sameBase(int i) { return index == i; } - } - static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); @@ -130,6 +103,17 @@ public class 
BaseUtils { return false; } + public static boolean isUpperCase(final byte[] bases) { + for ( byte base : bases ) + if ( ! isUpperCase(base) ) + return false; + return true; + } + + public static boolean isUpperCase(final byte base) { + return base >= 'A' && base <= 'Z'; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -271,59 +255,6 @@ public class BaseUtils { } } - /** - * Converts a base index to a base index representing its cross-talk partner - * - * @param baseIndex 0, 1, 2, 3 - * @return 1, 0, 3, 2, or -1 if the index can't be understood - */ - static public int crossTalkPartnerIndex(int baseIndex) { - switch (baseIndex) { - case 0: - return 1; // A -> C - case 1: - return 0; // C -> A - case 2: - return 3; // G -> T - case 3: - return 2; // T -> G - default: - return -1; - } - } - - /** - * Converts a base to the base representing its cross-talk partner - * - * @param base [AaCcGgTt] - * @return C, A, T, G, or '.' if the base can't be understood - */ - @Deprecated - static public char crossTalkPartnerBase(char base) { - return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); - } - - /** - * Return the complement of a base index. - * - * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) - * @return the complementary base index - */ - static public byte complementIndex(int baseIndex) { - switch (baseIndex) { - case 0: - return 3; // a -> t - case 1: - return 2; // c -> g - case 2: - return 1; // g -> c - case 3: - return 0; // t -> a - default: - return -1; // wtf? - } - } - /** * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). 
* @@ -350,7 +281,7 @@ public class BaseUtils { } @Deprecated - static public char simpleComplement(char base) { + static private char simpleComplement(char base) { return (char) simpleComplement((byte) base); } @@ -370,22 +301,6 @@ public class BaseUtils { return rcbases; } - /** - * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) - * - * @param bases the byte array of bases - * @return the complement of the base byte array - */ - static public byte[] simpleComplement(byte[] bases) { - byte[] rcbases = new byte[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[i]); - } - - return rcbases; - } - /** * Reverse complement a char array of bases * @@ -403,23 +318,6 @@ public class BaseUtils { return rcbases; } - /** - * Complement a char array of bases - * - * @param bases the char array of bases - * @return the complement of the base char array - */ - @Deprecated - static public char[] simpleComplement(char[] bases) { - char[] rcbases = new char[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[i]); - } - - return rcbases; - } - /** * Reverse complement a String of bases. Preserves ambiguous bases. * @@ -431,17 +329,6 @@ public class BaseUtils { return new String(simpleReverseComplement(bases.getBytes())); } - /** - * Complement a String of bases. Preserves ambiguous bases. - * - * @param bases the String of bases - * @return the complement of the String - */ - @Deprecated - static public String simpleComplement(String bases) { - return new String(simpleComplement(bases.getBytes())); - } - /** * Returns the uppercased version of the bases * @@ -543,82 +430,4 @@ public class BaseUtils { return randomBaseIndex; } - - /** - * Return a random base (A, C, G, T). 
- * - * @return a random base (A, C, G, T) - */ - @Deprecated - static public byte getRandomBase() { - return getRandomBase('.'); - } - - /** - * Return a random base, excluding some base. - * - * @param excludeBase the base to exclude - * @return a random base, excluding the one specified (A, C, G, T) - */ - @Deprecated - static public byte getRandomBase(char excludeBase) { - return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); - } - - /** - * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, - * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). - * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period - * of 2 (it has a period of 4 as well), and so does - * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is - * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range - * or if specified minPeriod is greater than the sequence length. - * - * @param seq - * @return - */ - public static int sequencePeriod(byte[] seq, int minPeriod) { - int period = (minPeriod > seq.length ? seq.length : minPeriod); - // we assume that bases [0,period-1] repeat themselves and check this assumption - // until we find correct period - - for (int pos = period; pos < seq.length; pos++) { - - int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' - // if our current hypothesis holds, base[pos] must be the same as base[offset] - - if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) { - - // period we have been trying so far does not work. - // two possibilities: - // A) offset = 0, i.e. 
current position pos must be start of the next repeat, but it is not; - // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; - // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance - // pos will be autoincremented and we will be checking next base - // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? - // hence we should first check if it matches the first base of the sequence, and to do that - // we set period to pos (thus trying the hypothesis that bases from start up to the current one, - // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base - // on the next loop re-entrance after pos is autoincremented) - if (offset == 0) - period = pos + 1; - else - period = pos--; - - } - } - return period; - } } - -/* code snippet for testing sequencePeriod(): - * - * String str = "CCTTG"; - int p = 0; - System.out.print("Periods of " + str +" are:"); - while ( p < str.length() ) { - p = sequencePeriod(str, p+1); - System.out.print(" "+p); - } - System.out.println(); System.exit(1); -*/ diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index db54851dd..0e8a3ea70 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -40,6 +41,8 @@ import 
java.util.Arrays; * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * * Thread-safe! Uses a thread-local cache + * + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -54,10 +57,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { public static final long DEFAULT_CACHE_SIZE = 1000000; /** The cache size of this CachingIndexedFastaSequenceFile */ - final long cacheSize; + private final long cacheSize; /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ - final long cacheMissBackup; + private final long cacheMissBackup; + + /** + * If true, we will preserve the case of the original base in the genome rather than converting it to upper case + */ + private final boolean preserveCase; // information about checking efficiency long cacheHits = 0; @@ -84,37 +92,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { /** * Same as general constructor but allows one to override the default cacheSize * - * @param fasta - * @param index - * @param cacheSize + * @param fasta the file we will read our FASTA sequence from. 
+ * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * @param fasta The file to open. - * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. - * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. - */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { - this(fasta, index, DEFAULT_CACHE_SIZE); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk - * - * @param fasta The file to open. - */ - public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE); + this.preserveCase = preserveCase; } /** @@ -124,12 +112,76 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * Uses provided cacheSize instead of the default * * @param fasta The file to open. 
+ * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { super(fasta); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); + this.preserveCase = preserveCase; + } + +// /** +// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. +// * +// * @param fasta The file to open. +// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. +// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. +// */ +// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { +// this(fasta, index, DEFAULT_CACHE_SIZE); +// } + + /** + * Same as general constructor but allows one to override the default cacheSize + * + * By default, this CachingIndexedFastaReader converts all incoming bases to upper case + * + * @param fasta the file we will read our FASTA sequence from. + * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + */ + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + this(fasta, index, cacheSize, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk. 
+ * This CachingIndexedFastaReader will convert all FASTA bases to upper case under the hood + * + * @param fasta The file to open. + */ + public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { + this(fasta, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for an index file for fasta on disk + * + * @param fasta The file to open. + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + */ + public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { + this(fasta, DEFAULT_CACHE_SIZE, preserveCase); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for an index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. + * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + this(fasta, cacheSize, false); } /** @@ -168,6 +220,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return cacheSize; } + /** + * Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is + * everything being made upper case? + * + * @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased + */ + public boolean isPreservingCase() { + return preserveCase; + } + + /** + * Is this reader upper-casing bases? + * + * @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if they are in the original case in the fasta + */ + public boolean isUppercasingBases() { + return ! 
isPreservingCase(); + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -177,8 +248,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param contig Contig whose subsequence to retrieve. * @param start inclusive, 1-based start of region. * @param stop inclusive, 1-based stop of region. - * @return The partial reference sequence associated with this range. + * @return The partial reference sequence associated with this range. If preserveCase is false, then + * all of the bases in the ReferenceSequence returned by this method will be upper cased. */ + @Override public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { final ReferenceSequence result; final Cache myCache = cache.get(); @@ -186,6 +259,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { if ( (stop - start) >= cacheSize ) { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); + if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); } else { // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); @@ -198,7 +272,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { myCache.start = Math.max(start - cacheMissBackup, 0); myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); - //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); + + // convert all of the bases in the sequence to upper case if we aren't preserving cases + if ( ! 
preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); } else { cacheHits++; } @@ -215,8 +291,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } } + // for debugging -- print out our efficiency if requested if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) printEfficiency(Priority.INFO); + return result; } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index 736162300..bcd846184 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -30,6 +30,7 @@ import java.util.concurrent.Executors; public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); private static final int STEP_SIZE = 1; + private final static boolean DEBUG = false; //private static final List QUERY_SIZES = Arrays.asList(1); private static final List QUERY_SIZES = Arrays.asList(1, 10, 100); @@ -53,9 +54,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested; } - @Test(dataProvider = "fastas", enabled = true) + @Test(dataProvider = "fastas", enabled = true && ! 
DEBUG) public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", @@ -64,6 +65,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { + Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers"); + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -92,10 +95,10 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } // Tests grabbing sequences around a middle cached value. - @Test(dataProvider = "fastas", enabled = true) + @Test(dataProvider = "fastas", enabled = true && ! 
DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -123,11 +126,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @DataProvider(name = "ParallelFastaTest") public Object[][] createParallelFastaTest() { List params = new ArrayList(); -// for ( int nt : Arrays.asList(1, 2, 3) ) { -// for ( int cacheSize : CACHE_SIZES ) { -// params.add(new Object[]{simpleFasta, cacheSize, 10, nt}); -// } -// } for ( File fasta : Arrays.asList(simpleFasta) ) { for ( int cacheSize : CACHE_SIZES ) { @@ -143,9 +141,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } - @Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000) + @Test(dataProvider = "ParallelFastaTest", enabled = true && ! 
DEBUG, timeOut = 60000) public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); for ( int iterations = 0; iterations < 1; iterations++ ) { @@ -163,4 +161,49 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { executor.shutdownNow(); } } + + // make sure some bases are lower case and some are upper case + @Test(enabled = true) + public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException { + final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); + final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true); + final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA)); + + int nMixedCase = 0; + for ( SAMSequenceRecord contig : original.getSequenceDictionary().getSequences() ) { + nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1); + + final int step = 100; + for ( int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step ) { + testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos); + } + } + + Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. 
Unexpected test state"); + } + + private int testCases(final IndexedFastaSequenceFile original, + final IndexedFastaSequenceFile casePreserving, + final IndexedFastaSequenceFile allUpper, + final String contig, final int start, final int stop ) { + final String orig = fetchBaseString(original, contig, start, stop); + final String keptCase = fetchBaseString(casePreserving, contig, start, stop); + final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase(); + + final String origToUpper = orig.toUpperCase(); + if ( ! orig.equals(origToUpper) ) { + Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig); + Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig); + return 1; + } else { + return 0; + } + } + + private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) { + if ( start == -1 ) + return new String(reader.getSequence(contig).getBases()); + else + return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); + } }