From 4995950d04772437eec2b7c85c28b02732516780 Mon Sep 17 00:00:00 2001 From: hanna Date: Thu, 1 Jul 2010 04:40:31 +0000 Subject: [PATCH] IndexedFastaSequenceFile is now in Picard; transitioning to that implementation. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3701 348d0f76-0448-11de-a6fe-93d51630548a --- .../reference}/FastaSequenceIndexBuilder.java | 24 +- .../bwa/java/AlignerTestHarness.java | 2 +- .../providers/LocusShardDataProvider.java | 3 +- .../providers/ReadShardDataProvider.java | 3 +- .../datasources/providers/ReferenceView.java | 2 +- .../providers/ShardDataProvider.java | 3 +- .../shards/LocusShardStrategy.java | 2 +- .../shards/ShardStrategyFactory.java | 2 +- .../ReferenceDataSource.java | 10 +- .../executive/HierarchicalMicroScheduler.java | 3 +- .../gatk/executive/LinearMicroScheduler.java | 2 +- .../sting/gatk/executive/MicroScheduler.java | 3 +- .../walkers/TestReadFishingWalker.java | 2 +- .../HybSelPerformanceWalker.java | 7 +- .../sting/utils/fasta/FastaSequenceIndex.java | 268 ---------------- .../utils/fasta/IndexedFastaSequenceFile.java | 285 ------------------ .../FastaSequenceIndexBuilderUnitTest.java | 17 +- .../VariantContextUnitTest.java | 2 +- .../ReferenceOrderedViewUnitTest.java | 3 +- .../providers/ReferenceViewTemplate.java | 2 +- .../ReferenceOrderedDataPoolUnitTest.java | 2 +- .../SAMBAMDataSourceUnitTest.java | 2 +- .../BoundedReadIteratorUnitTest.java | 2 +- .../gatk/refdata/TabularRODUnitTest.java | 2 +- .../VariantContextAdaptorsUnitTest.java | 8 +- .../refdata/features/vcf4/VCF4UnitTest.java | 9 +- .../tracks/RMDTrackManagerUnitTest.java | 8 +- .../traversals/TraverseReadsUnitTest.java | 9 +- .../sting/utils/GenomeLocUnitTest.java | 2 +- .../sting/utils/bed/BedParserUnitTest.java | 9 +- .../fasta/FastaSequenceIndexUnitTest.java | 250 --------------- .../IndexedFastaSequenceFileUnitTest.java | 272 ----------------- .../utils/genotype/glf/GLFWriterUnitTest.java | 8 +- .../utils/genotype/vcf/VCFWriterUnitTest.java | 11 +- 34 files changed, 72 insertions(+), 1167 deletions(-) rename java/src/{org/broadinstitute/sting/utils/fasta => net/sf/picard/reference}/FastaSequenceIndexBuilder.java (93%) delete mode 100755 java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java delete mode 100755 java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java rename java/test/{org/broadinstitute/sting/utils/fasta => net/sf/picard/reference}/FastaSequenceIndexBuilderUnitTest.java (86%) delete mode 100755 java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexUnitTest.java delete mode 100755 java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileUnitTest.java diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilder.java b/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java similarity index 93% rename from java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilder.java rename to java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java index 9a5cd2e75..49929fcfc 100644 --- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilder.java +++ b/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java @@ -23,15 +23,17 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.fasta; +package net.sf.picard.reference; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSourceProgressListener; import org.broadinstitute.sting.utils.StingException; -import static org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder.Status.*; +import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*; import java.io.*; import java.util.Iterator; +import net.sf.picard.reference.FastaSequenceIndex; + /** * Builds FastaSequenceIndex from fasta file. * Produces fai file with same output as samtools faidx @@ -46,6 +48,7 @@ public class FastaSequenceIndexBuilder { // vars that store information about the contig that is currently being read String contig; long location, size, bytesPerLine, basesPerLine, basesThisLine; + int thisSequenceIndex = 0; // vars that keep loop state byte lastByte = 0, currentByte = 0, nextByte = 0; @@ -241,7 +244,7 @@ public class FastaSequenceIndexBuilder { * Reset iterators and add contig to sequence index */ private void finishReadingContig(FastaSequenceIndex sequenceIndex) { - sequenceIndex.addIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine); + sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); status = Status.NONE; contig = ""; size = 0; @@ -271,11 +274,9 @@ public class FastaSequenceIndexBuilder { faiFile.getAbsolutePath()), e); } - Iterator iter = sequenceIndex.iterator(); - try { - while (iter.hasNext()) { - out.write(iter.next().toIndexFileLine()); + for(FastaSequenceIndexEntry entry: sequenceIndex) { + out.write(toIndexFileLine(entry)); out.newLine(); } out.close(); @@ -284,4 +285,13 @@ public class FastaSequenceIndexBuilder { throw new StingException(String.format("An error occurred while writing file %s", e)); } } + + /** + * Print string in format of fai file line + * @return Contig as one line in a fai file + */ + private static String toIndexFileLine(FastaSequenceIndexEntry entry) { + return String.format("%s\t%d\t%d\t%d\t%d", entry.getContig(), entry.getSize(), entry.getLocation(), entry.getBasesPerLine(), entry.getBytesPerLine()); + } + } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java b/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java index 10c5e6452..1365f9e2b 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java @@ -4,12 +4,12 @@ import org.broadinstitute.sting.alignment.Aligner; import org.broadinstitute.sting.alignment.Alignment; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; import net.sf.samtools.*; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * A test harness to ensure that the perfect aligner works. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java index 168cd98f0..0d728a668 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.iterators.LocusIterator; @@ -9,6 +8,8 @@ import org.broadinstitute.sting.gatk.Reads; import java.util.Collection; +import net.sf.picard.reference.IndexedFastaSequenceFile; + /** * Presents data sharded by locus to the traversal engine. * diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java index 0141be312..b6490794b 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadShardDataProvider.java @@ -4,10 +4,11 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.util.Collection; +import net.sf.picard.reference.IndexedFastaSequenceFile; + /** * Present data sharded by read to a traversal engine. * diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java index f90c7c10c..1ce1886bf 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReferenceView.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; @@ -14,6 +13,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * User: hanna * Date: May 22, 2009 diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java index 216f14eb7..d0c9914a7 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java @@ -2,12 +2,13 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import java.util.ArrayList; import java.util.List; import java.util.Collection; + +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * User: hanna * Date: May 8, 2009 diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShardStrategy.java index 8561aa5dc..664b2886c 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/LocusShardStrategy.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.datasources.shards; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; @@ -36,6 +35,7 @@ import java.util.*; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMFileSpan; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * A sharding strategy for loci based on reading of the index. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java index 3da56c110..c8ff705f1 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java @@ -1,9 +1,9 @@ package org.broadinstitute.sting.gatk.datasources.shards; import net.sf.samtools.SAMSequenceDictionary; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java index bcc79e024..4defcdd80 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java @@ -25,17 +25,13 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.fasta.FastaSequenceIndex; -import org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import net.sf.picard.reference.FastaSequenceIndexBuilder; import net.sf.picard.sam.CreateSequenceDictionary; -import org.broadinstitute.sting.utils.file.FSLock; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.reference.FastaSequenceIndex; import java.io.File; -import java.io.IOException; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import java.nio.channels.FileLock; diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 5134f0eeb..6f552ba2f 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -9,7 +9,6 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrde import org.broadinstitute.sting.gatk.io.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import javax.management.MBeanServer; @@ -24,6 +23,8 @@ import java.util.concurrent.Future; import java.util.concurrent.FutureTask; import java.lang.management.ManagementFactory; +import net.sf.picard.reference.IndexedFastaSequenceFile; + /** * A microscheduler that schedules shards according to a tree-like structure. * Requires a special walker tagged with a 'TreeReducible' interface. diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index e66c86450..7da467d16 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -13,12 +13,12 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.util.Collection; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** A micro-scheduling manager for single-threaded execution of a traversal. */ diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 01a729c74..603696616 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -38,12 +38,13 @@ import org.broadinstitute.sting.gatk.iterators.NullSAMIterator; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import java.util.*; import java.io.File; +import net.sf.picard.reference.IndexedFastaSequenceFile; + /** * Created by IntelliJ IDEA. diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java index e5337f199..b32d7e61e 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestReadFishingWalker.java @@ -36,10 +36,10 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.IndexedFastaSequenceFile; import java.io.File; import java.io.FileInputStream; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java index b7b086394..2758df86e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.hybridselection; import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.picard.util.OverlapDetector; @@ -49,7 +50,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.IOException; @@ -300,7 +300,6 @@ public class HybSelPerformanceWalker extends LocusWalker sequenceEntries = new LinkedHashMap(); - - /** - * Build a sequence index from the specified file. - * @param indexFile File to open. - * @throws FileNotFoundException if the index file cannot be found. - */ - protected FastaSequenceIndex( File indexFile ) throws FileNotFoundException { - if(!indexFile.exists()) - throw new FileNotFoundException(String.format("Fasta index file is missing: %s",indexFile.getAbsolutePath())); - - IoUtil.assertFileIsReadable(indexFile); - parseIndexFile(indexFile); - } - - /** - * Build an empty sequence index. Entries can be added later. - */ - protected FastaSequenceIndex() { - - } - - /** - * Parse the contents of an index file, caching the results internally. - * @param indexFile File to parse. - * @throws FileNotFoundException Thrown if file could not be opened. - */ - private void parseIndexFile(File indexFile) throws FileNotFoundException { - Scanner scanner = new Scanner(indexFile); - - while( scanner.hasNext() ) { - // Tokenize and validate the index line. - String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)"); - if( result == null ) - throw new PicardException("Found invalid line in index file:" + scanner.nextLine()); - MatchResult tokens = scanner.match(); - if( tokens.groupCount() != 5 ) - throw new PicardException("Found invalid line in index file:" + scanner.nextLine()); - - // Skip past the line separator - scanner.nextLine(); - - // Parse the index line. - String contig = tokens.group(1); - long size = Long.valueOf(tokens.group(2)); - long location = Long.valueOf(tokens.group(3)); - int basesPerLine = Integer.valueOf(tokens.group(4)); - int bytesPerLine = Integer.valueOf(tokens.group(5)); - - // Build sequence structure - sequenceEntries.put( contig,new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine) ); - } - } - - /** - * Does the given contig name have a corresponding entry? - * @param contigName The contig name for which to search. - * @return True if contig name is present; false otherwise. - */ - public boolean hasIndexEntry( String contigName ) { - return sequenceEntries.containsKey(contigName); - } - - /** - * Retrieve the index entry associated with the given contig. - * @param contigName Name of the contig for which to search. - * @return Index entry associated with the given contig. - * @throws PicardException if the associated index entry can't be found. - */ - public FastaSequenceIndexEntry getIndexEntry( String contigName ) { - if( !hasIndexEntry(contigName) ) - throw new PicardException("Unable to find entry for contig: " + contigName); - - return sequenceEntries.get(contigName); - } - - /** - * Creates an iterator which can iterate through all entries in a fasta index. - * @return iterator over all fasta index entries. - */ - public Iterator iterator() { - return sequenceEntries.values().iterator(); - } - - /** - * Returns the number of elements in the index. - * @return Number of elements in the index. - */ - public int size() { - return sequenceEntries.size(); - } - - /** - * Adds entry to index. Used by Fai file generator to create index entry on the fly. - * @param contig The name of the contig - * @param location Byte-referenced location of contig in file - * @param size Number of bases in contig - * @param basesPerLine Number of bases in each line. Must be uniform. - * @param bytesPerLine Number of bytes in each line. Must be uniform. - */ - public void addIndexEntry(String contig, long location, long size, int basesPerLine, int bytesPerLine) { - sequenceEntries.put( contig,new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine) ); - } - - /** - * Compare two FastaSequenceIndex objects. Built for use in testing. No hash function has been created. - * @param other Another FastaSequenceIndex to compare - * @return True if index has the same entries as other instance, in the same order - */ - public boolean equals(FastaSequenceIndex other) { - Iterator iter = this.iterator(); - Iterator otherIter = other.iterator(); - while (iter.hasNext()) { - if (!otherIter.hasNext()) - return false; - if (!iter.next().equals(otherIter.next())) - return false; - } - return true; - } -} - -/** - * Hold an individual entry in a fasta sequence index file. - */ -class FastaSequenceIndexEntry { - private final String contig; - private final long location; - private final long size; - private final int basesPerLine; - private final int bytesPerLine; - - /** - * Create a new entry with the given parameters. - * @param contig Contig this entry represents. - * @param location Location (byte coordinate) in the fasta file. - * @param size The number of bases in the contig. - * @param basesPerLine How many bases are on each line. - * @param bytesPerLine How many bytes are on each line (includes newline characters). - */ - public FastaSequenceIndexEntry( String contig, - long location, - long size, - int basesPerLine, - int bytesPerLine ) { - this.contig = contig; - this.location = location; - this.size = size; - this.basesPerLine = basesPerLine; - this.bytesPerLine = bytesPerLine; - } - - /** - * Gets the contig associated with this entry. - * @return String representation of the contig. - */ - public String getContig() { - return contig; - } - - /** - * Gets the location of this contig within the fasta. - * @return seek position within the fasta. - */ - public long getLocation() { - return location; - } - - /** - * Gets the size, in bytes, of the data in the contig. - * @return size of the contig bases in bytes. - */ - public long getSize() { - return size; - } - - /** - * Gets the number of bases in a given line. - * @return Number of bases in the fasta line. - */ - public int getBasesPerLine() { - return basesPerLine; - } - - /** - * How many bytes (bases + whitespace) are consumed by the - * given line? - * @return Number of bytes in a line. - */ - public int getBytesPerLine() { - return bytesPerLine; - } - - /** - * For debugging. Emit the contents of each contig line. - * @return A string representation of the contig line. - */ - public String toString() { - return String.format("contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d", contig, - location, - size, - basesPerLine, - bytesPerLine ); - } - - /** - * Print string in format of fai file line - * @return Contig as one line in a fai file - */ - public String toIndexFileLine() { - return String.format("%s\t%d\t%d\t%d\t%d", contig, size, location, basesPerLine, bytesPerLine); - } - - /** - * Compare entry to another instance - * @param other another FastaSequenceIndexEntry - * @return True if each has the same name, location, size, basesPerLine and bytesPerLine - */ - public boolean equals(FastaSequenceIndexEntry other) { - return (contig.equals(other.contig) && size == other.size && location == other.location - && basesPerLine == other.basesPerLine && bytesPerLine == other.bytesPerLine); - } -} diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java deleted file mode 100755 index 47007a344..000000000 --- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java +++ /dev/null @@ -1,285 +0,0 @@ -package org.broadinstitute.sting.utils.fasta; - -import net.sf.picard.PicardException; -import net.sf.picard.io.IoUtil; -import net.sf.picard.reference.ReferenceSequence; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.SAMTextHeaderCodec; -import net.sf.samtools.util.AsciiLineReader; -import org.broadinstitute.sting.utils.StingException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.util.Iterator; - -/** - * A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces: - * the ReferenceSequenceFile for old-style, stateful lookups and a direct getter. - */ -public class IndexedFastaSequenceFile implements ReferenceSequenceFile { - /** - * Size of the read buffer. - */ - private static final int BUFFER_SIZE = 128 * 1024; - - /** - * Stores the main fasta file. - */ - private final File file; - - /** - * The interface facilitating direct access to the fasta. - */ - private FileChannel channel; - - /** - * A representation of the sequence dictionary, stored alongside the fasta in a .dict file. - */ - private SAMSequenceDictionary sequenceDictionary = null; - - /** - * A representation of the sequence index, stored alongside the fasta in a .fasta.fai file. - */ - private FastaSequenceIndex index; - - /** - * An iterator into the fasta index, for traversing iteratively across the fasta. - */ - private Iterator indexIterator; - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * @param file The file to open. - * @throws FileNotFoundException If the fasta or any of its supporting files cannot be found. - */ - public IndexedFastaSequenceFile(File file) throws FileNotFoundException { - this.file = file; - FileInputStream in = new FileInputStream(file); - channel = in.getChannel(); - - loadDictionary(file); - loadIndex(file); - sanityCheckDictionaryAgainstIndex(); - } - - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * @param file The file to open. - * @param sequenceIndex FastaSequenceIndex that was previously created - * @throws FileNotFoundException If the fasta or any of its supporting files cannot be found. - */ - public IndexedFastaSequenceFile(File file, FastaSequenceIndex sequenceIndex) throws FileNotFoundException { - this.file = file; - FileInputStream in = new FileInputStream(file); - channel = in.getChannel(); - - loadDictionary(file); - // Temporary change: sequenceIndex is passed in directly. See comments in ReferenceDataSource. - index = sequenceIndex; - sanityCheckDictionaryAgainstIndex(); - } - - /** - * Always returns true for this implementation. - * @return True. - */ - public boolean isIndexed() { - return true; - } - - /** - * Loads a dictionary, if available. - * @param fastaFile File to check for a match. - */ - private void loadDictionary( File fastaFile ) { - // Try and locate the dictionary - String dictionaryName = fastaFile.getAbsolutePath(); - dictionaryName = dictionaryName.substring(0, getFastaFileExtensionStart(dictionaryName)); - dictionaryName += ".dict"; - final File dictionary = new File(dictionaryName); - if (!dictionary.exists()) - throw new PicardException("Unable to load .dict file. Dictionary is required for the indexed fasta reader."); - - IoUtil.assertFileIsReadable(dictionary); - - try { - final SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), - dictionary.toString()); - if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) { - this.sequenceDictionary = header.getSequenceDictionary(); - } - } - catch (Exception e) { - throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e); - } - - } - - /** - * Gets the index of the first character in the fasta file's extension. - * @param filename The filename of the fasta. Must not be null, and must end with either '.fasta' or '.fa'. - * @return The index of the start of the extension within the filename. If neither '.fasta' nor '.fa' are - * present in the filename, a StingException will be thrown. - */ - private int getFastaFileExtensionStart( String filename ) { - if( filename.endsWith(".fasta") ) - return filename.lastIndexOf(".fasta"); - else if( filename.endsWith(".fa") ) - return filename.lastIndexOf(".fa"); - else - throw new StingException("Invalid fasta filename; fasta filename must end with '.fasta' or '.fa'."); - } - - /** - * Loads the index for the fasta, if present. Throws an exception if now present. - * @param fastaFile FASTA file to load. - * @throws FileNotFoundException if FASTA file cannot be found. - */ - private void loadIndex( File fastaFile ) throws FileNotFoundException { - File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); - if (!indexFile.exists()) - throw new PicardException(String.format("Unable to load fasta index file %s. "+ - "Please create it using 'samtools faidx'.",indexFile.getAbsolutePath())); - index = new FastaSequenceIndex(indexFile); - reset(); - } - - /** - * Do some basic checking to make sure the dictionary and the index match. - */ - private void sanityCheckDictionaryAgainstIndex() { - // Make sure dictionary and index are the same size. - if( sequenceDictionary.getSequences().size() != index.size() ) - throw new PicardException("Sequence dictionary and index contain different numbers of contigs"); - - for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) { - // Make sure sequence name is present in the index. - String sequenceName = sequenceRecord.getSequenceName(); - if( !index.hasIndexEntry(sequenceName) ) - throw new PicardException("Index does not contain dictionary entry: " + sequenceName ); - - // Make sure sequence length matches index length. - if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize()) - throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName ); - } - } - - /** - * Retrieves the sequence dictionary for the fasta file. - * @return sequence dictionary of the fasta. - */ - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - /** - * Retrieves the complete sequence described by this contig. - * @param contig contig whose data should be returned. - * @return The full sequence associated with this contig. - */ - public ReferenceSequence getSequence( String contig ) { - return getSubsequenceAt( contig, 1, (int)index.getIndexEntry(contig).getSize() ); - } - - /** - * Gets the subsequence of the contig in the range [start,stop] - * @param contig Contig whose subsequence to retrieve. - * @param start inclusive, 1-based start of region. - * @param stop inclusive, 1-based stop of region. - * @return The partial reference sequence associated with this range. - */ - public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { - if(start > stop) - throw new PicardException(String.format("Malformed query; start point %d lies after end point %d",start,stop)); - - FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig); - - if(stop > indexEntry.getSize()) - throw new PicardException("Query asks for data past end of contig"); - - int length = (int)(stop - start + 1); - - byte[] target = new byte[length]; - ByteBuffer targetBuffer = ByteBuffer.wrap(target); - - final int basesPerLine = indexEntry.getBasesPerLine(); - final int bytesPerLine = indexEntry.getBytesPerLine(); - final int terminatorLength = bytesPerLine - basesPerLine; - - long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine; - - // Allocate a 128K buffer for reading in sequence data. - ByteBuffer channelBuffer = ByteBuffer.allocate(BUFFER_SIZE); - - while(targetBuffer.position() < length) { - // If the bufferOffset is currently within the eol characters in the string, push the bufferOffset forward to the next printable character. - startOffset += Math.max((int)(startOffset%bytesPerLine - basesPerLine + 1),0); - - try { - startOffset += channel.read(channelBuffer,indexEntry.getLocation()+startOffset); - } - catch(IOException ex) { - throw new PicardException("Unable to map FASTA file into memory."); - } - - // Reset the buffer for outbound transfers. - channelBuffer.flip(); - - // Calculate the size of the next run of bases based on the contents we've already retrieved. - final int positionInContig = (int)start-1+targetBuffer.position(); - final int nextBaseSpan = Math.min(basesPerLine-positionInContig%basesPerLine,length-targetBuffer.position()); - // Cap the bytes to transfer by limiting the nextBaseSpan to the size of the channel buffer. - int bytesToTransfer = Math.min(nextBaseSpan,channelBuffer.capacity()); - - channelBuffer.limit(channelBuffer.position()+bytesToTransfer); - - while(channelBuffer.hasRemaining()) { - targetBuffer.put(channelBuffer); - - bytesToTransfer = Math.min(basesPerLine,length-targetBuffer.position()); - channelBuffer.limit(Math.min(channelBuffer.position()+bytesToTransfer+terminatorLength,channelBuffer.capacity())); - channelBuffer.position(Math.min(channelBuffer.position()+terminatorLength,channelBuffer.capacity())); - } - - // Reset the buffer for inbound transfers. - channelBuffer.flip(); - } - - return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), target ); - } - - /** - * Gets the next sequence if available, or null if not present. - * @return next sequence if available, or null if not present. - */ - public ReferenceSequence nextSequence() { - if( !indexIterator.hasNext() ) - return null; - return getSequence( indexIterator.next().getContig() ); - } - - /** - * Reset the iterator over the index. - */ - @Override - public void reset() { - indexIterator = index.iterator(); - } - - /** - * A simple toString implementation for debugging. - * @return String representation of the file. - */ - public String toString() { - return this.file.getAbsolutePath(); - } -} diff --git a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilderUnitTest.java b/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java similarity index 86% rename from java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilderUnitTest.java rename to java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java index af28a4e88..904de15d0 100644 --- a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilderUnitTest.java +++ b/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java @@ -23,9 +23,8 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.fasta; +package net.sf.picard.reference; -import net.sf.picard.PicardException; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSourceProgressListener; import org.junit.Assert; @@ -61,7 +60,7 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { fastaFile = new File(validationDataLocation + "exampleFASTA.fasta"); builder = new FastaSequenceIndexBuilder(fastaFile, progress); FastaSequenceIndex index = builder.createIndex(); - controlIndex.addIndexEntry("chr1", 6, 100000, 60, 61); + controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0)); Assert.assertTrue(index.equals(controlIndex)); } @@ -78,7 +77,7 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { fastaFile = new File(validationDataLocation + "exampleFASTA-windows.fasta"); builder = new FastaSequenceIndexBuilder(fastaFile, progress); FastaSequenceIndex index = builder.createIndex(); - controlIndex.addIndexEntry("chr2", 7, 29, 7, 9); + controlIndex.add(new FastaSequenceIndexEntry("chr2", 7, 29, 7, 9,0)); Assert.assertTrue(index.equals(controlIndex)); } @@ -94,8 +93,8 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { fastaFile = new File(validationDataLocation + "exampleFASTA-combined.fasta"); builder = new FastaSequenceIndexBuilder(fastaFile, progress); FastaSequenceIndex index = builder.createIndex(); - controlIndex.addIndexEntry("chr1", 6, 100000, 60, 61); - controlIndex.addIndexEntry("chr2", 101680, 29, 7, 9); + controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0)); + controlIndex.add(new FastaSequenceIndexEntry("chr2", 101680, 29, 7, 9,1)); Assert.assertTrue(index.equals(controlIndex)); } @@ -111,9 +110,9 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest { fastaFile = new File(validationDataLocation + "exampleFASTA-3contigs.fasta"); builder = new FastaSequenceIndexBuilder(fastaFile, progress); FastaSequenceIndex index = builder.createIndex(); - controlIndex.addIndexEntry("chr1", 6, 17, 5, 6); - controlIndex.addIndexEntry("chr2", 35, 21, 7, 8); - controlIndex.addIndexEntry("chr3", 66, 100, 10, 11); + controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 17, 5, 6,0)); + controlIndex.add(new FastaSequenceIndexEntry("chr2", 35, 21, 7, 8,1)); + controlIndex.add(new FastaSequenceIndexEntry("chr3", 66, 100, 10, 11,2)); Assert.assertTrue(index.equals(controlIndex)); } diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUnitTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUnitTest.java index 5cf6f3abb..e565e9124 100755 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUnitTest.java @@ -7,7 +7,6 @@ package org.broadinstitute.sting.gatk.contexts.variantcontext; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -19,6 +18,7 @@ import java.io.FileNotFoundException; import java.io.File; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * Basic unit test for RecalData diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index f10dfb3a4..06fc52397 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.TabularROD; import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; @@ -19,6 +18,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.util.Arrays; import java.util.Collections; + +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * User: hanna * Date: May 27, 2009 diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java index bf7bcdb9b..5d5cb9424 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceViewTemplate.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.datasources.providers; import net.sf.samtools.SAMSequenceRecord; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.BeforeClass; import org.junit.Test; diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java index e0e80cfa2..bb16fd3ea 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java @@ -8,7 +8,6 @@ import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; @@ -18,6 +17,7 @@ import java.io.File; import java.io.FileNotFoundException; import static org.junit.Assert.assertTrue; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * User: hanna * Date: May 21, 2009 diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java index 435065ba0..d5a612b76 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMBAMDataSourceUnitTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import static junit.framework.Assert.fail; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.shards.Shard; @@ -11,7 +12,6 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.After; import org.junit.Before; import org.junit.Test; diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java index d63d40664..64f851eba 100755 --- a/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/iterators/BoundedReadIteratorUnitTest.java @@ -5,10 +5,10 @@ import static junit.framework.Assert.fail; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.junit.Assert; diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java index 584101b64..05ad1f13f 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java @@ -5,12 +5,12 @@ package org.broadinstitute.sting.gatk.refdata; // the imports for unit testing. import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java index d842e264d..851081b4e 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsUnitTest.java @@ -4,13 +4,13 @@ import edu.mit.broad.picard.genotype.geli.GeliFileReader; import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; import net.sf.samtools.SAMFileReader; import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broad.tribble.gelitext.GeliTextCodec; import org.broad.tribble.gelitext.GeliTextFeature; import org.broad.tribble.util.AsciiLineReader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import org.broadinstitute.sting.utils.genotype.geli.GeliGenotypeWriter; @@ -43,11 +43,7 @@ public class VariantContextAdaptorsUnitTest extends BaseTest { @BeforeClass public static void beforeClass() { - try { - seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "/reference/human_b36_both.fasta")); // TODO: make human reference use BaseTest - } catch (FileNotFoundException e) { - Assert.fail("Unable to load reference " + oneKGLocation + "/reference/human_b36_both.fasta"); - } + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "/reference/human_b36_both.fasta")); // TODO: make human reference use BaseTest GenomeLocParser.setupRefContigOrdering(seq); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 01311b0da..e133cb95a 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; import org.junit.Assert; import org.junit.BeforeClass; @@ -25,6 +24,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import net.sf.picard.reference.IndexedFastaSequenceFile; + /** * test out pieces of the VCF 4 codec. */ @@ -36,11 +37,7 @@ public class VCF4UnitTest extends BaseTest { @BeforeClass public static void setupContig() { IndexedFastaSequenceFile seq; - try { - seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); - } catch (FileNotFoundException e) { - throw new StingException("unable to load the sequence dictionary"); - } + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary()); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java index ad812b7e4..a0ba235e4 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java @@ -24,11 +24,11 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceRecord; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -100,11 +100,7 @@ public class RMDTrackManagerUnitTest extends BaseTest { // @Test used only to determine how fast queries are, don't uncomment! (unless you know what you're doing). public void testSpeedOfRealQuery() { IndexedFastaSequenceFile file = null; - try { - file = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta")); - } catch (FileNotFoundException e) { - Assert.assertTrue(false); - } + file = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta")); final int intervalSize = 10000000; GenomeLocParser.setupRefContigOrdering(file.getSequenceDictionary()); RMDTrackManager manager = new RMDTrackManager(); diff --git a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index c7e8d2c1b..52f125302 100755 --- a/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -12,7 +13,6 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import static org.junit.Assert.fail; import org.junit.Before; import org.junit.Test; @@ -106,12 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest { @Test public void testUnmappedReadCount() { IndexedFastaSequenceFile ref = null; - try { - ref = new IndexedFastaSequenceFile(refFile); - } - catch (FileNotFoundException ex) { - throw new RuntimeException("File not found opening fasta file; please do this check before MicroManaging", ex); - } + ref = new IndexedFastaSequenceFile(refFile); GenomeLocParser.setupRefContigOrdering(ref); SAMDataSource dataSource = new SAMDataSource(new Reads(bamList)); diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index f02f7fa86..d4ec57808 100644 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -8,12 +8,12 @@ import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.IndexedFastaSequenceFile; /** * Basic unit test for GenomeLoc diff --git a/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java b/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java index 377f60b49..0ca87e0fe 100644 --- a/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.bed; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; @@ -14,6 +13,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.util.List; +import net.sf.picard.reference.IndexedFastaSequenceFile; + public class BedParserUnitTest extends BaseTest { @@ -22,11 +23,7 @@ public class BedParserUnitTest extends BaseTest { @BeforeClass public static void beforeTests() { - try { - seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); - } catch (FileNotFoundException e) { - throw new StingException("unable to load the sequence dictionary"); - } + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); GenomeLocParser.setupRefContigOrdering(seq); } diff --git a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexUnitTest.java b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexUnitTest.java deleted file mode 100755 index 584c6b36c..000000000 --- a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexUnitTest.java +++ /dev/null @@ -1,250 +0,0 @@ -package org.broadinstitute.sting.utils.fasta; - -import net.sf.picard.PicardException; -import org.broadinstitute.sting.BaseTest; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Iterator; - -/** - * Test the fasta sequence index reader. - */ -public class FastaSequenceIndexUnitTest extends BaseTest { - // our basic human 18 fai - private static String sequenceIndexName = null; - private FastaSequenceIndex sequenceIndex = null; - - // a custom index that tests the colon, and semi-colon, and other random characters - private static String sequenceIndexColonSemiColonTestName = null; - private FastaSequenceIndex sequenceIndexColonSemiColonTest = null; - - - @BeforeClass - public static void initialize() { - sequenceIndexName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai"; - sequenceIndexColonSemiColonTestName = validationDataLocation + "testing.fai"; - } - - @Before - public void doForEachTest() throws FileNotFoundException { - sequenceIndex = new FastaSequenceIndex( new File(sequenceIndexName) ); - sequenceIndexColonSemiColonTest = new FastaSequenceIndex( new File(sequenceIndexColonSemiColonTestName) ); - } - - @Test - public void testInitialContig() { - logger.warn("Executing testInitialContig"); - - Assert.assertTrue("Contig chrM is not present", sequenceIndex.hasIndexEntry("chrM")); - FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chrM"); - Assert.assertEquals("Contig chrM name is incorrect",entry.getContig(),"chrM"); - Assert.assertEquals("Contig chrM location is incorrect",entry.getLocation(),6L); - Assert.assertEquals("Contig chrM size is incorrect",entry.getSize(),16571L); - Assert.assertEquals("Contig chrM bases per line is incorrect",entry.getBasesPerLine(),50); - Assert.assertEquals("Contig chrM bytes per line is incorrect",entry.getBytesPerLine(),51); - } - - @Test - public void testMiddleContig() { - logger.warn("Executing testMiddleContig"); - - Assert.assertTrue("Contig chr8 is not present", sequenceIndex.hasIndexEntry("chr8")); - FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chr8"); - Assert.assertEquals("Contig chr8 name is incorrect",entry.getContig(),"chr8"); - Assert.assertEquals("Contig chr8 location is incorrect",entry.getLocation(),1419403101L); - Assert.assertEquals("Contig chr8 size is incorrect",entry.getSize(),146274826L); - Assert.assertEquals("Contig chr8 bases per line is incorrect",entry.getBasesPerLine(),50); - Assert.assertEquals("Contig chr8 bytes per line is incorrect",entry.getBytesPerLine(),51); - } - - @Test - public void testLastContig() { - logger.warn("Executing testLastContig"); - - Assert.assertTrue("Contig chrX_random is not present", sequenceIndex.hasIndexEntry("chrX_random")); - FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chrX_random"); - Assert.assertEquals("Contig chrX_random name is incorrect",entry.getContig(),"chrX_random"); - Assert.assertEquals("Contig chrX_random location is incorrect",entry.getLocation(),3156698441L); - Assert.assertEquals("Contig chrX_random size is incorrect",entry.getSize(),1719168L); - Assert.assertEquals("Contig chrX_random bases per line is incorrect",entry.getBasesPerLine(),50); - Assert.assertEquals("Contig chrX_random bytes per line is incorrect",entry.getBytesPerLine(),51); - } - - @Test - public void testAllContigsPresent() { - logger.warn("Executing testAllContigsPresent"); - - Assert.assertTrue("Contig chrM is not present", sequenceIndex.hasIndexEntry("chrM")); - Assert.assertTrue("Contig chr1 is not present", sequenceIndex.hasIndexEntry("chr1")); - Assert.assertTrue("Contig chr2 is not present", sequenceIndex.hasIndexEntry("chr2")); - Assert.assertTrue("Contig chr3 is not present", sequenceIndex.hasIndexEntry("chr3")); - Assert.assertTrue("Contig chr4 is not present", sequenceIndex.hasIndexEntry("chr4")); - Assert.assertTrue("Contig chr5 is not present", sequenceIndex.hasIndexEntry("chr5")); - Assert.assertTrue("Contig chr6 is not present", sequenceIndex.hasIndexEntry("chr6")); - Assert.assertTrue("Contig chr7 is not present", sequenceIndex.hasIndexEntry("chr7")); - Assert.assertTrue("Contig chr8 is not present", sequenceIndex.hasIndexEntry("chr8")); - Assert.assertTrue("Contig chr9 is not present", sequenceIndex.hasIndexEntry("chr9")); - Assert.assertTrue("Contig chr10 is not present", sequenceIndex.hasIndexEntry("chr10")); - Assert.assertTrue("Contig chr11 is not present", sequenceIndex.hasIndexEntry("chr11")); - Assert.assertTrue("Contig chr12 is not present", sequenceIndex.hasIndexEntry("chr12")); - Assert.assertTrue("Contig chr13 is not present", sequenceIndex.hasIndexEntry("chr13")); - Assert.assertTrue("Contig chr14 is not present", sequenceIndex.hasIndexEntry("chr14")); - Assert.assertTrue("Contig chr15 is not present", sequenceIndex.hasIndexEntry("chr15")); - Assert.assertTrue("Contig chr16 is not present", sequenceIndex.hasIndexEntry("chr16")); - Assert.assertTrue("Contig chr17 is not present", sequenceIndex.hasIndexEntry("chr17")); - Assert.assertTrue("Contig chr18 is not present", sequenceIndex.hasIndexEntry("chr18")); - Assert.assertTrue("Contig chr19 is not present", sequenceIndex.hasIndexEntry("chr19")); - Assert.assertTrue("Contig chr20 is not present", sequenceIndex.hasIndexEntry("chr20")); - Assert.assertTrue("Contig chr21 is not present", sequenceIndex.hasIndexEntry("chr21")); - Assert.assertTrue("Contig chr22 is not present", sequenceIndex.hasIndexEntry("chr22")); - Assert.assertTrue("Contig chrX is not present", sequenceIndex.hasIndexEntry("chrX")); - Assert.assertTrue("Contig chrY is not present", sequenceIndex.hasIndexEntry("chrY")); - Assert.assertTrue("Contig chr1_random is not present", sequenceIndex.hasIndexEntry("chr1_random")); - Assert.assertTrue("Contig chr2_random is not present", sequenceIndex.hasIndexEntry("chr2_random")); - Assert.assertTrue("Contig chr3_random is not present", sequenceIndex.hasIndexEntry("chr3_random")); - Assert.assertTrue("Contig chr4_random is not present", sequenceIndex.hasIndexEntry("chr4_random")); - Assert.assertTrue("Contig chr5_random is not present", sequenceIndex.hasIndexEntry("chr5_random")); - Assert.assertTrue("Contig chr6_random is not present", sequenceIndex.hasIndexEntry("chr6_random")); - Assert.assertTrue("Contig chr7_random is not present", sequenceIndex.hasIndexEntry("chr7_random")); - Assert.assertTrue("Contig chr8_random is not present", sequenceIndex.hasIndexEntry("chr8_random")); - Assert.assertTrue("Contig chr9_random is not present", sequenceIndex.hasIndexEntry("chr9_random")); - Assert.assertTrue("Contig chr10_random is not present", sequenceIndex.hasIndexEntry("chr10_random")); - Assert.assertTrue("Contig chr11_random is not present", sequenceIndex.hasIndexEntry("chr11_random")); - Assert.assertTrue("Contig chr13_random is not present", sequenceIndex.hasIndexEntry("chr13_random")); - Assert.assertTrue("Contig chr15_random is not present", sequenceIndex.hasIndexEntry("chr15_random")); - Assert.assertTrue("Contig chr16_random is not present", sequenceIndex.hasIndexEntry("chr16_random")); - Assert.assertTrue("Contig chr17_random is not present", sequenceIndex.hasIndexEntry("chr17_random")); - Assert.assertTrue("Contig chr18_random is not present", sequenceIndex.hasIndexEntry("chr18_random")); - Assert.assertTrue("Contig chr19_random is not present", sequenceIndex.hasIndexEntry("chr19_random")); - Assert.assertTrue("Contig chr21_random is not present", sequenceIndex.hasIndexEntry("chr21_random")); - Assert.assertTrue("Contig chr22_random is not present", sequenceIndex.hasIndexEntry("chr22_random")); - Assert.assertTrue("Contig chrX_random is not present", sequenceIndex.hasIndexEntry("chrX_random")); - } - - @Test - public void testHasInvalidEntry() { - logger.warn("Executing testHasInvalidEntry"); - - Assert.assertFalse("Found an invalid entry", sequenceIndex.hasIndexEntry("invalid")); - } - - @Test(expected= PicardException.class) - public void testGetInvalidEntry() { - logger.warn("Executing testGetInvalidEntry"); - - sequenceIndex.getIndexEntry("invalid"); - } - - @Test - public void testIteration() { - logger.warn("Executing testIteration"); - - Iterator sequenceIndexEntries = sequenceIndex.iterator(); - - Assert.assertEquals("Contig chrM is not present", "chrM", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr1 is not present", "chr1", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr2 is not present", "chr2", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr3 is not present", "chr3", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr4 is not present", "chr4", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr5 is not present", "chr5", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr6 is not present", "chr6", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr7 is not present", "chr7", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr8 is not present", "chr8", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr9 is not present", "chr9", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr10 is not present", "chr10", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr11 is not present", "chr11", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr12 is not present", "chr12", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr13 is not present", "chr13", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr14 is not present", "chr14", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr15 is not present", "chr15", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr16 is not present", "chr16", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr17 is not present", "chr17", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr18 is not present", "chr18", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr19 is not present", "chr19", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr20 is not present", "chr20", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr21 is not present", "chr21", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr22 is not present", "chr22", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chrX is not present", "chrX", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chrY is not present", "chrY", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr1_random is not present", "chr1_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr2_random is not present", "chr2_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr3_random is not present", "chr3_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr4_random is not present", "chr4_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr5_random is not present", "chr5_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr6_random is not present", "chr6_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr7_random is not present", "chr7_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr8_random is not present", "chr8_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr9_random is not present", "chr9_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr10_random is not present", "chr10_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr11_random is not present", "chr11_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr13_random is not present", "chr13_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr15_random is not present", "chr15_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr16_random is not present", "chr16_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr17_random is not present", "chr17_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr18_random is not present", "chr18_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr19_random is not present", "chr19_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr21_random is not present", "chr21_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chr22_random is not present", "chr22_random", sequenceIndexEntries.next().getContig()); - Assert.assertEquals("Contig chrX_random is not present", "chrX_random", sequenceIndexEntries.next().getContig()); - Assert.assertFalse("Iterator still has more entries", sequenceIndexEntries.hasNext()); - } - - @Test - public void testSpecialCharacters() { - /* file contents: - chrM 16571 6 50 51 - chr1;boat 247249719 16915 50 51 - chr2:money 242951149 252211635 50 51 - chr3::; 199501827 500021813 50 51 - ;;;;;; 1234 1234 1234 1234 - file:gi|17981852|ref|NC_001807.4| 16571 2911876801 70 71 - */ - Iterator sequenceIndexEntries = sequenceIndexColonSemiColonTest.iterator(); - FastaSequenceIndexEntry ent = sequenceIndexEntries.next(); - Assert.assertEquals("Contig chrM is not present","chrM",ent.getContig()); - Assert.assertEquals("Contig chrM size is not correct",16571,ent.getSize()); - Assert.assertEquals("Contig chrM location is not correct",6,ent.getLocation()); - Assert.assertEquals("Contig chrM bases per line is not correct",50,ent.getBasesPerLine()); - Assert.assertEquals("Contig chrM bytes per line is not correct",51,ent.getBytesPerLine()); - - ent = sequenceIndexEntries.next(); - Assert.assertEquals("Contig chr1;boat is not present","chr1;boat",ent.getContig()); - Assert.assertEquals("Contig chr1;boat size is not correct",247249719,ent.getSize()); - Assert.assertEquals("Contig chr1;boat location is not correct",16915,ent.getLocation()); - Assert.assertEquals("Contig chr1;boat bases per line is not correct",50,ent.getBasesPerLine()); - Assert.assertEquals("Contig chr1;boat bytes per line is not correct",51,ent.getBytesPerLine()); - - ent = sequenceIndexEntries.next(); - Assert.assertEquals("Contig chr2:money is not present","chr2:money",ent.getContig()); - Assert.assertEquals("Contig chr2:money size is not correct",242951149,ent.getSize()); - Assert.assertEquals("Contig chr2:money location is not correct",252211635,ent.getLocation()); - Assert.assertEquals("Contig chr2:money bases per line is not correct",50,ent.getBasesPerLine()); - Assert.assertEquals("Contig chr2:money bytes per line is not correct",51,ent.getBytesPerLine()); - - ent = sequenceIndexEntries.next(); - Assert.assertEquals("Contig chr3::; is not present","chr3::;",ent.getContig()); - Assert.assertEquals("Contig chr3::; size is not correct",199501827,ent.getSize()); - Assert.assertEquals("Contig chrM location is not correct",500021813,ent.getLocation()); - Assert.assertEquals("Contig chr3::; bases per line is not correct",50,ent.getBasesPerLine()); - Assert.assertEquals("Contig chr3::; bytes per line is not correct",51,ent.getBytesPerLine()); - - ent = sequenceIndexEntries.next(); - Assert.assertEquals("Contig ;;;;;;;; is not present",";;;;;;;;",ent.getContig()); - Assert.assertEquals("Contig ;;;;;;;; size is not correct",123,ent.getSize()); - Assert.assertEquals("Contig ;;;;;;;; location is not correct",234,ent.getLocation()); - Assert.assertEquals("Contig ;;;;;;;; bases per line is not correct",456,ent.getBasesPerLine()); - Assert.assertEquals("Contig ;;;;;;;; bytes per line is not correct",789,ent.getBytesPerLine()); - - ent = sequenceIndexEntries.next(); - Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| is not present","file:gi|17981852|ref|NC_001807.4|",ent.getContig()); - Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| size is not correct",16571,ent.getSize()); - Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| location is not correct",2911876801L,ent.getLocation()); - Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| bases per line is not correct",70,ent.getBasesPerLine()); - Assert.assertEquals("Contig file:gi|17981852|ref|NC_001807.4| bytes per line is not correct",71,ent.getBytesPerLine()); - } -} diff --git a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileUnitTest.java b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileUnitTest.java deleted file mode 100755 index 0cbf0ed8c..000000000 --- a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileUnitTest.java +++ /dev/null @@ -1,272 +0,0 @@ -package org.broadinstitute.sting.utils.fasta; - -import org.junit.BeforeClass; -import org.junit.Before; -import org.junit.Test; -import org.junit.Assert; -import org.broadinstitute.sting.BaseTest; - -import java.io.File; -import java.io.FileNotFoundException; - -import net.sf.picard.reference.ReferenceSequence; -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.picard.PicardException; -import net.sf.samtools.util.StringUtil; - -/** - * Test the indexed fasta sequence file reader. - */ -public class IndexedFastaSequenceFileUnitTest extends BaseTest { - private static String sequenceFileName; - private IndexedFastaSequenceFile sequenceFile = null; - - private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT"; - private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" + - "TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG" + - "GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT"; - private final String firstBasesOfChr1 = "taaccctaaccctaacccta"; - private final String firstBasesOfChr8 = "GCAATTATGACACAAAAAAT"; - - @BeforeClass - public static void initialize() { - sequenceFileName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; - } - - @Before - public void doForEachTest() throws FileNotFoundException { - sequenceFile = new IndexedFastaSequenceFile( new File(sequenceFileName) ); - } - - @Test - public void testOpenFile() { - long startTime = System.currentTimeMillis(); - Assert.assertNotNull( sequenceFile ); - long endTime = System.currentTimeMillis(); - - System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testFirstSequence() { - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,firstBasesOfChrM.length()); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 0); - Assert.assertArrayEquals( "First n bases of chrM are incorrect",StringUtil.stringToBytes(firstBasesOfChrM),sequence.getBases()); - - System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testFirstSequenceExtended() { - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",1,extendedBasesOfChrM.length()); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 0); - Assert.assertArrayEquals( "First n bases of chrM are incorrect",StringUtil.stringToBytes(extendedBasesOfChrM),sequence.getBases()); - - System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testReadStartingInCenterOfFirstLine() { - final int bytesToChopOff = 5; - String truncated = extendedBasesOfChrM.substring(bytesToChopOff); - - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", - bytesToChopOff + 1, - bytesToChopOff + truncated.length()); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 0); - Assert.assertArrayEquals( "First n bases of chrM are incorrect",StringUtil.stringToBytes(truncated),sequence.getBases()); - - System.err.printf("testReadStartingInCenterOfFirstLine runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testReadStartingInCenterOfMiddleLine() { - final int bytesToChopOff = 120; - String truncated = extendedBasesOfChrM.substring(bytesToChopOff); - - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", - bytesToChopOff + 1, - bytesToChopOff + truncated.length()); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 0); - Assert.assertArrayEquals( "First n bases of chrM are incorrect",StringUtil.stringToBytes(truncated),sequence.getBases()); - - System.err.printf("testReadStartingInCenterOfMiddleLine runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testFirstCompleteContigRead() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSequence("chrM"); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 0); - Assert.assertArrayEquals("chrM is incorrect",expectedSequence.getBases(),sequence.getBases()); - - System.err.printf("testFirstCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; - } - - @Test(expected= PicardException.class) - public void testReadThroughEndOfContig() { - long startTime = System.currentTimeMillis(); - try { - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16500,16600); - } - finally { - long endTime = System.currentTimeMillis(); - System.err.printf("testReadThroughEndOfContig runtime: %dms%n", (endTime - startTime)) ; - } - } - - @Test(expected= PicardException.class) - public void testReadPastEndOfContig() { - long startTime = System.currentTimeMillis(); - try { - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16800,16900); - } - finally { - long endTime = System.currentTimeMillis(); - System.err.printf("testReadPastEndOfContig runtime: %dms%n", (endTime - startTime)) ; - } - } - - @Test - public void testMiddleCompleteContigRead() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - while( !expectedSequence.getName().equals("chrY") ) - expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSequence("chrY"); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrY"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 24); - Assert.assertArrayEquals("chrY is incorrect",expectedSequence.getBases(),sequence.getBases()); - - System.err.printf("testMiddleCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testLastCompleteContigRead() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - while( !expectedSequence.getName().equals("chrX_random") ) - expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSequence("chrX_random"); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrX_random"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 44); - Assert.assertArrayEquals("chrX_random is incorrect",expectedSequence.getBases(),sequence.getBases()); - - System.err.printf("testLastCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; - } - - - @Test - public void testFirstOfChr1() { - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",1,firstBasesOfChr1.length()); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr1"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 1); - Assert.assertArrayEquals( "First n bases of chr1 are incorrect",StringUtil.stringToBytes(firstBasesOfChr1),sequence.getBases()); - - System.err.printf("testFirstOfChr1 runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testFirstOfChr8() { - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",1,firstBasesOfChr8.length()); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr8"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 8); - Assert.assertArrayEquals( "First n bases of chr8 are incorrect",StringUtil.stringToBytes(firstBasesOfChr8),sequence.getBases()); - - System.err.printf("testFirstOfChr8 runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testFirstElementOfIterator() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.nextSequence(); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 0); - Assert.assertArrayEquals("chrM is incorrect",expectedSequence.getBases(),sequence.getBases()); - - System.err.printf("testFirstElementOfIterator runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testNextElementOfIterator() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - // Skip past the first one and load the second one. - originalSequenceFile.nextSequence(); - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - sequenceFile.nextSequence(); - ReferenceSequence sequence = sequenceFile.nextSequence(); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", "chr1", sequence.getName()); - Assert.assertEquals("Sequence contig index is not correct", 1, sequence.getContigIndex()); - Assert.assertEquals("Sequence size is not correct", expectedSequence.length(), sequence.length()); - Assert.assertArrayEquals("chr1 is incorrect",expectedSequence.getBases(),sequence.getBases()); - - System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ; - } - - @Test - public void testReset() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - // Skip past the first one and load the second one. - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - sequenceFile.nextSequence(); - sequenceFile.nextSequence(); - sequenceFile.reset(); - ReferenceSequence sequence = sequenceFile.nextSequence(); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", "chrM", sequence.getName()); - Assert.assertEquals("Sequence contig index is not correct", 0, sequence.getContigIndex()); - Assert.assertEquals("Sequence size is not correct", expectedSequence.length(), sequence.length()); - Assert.assertArrayEquals("chrM is incorrect", expectedSequence.getBases(),sequence.getBases()); - - System.err.printf("testReset runtime: %dms%n", (endTime - startTime)) ; - } -} diff --git a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java index 3be13ae8a..ab7416f1b 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterUnitTest.java @@ -4,7 +4,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import org.junit.Assert; import org.junit.Before; @@ -15,6 +14,7 @@ import java.io.File; import java.io.FileNotFoundException; import net.sf.samtools.SAMSequenceRecord; +import net.sf.picard.reference.IndexedFastaSequenceFile; /* @@ -66,11 +66,7 @@ public class GLFWriterUnitTest extends BaseTest { @BeforeClass public static void beforeTests() { IndexedFastaSequenceFile seq; - try { - seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); - } catch (FileNotFoundException e) { - throw new StingException("unable to load the sequence dictionary"); - } + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "reference/human_b36_both.fasta")); GenomeLocParser.setupRefContigOrdering(seq); } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 97fcf1e30..3e183a9bc 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.genotype.vcf; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; import org.junit.Assert; @@ -14,6 +13,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.util.*; +import net.sf.picard.reference.IndexedFastaSequenceFile; + /** * @author aaron @@ -29,12 +30,8 @@ public class VCFWriterUnitTest extends BaseTest { @BeforeClass public static void beforeTests() { - try { - IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); - GenomeLocParser.setupRefContigOrdering(seq); - } catch (FileNotFoundException e) { - throw new StingException("unable to load the sequence dictionary"); - } + IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + GenomeLocParser.setupRefContigOrdering(seq); } /** test, using the writer and reader, that we can output and input a VCF file without problems */