From 339261c4a91c13d1685b64bd650fea314633b2cc Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 15 Apr 2009 18:04:13 +0000 Subject: [PATCH] Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/fasta/FastaSequenceIndex.java | 8 +++ .../utils/fasta/IndexedFastaSequenceFile.java | 72 +++++++++++++++++-- 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java index 7c140097c..89b25ccc1 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java @@ -95,6 +95,14 @@ public class FastaSequenceIndex implements Iterable { public Iterator iterator() { return sequenceEntries.values().iterator(); } + + /** + * Returns the number of elements in the index. + * @return Number of elements in the index. + */ + public int size() { + return sequenceEntries.size(); + } } class FastaSequenceIndexEntry { diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java index 0e983fa30..a3f0a0444 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.fasta; import edu.mit.broad.picard.reference.ReferenceSequenceFile; import edu.mit.broad.picard.reference.ReferenceSequence; import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.io.IoUtil; import java.io.File; import java.io.FileInputStream; @@ -17,6 +18,10 @@ import java.util.Scanner; import java.util.Iterator; import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMTextHeaderCodec; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.AsciiLineReader; /** * Created by IntelliJ IDEA. @@ -35,8 +40,10 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { private FileInputStream in; private FileChannel channel; - private final FastaSequenceIndex index; - private final Iterator indexIterator; + private SAMSequenceDictionary sequenceDictionary = null; + + private FastaSequenceIndex index; + private Iterator indexIterator; public IndexedFastaSequenceFile(File file) throws FileNotFoundException { this.file = file; @@ -44,13 +51,70 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { in = new FileInputStream(file); channel = in.getChannel(); - File indexFile = new File(file.getAbsolutePath() + ".fai"); + loadDictionary(file); + loadIndex(file); + sanityCheckDictionaryAgainstIndex(); + } + + /** + * Loads a dictionary, if available. + * @param fastaFile File to check for a match. + * TODO: This code is copied directly from FastaSequenceFile / FastaSequenceFile2. Bring it into a shared utility. + */ + private void loadDictionary( File fastaFile ) { + // Try and locate the dictionary + String dictionaryName = fastaFile.getAbsolutePath(); + dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta")); + dictionaryName += ".dict"; + final File dictionary = new File(dictionaryName); + if (dictionary.exists()) { + IoUtil.assertFileIsReadable(dictionary); + + try { + final SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); + final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary); + if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) { + this.sequenceDictionary = header.getSequenceDictionary(); + } + } + catch (Exception e) { + throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e); + } + } + + } + + /** + * Loads the index for the fasta, if present. Throws an exception if now present. + */ + private void loadIndex( File fastaFile ) throws FileNotFoundException { + File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); index = new FastaSequenceIndex(indexFile); indexIterator = index.iterator(); } + /** + * Do some basic checking to make sure the dictionary and the index match. + */ + private void sanityCheckDictionaryAgainstIndex() { + // Make sure dictionary and index are the same size. + if( sequenceDictionary.getSequences().size() != index.size() ) + throw new PicardException("Sequence dictionary and index contain different numbers of contigs"); + + for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) { + // Make sure sequence name is present in the index. + String sequenceName = sequenceRecord.getSequenceName(); + if( !index.hasIndexEntry(sequenceName) ) + throw new PicardException("Index does not contain dictionary entry: " + sequenceName ); + + // Make sure sequence length matches index length. + if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize()) + throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName ); + } + } + public SAMSequenceDictionary getSequenceDictionary() { - throw new UnsupportedOperationException("Indexed fasta files do not require dictionaries"); + return sequenceDictionary; } public ReferenceSequence getSequence( String contig ) {