Load the dictionary and sanity check it against the index.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-04-15 18:04:13 +00:00
parent 26e84d7fd6
commit 339261c4a9
2 changed files with 76 additions and 4 deletions

View File

@ -95,6 +95,14 @@ public class FastaSequenceIndex implements Iterable {
/**
 * Returns an iterator over the entries held in this index.
 * The iterator is obtained from the values view of sequenceEntries.
 * @return Iterator over the index entries.
 */
public Iterator<FastaSequenceIndexEntry> iterator() {
return sequenceEntries.values().iterator();
}
/**
 * Reports how many entries this index currently holds.
 * @return Count of entries stored in sequenceEntries.
 */
public int size() {
    final int entryCount = sequenceEntries.size();
    return entryCount;
}
}
class FastaSequenceIndexEntry {

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.fasta;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.io.IoUtil;
import java.io.File;
import java.io.FileInputStream;
@ -17,6 +18,10 @@ import java.util.Scanner;
import java.util.Iterator;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMTextHeaderCodec;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.AsciiLineReader;
/**
* Created by IntelliJ IDEA.
@ -35,8 +40,10 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
private FileInputStream in;
private FileChannel channel;
private final FastaSequenceIndex index;
private final Iterator<FastaSequenceIndexEntry> indexIterator;
private SAMSequenceDictionary sequenceDictionary = null;
private FastaSequenceIndex index;
private Iterator<FastaSequenceIndexEntry> indexIterator;
public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
this.file = file;
@ -44,13 +51,70 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
in = new FileInputStream(file);
channel = in.getChannel();
File indexFile = new File(file.getAbsolutePath() + ".fai");
loadDictionary(file);
loadIndex(file);
sanityCheckDictionaryAgainstIndex();
}
/**
 * Loads the sequence dictionary that accompanies the given fasta file, if one is available.
 * The dictionary path is derived by replacing everything from the last occurrence of
 * ".fasta" in the fasta's absolute path with ".dict".  The sequence dictionary is left
 * untouched when the path contains no ".fasta", when the derived file does not exist,
 * or when the decoded header carries no sequences.
 * TODO: This code is copied directly from FastaSequenceFile / FastaSequenceFile2. Bring it into a shared utility.
 * @param fastaFile Fasta file whose companion dictionary should be loaded.
 * @throws PicardException if the dictionary file exists but cannot be opened, decoded or closed.
 */
private void loadDictionary( File fastaFile ) {
    // Try and locate the dictionary
    final String fastaPath = fastaFile.getAbsolutePath();
    final int extensionStart = fastaPath.lastIndexOf(".fasta");
    if (extensionStart < 0) {
        // lastIndexOf returns -1 when the path has no ".fasta"; substring(0, -1) would throw.
        // Without a derivable dictionary name, treat the dictionary as unavailable.
        return;
    }

    final String dictionaryName = fastaPath.substring(0, extensionStart) + ".dict";
    final File dictionary = new File(dictionaryName);
    if (!dictionary.exists()) {
        return;
    }

    IoUtil.assertFileIsReadable(dictionary);

    try {
        final FileInputStream dictionaryStream = new FileInputStream(dictionary);
        try {
            final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
            final SAMFileHeader header = codec.decode(new AsciiLineReader(dictionaryStream), dictionary);
            if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {
                this.sequenceDictionary = header.getSequenceDictionary();
            }
        }
        finally {
            // Always release the file handle, whether or not decoding succeeded.
            dictionaryStream.close();
        }
    }
    catch (Exception e) {
        throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
    }
}
/**
 * Loads the index that accompanies the fasta file and starts an iterator over its entries.
 * The index is read from the fasta's absolute path with ".fai" appended.
 * @param fastaFile Fasta file whose index should be loaded.
 * @throws FileNotFoundException declared because constructing the index may fail to find the file.
 */
private void loadIndex( File fastaFile ) throws FileNotFoundException {
    final String indexPath = fastaFile.getAbsolutePath() + ".fai";
    index = new FastaSequenceIndex(new File(indexPath));
    indexIterator = index.iterator();
}
/**
 * Do some basic checking to make sure the dictionary and the index match:
 * both must hold the same number of contigs, and every dictionary sequence must be
 * present in the index with the same length.
 * Does nothing when no sequence dictionary has been loaded.
 * @throws PicardException if the dictionary and the index disagree.
 */
private void sanityCheckDictionaryAgainstIndex() {
    // The dictionary is optional and the field starts out null; with nothing loaded
    // there is nothing to compare, and dereferencing it would throw a NullPointerException.
    if( sequenceDictionary == null )
        return;

    // Make sure dictionary and index are the same size.
    if( sequenceDictionary.getSequences().size() != index.size() )
        throw new PicardException("Sequence dictionary and index contain different numbers of contigs");

    for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) {
        // Make sure sequence name is present in the index.
        String sequenceName = sequenceRecord.getSequenceName();
        if( !index.hasIndexEntry(sequenceName) )
            throw new PicardException("Index does not contain dictionary entry: " + sequenceName );

        // Make sure sequence length matches index length.
        if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize())
            throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName );
    }
}
/**
 * Returns the sequence dictionary held by this file.
 * @return The sequence dictionary, or null if none has been set.
 */
public SAMSequenceDictionary getSequenceDictionary() {
    // The former unconditional UnsupportedOperationException made this return unreachable.
    return sequenceDictionary;
}
public ReferenceSequence getSequence( String contig ) {