A bit of cleanup in preparation for Picard patch.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2286 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-12-08 16:09:04 +00:00
parent d3b78338da
commit 9e2f831206
4 changed files with 63 additions and 39 deletions

View File

@ -12,32 +12,27 @@ import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
/** /**
* Created by IntelliJ IDEA. * Reads a fasta index file (.fai), as generated by `samtools faidx`.
* User: hanna
* Date: Apr 14, 2009
* Time: 10:02:10 AM
*
* Reads a fasta index file (.fai).
*/ */
public class FastaSequenceIndex implements Iterable { public class FastaSequenceIndex implements Iterable {
// Use a linked hash map to preserve the ordering of the contigs. /**
private Map<String,FastaSequenceIndexEntry> sequenceEntries = * Store the entries. Use a LinkedHashMap for consistent iteration in insertion order.
new LinkedHashMap<String,FastaSequenceIndexEntry>(); */
private Map<String,FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<String,FastaSequenceIndexEntry>();
/** /**
* Build a sequence index from the specified file. * Build a sequence index from the specified file.
* @param indexFile File to open. * @param indexFile File to open.
* @throws PicardException if file is of invalid format. * @throws FileNotFoundException if the index file cannot be found.
*/ */
public FastaSequenceIndex( File indexFile ) throws FileNotFoundException { public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
if(!indexFile.exists()) if(!indexFile.exists())
throw new FileNotFoundException(String.format("Fasta index file is missing",indexFile.getAbsolutePath())); throw new FileNotFoundException(String.format("Fasta index file is missing: %s",indexFile.getAbsolutePath()));
IoUtil.assertFileIsReadable(indexFile); IoUtil.assertFileIsReadable(indexFile);
parseIndexFile(indexFile); parseIndexFile(indexFile);
} }
/** /**
* Parse the contents of an index file, caching the results internally. * Parse the contents of an index file, caching the results internally.
* @param indexFile File to parse. * @param indexFile File to parse.
@ -92,6 +87,10 @@ public class FastaSequenceIndex implements Iterable {
return sequenceEntries.get(contigName); return sequenceEntries.get(contigName);
} }
/**
* Creates an iterator which can iterate through all entries in a fasta index.
* @return iterator over all fasta index entries.
*/
public Iterator<FastaSequenceIndexEntry> iterator() { public Iterator<FastaSequenceIndexEntry> iterator() {
return sequenceEntries.values().iterator(); return sequenceEntries.values().iterator();
} }
@ -105,6 +104,9 @@ public class FastaSequenceIndex implements Iterable {
} }
} }
/**
* Hold an individual entry in a fasta sequence index file.
*/
class FastaSequenceIndexEntry { class FastaSequenceIndexEntry {
private String contig; private String contig;
private long location; private long location;
@ -112,11 +114,19 @@ class FastaSequenceIndexEntry {
private int basesPerLine; private int basesPerLine;
private int bytesPerLine; private int bytesPerLine;
/**
* Create a new entry with the given parameters.
* @param contig Contig this entry represents.
* @param location Location (byte coordinate) in the fasta file.
* @param size The number of bases in the contig.
* @param basesPerLine How many bases are on each line.
* @param bytesPerLine How many bytes are on each line (includes newline characters).
*/
public FastaSequenceIndexEntry( String contig, public FastaSequenceIndexEntry( String contig,
long location, long location,
long size, long size,
int basesPerLine, int basesPerLine,
int bytesPerLine ) { int bytesPerLine ) {
this.contig = contig; this.contig = contig;
this.location = location; this.location = location;
this.size = size; this.size = size;
@ -165,6 +175,10 @@ class FastaSequenceIndexEntry {
return bytesPerLine; return bytesPerLine;
} }
/**
* For debugging. Emit the contents of each contig line.
* @return A string representation of the contig line.
*/
public String toString() { public String toString() {
return String.format("contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d", contig, return String.format("contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d", contig,
location, location,

View File

@ -17,34 +17,46 @@ import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Iterator; import java.util.Iterator;
import java.util.Scanner;
/** /**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 14, 2009
* Time: 2:14:26 PM
*
* A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces: * A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces:
* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter. * the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
*/ */
public class IndexedFastaSequenceFile implements ReferenceSequenceFile { public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
/**
* Stores the main fasta file.
*/
private final File file; private final File file;
private FileInputStream in;
/**
* The interface facilitating direct access to the fasta.
*/
private FileChannel channel; private FileChannel channel;
/**
* A representation of the sequence dictionary, stored alongside the fasta in a .dict file.
*/
private SAMSequenceDictionary sequenceDictionary = null; private SAMSequenceDictionary sequenceDictionary = null;
/**
* A representation of the sequence index, stored alongside the fasta in a .fasta.fai file.
*/
private FastaSequenceIndex index; private FastaSequenceIndex index;
/**
* An iterator into the fasta index, for traversing iteratively across the fasta.
*/
private Iterator<FastaSequenceIndexEntry> indexIterator; private Iterator<FastaSequenceIndexEntry> indexIterator;
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
*/
public IndexedFastaSequenceFile(File file) throws FileNotFoundException { public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
this.file = file; this.file = file;
in = new FileInputStream(file); FileInputStream in = new FileInputStream(file);
channel = in.getChannel(); channel = in.getChannel();
loadDictionary(file); loadDictionary(file);
@ -55,7 +67,6 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
/** /**
* Loads a dictionary, if available. * Loads a dictionary, if available.
* @param fastaFile File to check for a match. * @param fastaFile File to check for a match.
* TODO: This code is copied directly from FastaSequenceFile / FastaSequenceFile2. Bring it into a shared utility.
*/ */
private void loadDictionary( File fastaFile ) { private void loadDictionary( File fastaFile ) {
// Try and locate the dictionary // Try and locate the dictionary
@ -207,11 +218,18 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
return getSequence( indexIterator.next().getContig() ); return getSequence( indexIterator.next().getContig() );
} }
/**
* Reset the iterator over the index.
*/
@Override @Override
public void reset() { public void reset() {
indexIterator = index.iterator(); indexIterator = index.iterator();
} }
/**
* A simple toString implementation for debugging.
* @return String representation of the file.
*/
public String toString() { public String toString() {
return this.file.getAbsolutePath(); return this.file.getAbsolutePath();
} }

View File

@ -12,11 +12,7 @@ import java.io.FileNotFoundException;
import java.util.Iterator; import java.util.Iterator;
/** /**
* Created by IntelliJ IDEA. * Test the fasta sequence index reader.
* User: hanna
* Date: Apr 14, 2009
* Time: 10:34:15 AM
* To change this template use File | Settings | File Templates.
*/ */
public class FastaSequenceIndexTest extends BaseTest { public class FastaSequenceIndexTest extends BaseTest {
// our basic human 18 fai // our basic human 18 fai

View File

@ -16,11 +16,7 @@ import net.sf.picard.PicardException;
import net.sf.samtools.util.StringUtil; import net.sf.samtools.util.StringUtil;
/** /**
* Created by IntelliJ IDEA. * Test the indexed fasta sequence file reader.
* User: hanna
* Date: Apr 14, 2009
* Time: 2:37:29 PM
* To change this template use File | Settings | File Templates.
*/ */
public class IndexedFastaSequenceFileTest extends BaseTest { public class IndexedFastaSequenceFileTest extends BaseTest {
private static String sequenceFileName; private static String sequenceFileName;