221 lines
7.9 KiB
Java
221 lines
7.9 KiB
Java
|
|
/*
|
||
|
|
* The Broad Institute
|
||
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||
|
|
* This software and its documentation are copyright 2008 by the
|
||
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||
|
|
*
|
||
|
|
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||
|
|
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||
|
|
* or functionality.
|
||
|
|
*/
|
||
|
|
package edu.mit.broad.arachne;
|
||
|
|
|
||
|
|
|
||
|
|
import edu.mit.broad.sam.util.CloseableIterator;
|
||
|
|
|
||
|
|
import java.io.*;
|
||
|
|
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Reader for arachne Fastb files.
|
||
|
|
*/
|
||
|
|
public class FastbReader
|
||
|
|
implements CloseableIterator<String> {
|
||
|
|
|
||
|
|
// Notes on fastb file format
|
||
|
|
//
|
||
|
|
// Fastb files contain the serialized contents of an arachne vecbasevector,
|
||
|
|
// which is a typedef for mastervec<basevector, unsigned int>.
|
||
|
|
// The serialization of mastervec objects starts with a 24 byte mv_file_control_block,
|
||
|
|
// followed by N variable length segments (one for each element of the mastervec vector),
|
||
|
|
// followed by an offset table containing N 8-byte file offsets to the N variable length
|
||
|
|
// segments, followed by N fixed length data segments, one for each vector element.
|
||
|
|
// Thus, reading a single element of the mastervec vector requires reading from three
|
||
|
|
// separate places in the file (the offset table, the variable length section and the
|
||
|
|
// fixed length section).
|
||
|
|
//
|
||
|
|
// The mastervec file header is 24 bytes arranged as follows:
|
||
|
|
// n 4-byte signed(?) integer (number of entries)
|
||
|
|
// c1 1-byte unsigned bit mask (see below)
|
||
|
|
// reserved 1-byte unused
|
||
|
|
// sizeX 1-byte unsigned, sizeof first template parameter (16 for fastb files)
|
||
|
|
// sizeA 1-byte unsigned, sizeof second template parameter (4 for fastb files)
|
||
|
|
// offsets_start 8-byte signed(?) integer, file offset of offset table
|
||
|
|
// static_start 8-byte signed(?) integer, file offset of static data (fixed size section)
|
||
|
|
//
|
||
|
|
// For fastb files, the fixed size section contains 4 bytes for each object, which is the
|
||
|
|
// unsigned(?) count of the number of bases in this entry.
|
||
|
|
// For fastb files, the variable length section contains a bit vector with two bits per base.
|
||
|
|
// The bases are encoded as follows: A = 0, C = 1, G = 2, T = 3.
|
||
|
|
//
|
||
|
|
// For fastb files, in the file header N is the number of entries in the fastb file.
|
||
|
|
// c1 is unused/unimplemented except that the two low-order bits should be 0x01, indicating
|
||
|
|
// that we are using the single-file representation. There is also apparently a three-file
|
||
|
|
// representation that looks the same except that the offset table and static (fixed length)
|
||
|
|
// table are in separate files named <basename>.offsets and <basename>.static.
|
||
|
|
// The sizeX should be 16 for fastb files and sizeA should be 4.
|
||
|
|
//
|
||
|
|
// Note that in fastb files, the sequences are not identified by name or id, only by index
|
||
|
|
// (zero based) into the mastervec object. There is no representation for bases other than
|
||
|
|
// ACGT (i.e. Ns cannot be encoded).
|
||
|
|
|
||
|
|
private static final char[] BASES = { 'A', 'C', 'G', 'T' };
|
||
|
|
|
||
|
|
private File mFile;
|
||
|
|
private RandomAccessFile mRandomFile;
|
||
|
|
private int mEntryCount;
|
||
|
|
private long mOffsetTableOffset;
|
||
|
|
private long mLengthTableOffset;
|
||
|
|
private int mCurrentPosition;
|
||
|
|
private byte[] mIOBuffer = new byte[8];
|
||
|
|
|
||
|
|
|
||
|
|
public FastbReader(File file)
|
||
|
|
throws IOException {
|
||
|
|
mFile = file;
|
||
|
|
mRandomFile = new RandomAccessFile(mFile, "r");
|
||
|
|
readHeader();
|
||
|
|
}
|
||
|
|
|
||
|
|
public int getSequenceCount() {
|
||
|
|
return mEntryCount;
|
||
|
|
}
|
||
|
|
|
||
|
|
public boolean hasNext() {
|
||
|
|
return (mCurrentPosition < mEntryCount);
|
||
|
|
}
|
||
|
|
|
||
|
|
public String next() {
|
||
|
|
if (!hasNext()) {
|
||
|
|
throw new IllegalStateException("Iterator exhausted");
|
||
|
|
}
|
||
|
|
try {
|
||
|
|
return readSequence(mCurrentPosition);
|
||
|
|
} catch (IOException exc) {
|
||
|
|
throw new RuntimeException(exc.getMessage(), exc);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
public void remove() {
|
||
|
|
throw new UnsupportedOperationException("Not supported: remove");
|
||
|
|
}
|
||
|
|
|
||
|
|
public void close() {
|
||
|
|
if (mRandomFile != null) {
|
||
|
|
mEntryCount = 0;
|
||
|
|
mCurrentPosition = 0;
|
||
|
|
try {
|
||
|
|
mRandomFile.close();
|
||
|
|
} catch (IOException exc) {
|
||
|
|
throw new RuntimeException(exc.getMessage(), exc);
|
||
|
|
} finally {
|
||
|
|
mRandomFile = null;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
public String readSequence(int n)
|
||
|
|
throws IOException {
|
||
|
|
if (mRandomFile == null) {
|
||
|
|
throw new IllegalStateException("Reader is closed");
|
||
|
|
}
|
||
|
|
if (n < 0 || n >= mEntryCount) {
|
||
|
|
throw new IndexOutOfBoundsException("Illegal index: " + n);
|
||
|
|
}
|
||
|
|
long offset = getEntryOffset(n);
|
||
|
|
int length = getEntryBaseCount(n);
|
||
|
|
String result = readBases(offset, length);
|
||
|
|
mCurrentPosition = n+1;
|
||
|
|
return result;
|
||
|
|
}
|
||
|
|
|
||
|
|
private void readHeader()
|
||
|
|
throws IOException {
|
||
|
|
|
||
|
|
byte[] fileControlBlock = new byte[24];
|
||
|
|
mRandomFile.readFully(fileControlBlock, 0, 24);
|
||
|
|
|
||
|
|
int word2 = deserializeInt(fileControlBlock, 4);
|
||
|
|
int nFiles = word2 & 0x3;
|
||
|
|
int sizeX = (word2 >> 16) & 0xFF;
|
||
|
|
int sizeA = (word2 >> 24) & 0xFF;
|
||
|
|
if (nFiles != 1) {
|
||
|
|
throw new RuntimeException(mFile + ": Invalid file header: nFiles = " + nFiles);
|
||
|
|
}
|
||
|
|
if (sizeX != 16) {
|
||
|
|
throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeX);
|
||
|
|
}
|
||
|
|
if (sizeA != 4) {
|
||
|
|
throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeA);
|
||
|
|
}
|
||
|
|
mEntryCount = deserializeInt(fileControlBlock, 0);
|
||
|
|
mOffsetTableOffset = deserializeLong(fileControlBlock, 8);
|
||
|
|
mLengthTableOffset = deserializeLong(fileControlBlock, 16);
|
||
|
|
}
|
||
|
|
|
||
|
|
private long getEntryOffset(int n)
|
||
|
|
throws IOException {
|
||
|
|
mRandomFile.seek(mOffsetTableOffset + 8 * n);
|
||
|
|
mRandomFile.readFully(mIOBuffer, 0, 8);
|
||
|
|
return deserializeLong(mIOBuffer, 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
private int getEntryBaseCount(int n)
|
||
|
|
throws IOException {
|
||
|
|
mRandomFile.seek(mLengthTableOffset + 4 * n);
|
||
|
|
mRandomFile.readFully(mIOBuffer, 0, 4);
|
||
|
|
return deserializeInt(mIOBuffer, 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
private String readBases(long fileOffset, int baseCount)
|
||
|
|
throws IOException {
|
||
|
|
|
||
|
|
|
||
|
|
int byteCount = (baseCount + 3) / 4;
|
||
|
|
byte[] data = new byte[byteCount];
|
||
|
|
mRandomFile.seek(fileOffset);
|
||
|
|
mRandomFile.readFully(data, 0, byteCount);
|
||
|
|
|
||
|
|
int baseIndex = 0;
|
||
|
|
int dataIndex = 0;
|
||
|
|
char[] baseBuffer = new char[baseCount];
|
||
|
|
while (baseIndex < baseCount) {
|
||
|
|
int b = data[dataIndex++];
|
||
|
|
int count = Math.min(4, baseCount - baseIndex);
|
||
|
|
for (int i = 0; i < count; i++) {
|
||
|
|
baseBuffer[baseIndex++] = BASES[b & 0x3];
|
||
|
|
b = b >> 2;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return new String(baseBuffer);
|
||
|
|
}
|
||
|
|
|
||
|
|
private int deserializeInt(byte[] buffer, int offset) {
|
||
|
|
int byte1 = buffer[offset] & 0xFF;
|
||
|
|
int byte2 = buffer[offset+1] & 0xFF;
|
||
|
|
int byte3 = buffer[offset+2] & 0xFF;
|
||
|
|
int byte4 = buffer[offset+3] & 0xFF;
|
||
|
|
return (byte1 | (byte2 << 8) | (byte3 << 16) | (byte4 << 24));
|
||
|
|
}
|
||
|
|
|
||
|
|
private long deserializeLong(byte[] buffer, int offset) {
|
||
|
|
long int1 = deserializeInt(buffer, offset) & 0xFFFFFFFFL;
|
||
|
|
long int2 = deserializeInt(buffer, offset+4) & 0xFFFFFFFFL;
|
||
|
|
return (int1 | (int2 << 32));
|
||
|
|
}
|
||
|
|
|
||
|
|
// Stub for interactive use (see also Fastb2Fasta)
|
||
|
|
public static void main(String[] args)
|
||
|
|
throws Exception {
|
||
|
|
FastbReader reader = new FastbReader(new File(args[0]));
|
||
|
|
int readId = 0;
|
||
|
|
while (reader.hasNext()) {
|
||
|
|
System.out.println(">" + readId);
|
||
|
|
System.out.println(reader.next());
|
||
|
|
readId++;
|
||
|
|
}
|
||
|
|
reader.close();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|