241 lines
8.4 KiB
Java
241 lines
8.4 KiB
Java
|
|
package org.broadinstitute.sting.utils.genotype.glf;
|
||
|
|
|
||
|
|
import net.sf.samtools.util.BinaryCodec;
|
||
|
|
import net.sf.samtools.util.BlockCompressedInputStream;
|
||
|
|
import net.sf.samtools.util.RuntimeEOFException;
|
||
|
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||
|
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||
|
|
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||
|
|
|
||
|
|
import java.io.DataInputStream;
|
||
|
|
import java.io.File;
|
||
|
|
import java.io.IOException;
|
||
|
|
import java.util.Iterator;
|
||
|
|
|
||
|
|
/*
|
||
|
|
* Copyright (c) 2009 The Broad Institute
|
||
|
|
*
|
||
|
|
* Permission is hereby granted, free of charge, to any person
|
||
|
|
* obtaining a copy of this software and associated documentation
|
||
|
|
* files (the "Software"), to deal in the Software without
|
||
|
|
* restriction, including without limitation the rights to use,
|
||
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
|
* copies of the Software, and to permit persons to whom the
|
||
|
|
* Software is furnished to do so, subject to the following
|
||
|
|
* conditions:
|
||
|
|
*
|
||
|
|
* The above copyright notice and this permission notice shall be
|
||
|
|
* included in all copies or substantial portions of the Software.
|
||
|
|
*
|
||
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||
|
|
*/
|
||
|
|
|
||
|
|
/** an object for reading in GLF files */
|
||
|
|
// TODO -- DELETE ME GLF
|
||
|
|
public class GLFReader implements Iterator<GLFRecord> {
|
||
|
|
|
||
|
|
// our next record
|
||
|
|
private GLFRecord nextRecord = null;
|
||
|
|
|
||
|
|
// the glf magic number, which identifies a properly formatted GLF file
|
||
|
|
public static final short[] glfMagic = {'G', 'L', 'F', '\3'};
|
||
|
|
|
||
|
|
// our input codec
|
||
|
|
private final BinaryCodec inputBinaryCodec;
|
||
|
|
|
||
|
|
// our header string
|
||
|
|
private String headerStr;
|
||
|
|
|
||
|
|
// our reference name
|
||
|
|
private String referenceName;
|
||
|
|
|
||
|
|
// reference length
|
||
|
|
private int referenceLength;
|
||
|
|
|
||
|
|
// the current location, keeping track of the offsets
|
||
|
|
private long currentLocation = 1;
|
||
|
|
|
||
|
|
// we have this variable becuase there is no eof for glf's
|
||
|
|
private int lastRecordType = -1;
|
||
|
|
|
||
|
|
private File myFile;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* create a glf reader
|
||
|
|
*
|
||
|
|
* @param readFrom the file to read from
|
||
|
|
*/
|
||
|
|
public GLFReader(File readFrom) {
|
||
|
|
myFile = readFrom;
|
||
|
|
|
||
|
|
try {
|
||
|
|
inputBinaryCodec = new BinaryCodec(new DataInputStream(new BlockCompressedInputStream(readFrom)));
|
||
|
|
} catch (IOException e) {
|
||
|
|
throw new UserException.CouldNotReadInputFile(myFile, e);
|
||
|
|
}
|
||
|
|
|
||
|
|
inputBinaryCodec.setInputFileName(readFrom.getName());
|
||
|
|
|
||
|
|
// first verify that it's a valid GLF
|
||
|
|
for (short s : glfMagic) {
|
||
|
|
if (inputBinaryCodec.readUByte() != s)
|
||
|
|
throw new UserException.MalformedFile(myFile, "Verification of GLF format failed: magic string doesn't match)");
|
||
|
|
}
|
||
|
|
|
||
|
|
// get the header string
|
||
|
|
headerStr = inputBinaryCodec.readLengthAndString(false);
|
||
|
|
|
||
|
|
if (advanceContig()) {
|
||
|
|
// setup the next record
|
||
|
|
next();
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* read in a single point call
|
||
|
|
*
|
||
|
|
* @param refBase the reference base
|
||
|
|
* @param inputBinaryCodec the binary codec
|
||
|
|
*
|
||
|
|
* @return a single point call object
|
||
|
|
*/
|
||
|
|
private GLFSingleCall generateSPC(char refBase, BinaryCodec inputBinaryCodec) {
|
||
|
|
int offset = (int) inputBinaryCodec.readUInt();
|
||
|
|
long depth = inputBinaryCodec.readUInt();
|
||
|
|
short min_lk = (short) ((depth & 0x00000000ff000000) >> 24);
|
||
|
|
int readDepth = (int) (depth & 0x0000000000ffffff);
|
||
|
|
short rmsMapping = inputBinaryCodec.readUByte();
|
||
|
|
double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length];
|
||
|
|
for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) {
|
||
|
|
lkValues[x] = ((double)inputBinaryCodec.readUByte() / GLFRecord.LIKELIHOOD_SCALE_FACTOR + (double)min_lk);
|
||
|
|
}
|
||
|
|
return new GLFSingleCall(referenceName, refBase, (int)(offset+currentLocation), readDepth, rmsMapping, lkValues);
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* read in a variable length call, and generate a VLC object from the data
|
||
|
|
*
|
||
|
|
* @param refBase the reference base
|
||
|
|
* @param inputBinaryCodec the input codex
|
||
|
|
*
|
||
|
|
* @return a GLFVariableLengthCall object
|
||
|
|
*/
|
||
|
|
private GLFVariableLengthCall generateVLC(char refBase, BinaryCodec inputBinaryCodec) {
|
||
|
|
int offset = (int) inputBinaryCodec.readUInt();
|
||
|
|
int depth = (int) inputBinaryCodec.readUInt();
|
||
|
|
short min_lk = (short) ((depth & 0x00000000ff000000) >> 24);
|
||
|
|
int readDepth = (depth & 0x0000000000ffffff);
|
||
|
|
short rmsMapping = inputBinaryCodec.readUByte();
|
||
|
|
short lkHom1 = inputBinaryCodec.readUByte();
|
||
|
|
short lkHom2 = inputBinaryCodec.readUByte();
|
||
|
|
short lkHet = inputBinaryCodec.readUByte();
|
||
|
|
int indelLen1 = (int) inputBinaryCodec.readShort();
|
||
|
|
int indelLen2 = (int) inputBinaryCodec.readShort();
|
||
|
|
|
||
|
|
int readCnt = Math.abs(indelLen1);
|
||
|
|
short indelSeq1[] = new short[readCnt];
|
||
|
|
for (int x = 0; x < readCnt; x++) {
|
||
|
|
indelSeq1[x] = inputBinaryCodec.readUByte();
|
||
|
|
}
|
||
|
|
readCnt = Math.abs(indelLen2);
|
||
|
|
short indelSeq2[] = new short[readCnt];
|
||
|
|
for (int x = 0; x < readCnt; x++) {
|
||
|
|
indelSeq2[x] = inputBinaryCodec.readUByte();
|
||
|
|
}
|
||
|
|
return new GLFVariableLengthCall(referenceName, refBase, offset+currentLocation, readDepth, rmsMapping, lkHom1, lkHom2, lkHet, indelLen1, indelSeq1, indelLen2, indelSeq2);
|
||
|
|
}
|
||
|
|
|
||
|
|
public boolean hasNext() {
|
||
|
|
return (nextRecord != null);
|
||
|
|
}
|
||
|
|
|
||
|
|
public GLFRecord next() {
|
||
|
|
GLFRecord ret = nextRecord;
|
||
|
|
short firstBase = protectedByteReadForFile();
|
||
|
|
if (firstBase == -1) return ret;
|
||
|
|
|
||
|
|
// parse out the record type and reference base
|
||
|
|
byte recordType = (byte) ((firstBase & 0x0f0) >> 4);
|
||
|
|
char refBase = (char) (firstBase & 0x000f);
|
||
|
|
lastRecordType = recordType;
|
||
|
|
|
||
|
|
if (recordType == 1) {
|
||
|
|
nextRecord = generateSPC(refBase, inputBinaryCodec);
|
||
|
|
} else if (recordType == 2) {
|
||
|
|
nextRecord = generateVLC(refBase, inputBinaryCodec);
|
||
|
|
} else if (recordType == 0) {
|
||
|
|
if (advanceContig()) {
|
||
|
|
return next();
|
||
|
|
}
|
||
|
|
//nextRecord = null;
|
||
|
|
} else {
|
||
|
|
throw new UserException.MalformedFile(myFile, "Unknown GLF record type (type = " + recordType + ")");
|
||
|
|
}
|
||
|
|
if (nextRecord != null) currentLocation = nextRecord.getPosition();
|
||
|
|
return ret;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* read a short, and if we see an exception only throw it if it's unexpected (not after a zero)
|
||
|
|
* @return a short
|
||
|
|
*/
|
||
|
|
private short protectedByteReadForFile() {
|
||
|
|
short st = -1;
|
||
|
|
try {
|
||
|
|
st = inputBinaryCodec.readUByte();
|
||
|
|
} catch (RuntimeEOFException exp) {
|
||
|
|
nextRecord = null;
|
||
|
|
if (lastRecordType != 0) {
|
||
|
|
throw exp; // if the last record was a zero, this is an ok condition. Otherwise throw an exception
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return st;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* advance to the next contig
|
||
|
|
*
|
||
|
|
* @return true if we could advance
|
||
|
|
*/
|
||
|
|
private boolean advanceContig() {
|
||
|
|
// try to read the next sequence record
|
||
|
|
try {
|
||
|
|
// get the reference name
|
||
|
|
referenceName = inputBinaryCodec.readLengthAndString(true);
|
||
|
|
|
||
|
|
// get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it.
|
||
|
|
referenceLength = (int) inputBinaryCodec.readUInt();
|
||
|
|
//System.err.println(referenceName.length());
|
||
|
|
currentLocation = 1;
|
||
|
|
return true;
|
||
|
|
} catch (RuntimeException e) {
|
||
|
|
if (lastRecordType != 0) {
|
||
|
|
throw e; // if the last record was a zero, this is an ok condition. Otherwise throw an exception
|
||
|
|
}
|
||
|
|
nextRecord = null;
|
||
|
|
}
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
public void remove() {
|
||
|
|
throw new ReviewedStingException("GLFReader doesn't support remove()");
|
||
|
|
}
|
||
|
|
|
||
|
|
public void close() {
|
||
|
|
inputBinaryCodec.close();
|
||
|
|
}
|
||
|
|
|
||
|
|
public String getHeaderStr() {
|
||
|
|
return headerStr;
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|