an initial pass at the GLF reader, and some other genotype changes to phase out the LikelihoodObject I created.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1179 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5735c87581
commit
2a86f2f833
|
|
@ -161,9 +161,9 @@ public class LikelihoodObject {
|
|||
*
|
||||
* @return a float array containing our genotype likelihoods, as negitive log likelihoods
|
||||
*/
|
||||
public float[] toFloatArray() {
|
||||
public double[] toDoubleArray() {
|
||||
// make an array of floats
|
||||
float[] ft = new float[10];
|
||||
double[] ft = new double[10];
|
||||
int index = 0;
|
||||
for (GENOTYPE T : GENOTYPE.values()) {
|
||||
ft[index] = this.likelihood.get(T).floatValue();
|
||||
|
|
@ -178,18 +178,19 @@ public class LikelihoodObject {
|
|||
* @return a GenotypeLikelihoods object representing our data
|
||||
*/
|
||||
public GenotypeLikelihoods convert( SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase ) {
|
||||
float[] ft = toFloatArray();
|
||||
|
||||
double[] ft = toDoubleArray();
|
||||
float[] db = new float[ft.length];
|
||||
int index = 0;
|
||||
if (this.mLikelihoodType == LIKELIHOOD_TYPE.NEGITIVE_LOG) {
|
||||
for (float f : ft) {
|
||||
f = f * -1.0f;
|
||||
for (;index < ft.length; index++) {
|
||||
db[index] = ((float)ft[index] * -1.0f);
|
||||
}
|
||||
} else if (this.mLikelihoodType == LIKELIHOOD_TYPE.RAW) {
|
||||
for (float f : ft) {
|
||||
f = (float) Math.log(f);
|
||||
for (;index < ft.length; index++) {
|
||||
db[index] = (float) Math.log(ft[index]);
|
||||
}
|
||||
}
|
||||
return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, ft);
|
||||
return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, db);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,147 @@
|
|||
package org.broadinstitute.sting.utils.genotype.glf;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.DataOutputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* an object for reading in GLF files
|
||||
*/
|
||||
public class GLFReader implements Iterator<GLFRecord> {
|
||||
|
||||
// our next record
|
||||
private GLFRecord nextRecord = null;
|
||||
|
||||
// the glf magic number, which identifies a properly formatted GLF file
|
||||
public static final short[] glfMagic = {'G', 'L', 'F', '\3'};
|
||||
|
||||
// our input codec
|
||||
private final BinaryCodec inputBinaryCodec;
|
||||
|
||||
// our header string
|
||||
private String headerStr;
|
||||
|
||||
// our reference name
|
||||
private String referenceName;
|
||||
|
||||
// reference length
|
||||
private int referenceLength;
|
||||
|
||||
GLFReader( File readFrom ) {
|
||||
inputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(readFrom)));
|
||||
inputBinaryCodec.setInputFileName(readFrom.getName());
|
||||
|
||||
// first verify that it's a valid GLF
|
||||
for (short s: glfMagic) {
|
||||
if (inputBinaryCodec.readUByte() != s) throw new StingException("Verification of GLF format failed: magic string doesn't match)");
|
||||
}
|
||||
|
||||
// get the header string
|
||||
headerStr = inputBinaryCodec.readLengthAndString(false);
|
||||
|
||||
// get the reference name
|
||||
referenceName = inputBinaryCodec.readLengthAndString(true);
|
||||
|
||||
// get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it.
|
||||
referenceLength = (int)inputBinaryCodec.readUInt();
|
||||
|
||||
// get the next record
|
||||
nextRecord = next();
|
||||
}
|
||||
|
||||
private SinglePointCall generateSPC(char refBase, BinaryCodec inputBinaryCodec) {
|
||||
int offset = (int)inputBinaryCodec.readUInt();
|
||||
long depth = inputBinaryCodec.readUInt();
|
||||
short min_lk = (short)((depth & 0x00000000ff000000) >> 24);
|
||||
int readDepth = (int)(depth & 0x0000000000ffffff);
|
||||
short rmsMapping = inputBinaryCodec.readUByte();
|
||||
double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length];
|
||||
for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) {
|
||||
lkValues[x] = inputBinaryCodec.readUByte();
|
||||
}
|
||||
return new SinglePointCall(refBase,offset,readDepth,rmsMapping,lkValues);
|
||||
}
|
||||
|
||||
|
||||
private VariableLengthCall generateVLC(char refBase, BinaryCodec inputBinaryCodec) {
|
||||
int offset = (int)inputBinaryCodec.readUInt();
|
||||
int depth = (int)inputBinaryCodec.readUInt();
|
||||
short min_lk = (short)((depth & 0x00000000ff000000) >> 24);
|
||||
int readDepth = (depth & 0x0000000000ffffff);
|
||||
short rmsMapping = inputBinaryCodec.readUByte();
|
||||
short lkHom1 = inputBinaryCodec.readUByte();
|
||||
short lkHom2 = inputBinaryCodec.readUByte();
|
||||
short lkHet = inputBinaryCodec.readUByte();
|
||||
int indelLen1 = (int)inputBinaryCodec.readShort();
|
||||
int indelLen2 = (int)inputBinaryCodec.readShort();
|
||||
short[] indelSeq1 = new short[indelLen1];
|
||||
short[] indelSeq2 = new short[indelLen2];
|
||||
for (int x = 0; x < indelLen1; x++) {
|
||||
indelSeq1[x] = inputBinaryCodec.readUByte();
|
||||
}
|
||||
for (int x = 0; x < indelLen2; x++) {
|
||||
indelSeq2[x] = inputBinaryCodec.readUByte();
|
||||
}
|
||||
return new VariableLengthCall(refBase,offset,readDepth,rmsMapping,lkHom1,lkHom2,lkHet,indelSeq1,indelSeq2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return (nextRecord != null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GLFRecord next() {
|
||||
short firstBase = inputBinaryCodec.readUByte();
|
||||
byte recordType = (byte)(firstBase & 0x00f0 >> 4);
|
||||
char refBase = (char)(firstBase & 0x000f);
|
||||
|
||||
GLFRecord ret = nextRecord;
|
||||
if (recordType == 1) {
|
||||
nextRecord = generateSPC(refBase, inputBinaryCodec);
|
||||
}
|
||||
else if (recordType == 2) {
|
||||
nextRecord = generateVLC(refBase, inputBinaryCodec);
|
||||
}
|
||||
else if (recordType == 0){
|
||||
nextRecord = null;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new StingException("I'm Sorry Dave, I can't let you do that (also GLFReader doesn't support remove()).");
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.genotype.glf;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
|
||||
/*
|
||||
|
|
@ -216,5 +217,20 @@ abstract class GLFRecord {
|
|||
return (d > 255.0) ? (byte)255 : (byte)Math.round(d);
|
||||
}
|
||||
|
||||
/**
|
||||
* find the minimum value in a set of doubles
|
||||
* @param vals
|
||||
* @return
|
||||
*/
|
||||
protected static double findMin(double vals[]) {
|
||||
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
|
||||
|
||||
double min = vals[0];
|
||||
for (double d: vals) {
|
||||
if (d < min) min = d;
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ public class GLFWriter implements GenotypeWriter {
|
|||
SinglePointCall call = new SinglePointCall(refBase, genomicLoc,
|
||||
readDepth,
|
||||
(short) rmsMapQ,
|
||||
lhValues);
|
||||
lhValues.toDoubleArray());
|
||||
call.write(this.outputBinaryCodec);
|
||||
}
|
||||
|
||||
|
|
@ -127,13 +127,10 @@ public class GLFWriter implements GenotypeWriter {
|
|||
VariableLengthCall call = new VariableLengthCall(refBase,
|
||||
genomicLoc,
|
||||
readDepth,
|
||||
lowestLikelihood,
|
||||
(short) rmsMapQ,
|
||||
firstHomZyg.getLikelihood(),
|
||||
secondHomZyg.getLikelihood(),
|
||||
hetLikelihood,
|
||||
firstHomZyg.getLengthOfIndel(),
|
||||
secondHomZyg.getLengthOfIndel(),
|
||||
firstHomZyg.getIndelSequence(),
|
||||
secondHomZyg.getIndelSequence());
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.utils.genotype.glf;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
|
||||
|
||||
/*
|
||||
|
|
@ -38,11 +37,8 @@ import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
|||
*/
|
||||
class SinglePointCall extends GLFRecord {
|
||||
|
||||
// our record type
|
||||
private final RECORD_TYPE type = RECORD_TYPE.SINGLE;
|
||||
|
||||
// our likelihood object
|
||||
private LikelihoodObject likelihoods;
|
||||
private double likelihoods[];
|
||||
|
||||
/**
|
||||
* create a single
|
||||
|
|
@ -51,25 +47,24 @@ class SinglePointCall extends GLFRecord {
|
|||
* @param offset the location, as an offset from the previous glf record
|
||||
* @param readDepth the read depth at the specified postion
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
* @param lhValues the LikelihoodObject, representing the genotype likelyhoods
|
||||
* @param likelihoods the Likelihoods
|
||||
*/
|
||||
SinglePointCall( char refBase, int offset, int readDepth, short rmsMapQ, LikelihoodObject lhValues ) {
|
||||
super(refBase, offset, (short) lhValues.getBestLikelihood(), readDepth, rmsMapQ);
|
||||
SinglePointCall( char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[] ) {
|
||||
super(refBase, offset, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ);
|
||||
|
||||
likelihoods = lhValues;
|
||||
this.likelihoods = likelihoods;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Write out the record to a binary codec
|
||||
*
|
||||
* @param out
|
||||
* @param out the codec to write to
|
||||
*/
|
||||
void write( BinaryCodec out ) {
|
||||
super.write(out);
|
||||
short array[] = likelihoods.toByteArray();
|
||||
for (int x = 0; x < array.length; x++) {
|
||||
out.writeUByte(array[x]);
|
||||
for (double likelihood : likelihoods) {
|
||||
out.writeUByte(GLFRecord.toCappedShort(likelihood));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -88,7 +83,7 @@ class SinglePointCall extends GLFRecord {
|
|||
* @return number of bytes we represent
|
||||
*/
|
||||
public int getByteSize() {
|
||||
return likelihoods.genoTypeCount + super.getByteSize();
|
||||
return likelihoods.length + super.getByteSize();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.genotype.glf;
|
|||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
|
|
@ -58,33 +57,27 @@ class VariableLengthCall extends GLFRecord {
|
|||
* @param offset the location, as an offset from the previous glf record
|
||||
* @param readDepth the read depth at the specified postion
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
* @param minimumLikelihood the minimum likelihood value
|
||||
* @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255
|
||||
* @param lkHom2 the negitive log likelihood of the second homozygous indel allele, from 0 to 255
|
||||
* @param lkHet the negitive log likelihood of the heterozygote, from 0 to 255
|
||||
* @param indelLen1 the length of the first indel allele
|
||||
* @param indelLen2 the length of the second indel allele
|
||||
* @param indelSeq1 the sequence for the first indel allele
|
||||
* @param indelSeq2 the sequence for the second indel allele
|
||||
*/
|
||||
VariableLengthCall( char refBase,
|
||||
long offset,
|
||||
int readDepth,
|
||||
double minimumLikelihood,
|
||||
short rmsMapQ,
|
||||
double lkHom1,
|
||||
double lkHom2,
|
||||
double lkHet,
|
||||
int indelLen1,
|
||||
int indelLen2,
|
||||
final short indelSeq1[],
|
||||
final short indelSeq2[] ) {
|
||||
super(refBase, offset, GLFRecord.toCappedShort(minimumLikelihood), readDepth, rmsMapQ);
|
||||
super(refBase, offset, GLFRecord.toCappedShort(findMin(new double []{lkHom1,lkHom2,lkHet})), readDepth, rmsMapQ);
|
||||
this.lkHom1 = GLFRecord.toCappedShort(lkHom1);
|
||||
this.lkHom2 = GLFRecord.toCappedShort(lkHom2);
|
||||
this.lkHet = GLFRecord.toCappedShort(lkHet);
|
||||
this.indelLen1 = indelLen1;
|
||||
this.indelLen2 = indelLen2;
|
||||
this.indelLen1 = indelSeq1.length;
|
||||
this.indelLen2 = indelSeq2.length;
|
||||
this.indelSeq1 = indelSeq1;
|
||||
this.indelSeq2 = indelSeq2;
|
||||
size = 16 + indelSeq1.length + indelSeq2.length;
|
||||
|
|
@ -103,11 +96,11 @@ class VariableLengthCall extends GLFRecord {
|
|||
out.writeByte(lkHet);
|
||||
out.writeShort(new Integer(indelLen1).shortValue());
|
||||
out.writeShort(new Integer(indelLen2).shortValue());
|
||||
for (int x = 0; x < indelSeq1.length; x++) {
|
||||
out.writeUByte(indelSeq1[x]);
|
||||
for (short anIndelSeq1 : indelSeq1) {
|
||||
out.writeUByte(anIndelSeq1);
|
||||
}
|
||||
for (int x = 0; x < indelSeq2.length; x++) {
|
||||
out.writeUByte(indelSeq2[x]);
|
||||
for (short anIndelSeq2 : indelSeq2) {
|
||||
out.writeUByte(anIndelSeq2);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -120,4 +113,5 @@ class VariableLengthCall extends GLFRecord {
|
|||
public int getByteSize() {
|
||||
return size + super.getByteSize();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue