an initial pass at the GLF reader, and some other genotype changes to phase out the LikelihoodObject I created.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1179 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-07-07 04:30:27 +00:00
parent 5735c87581
commit 2a86f2f833
6 changed files with 191 additions and 41 deletions

View File

@ -161,9 +161,9 @@ public class LikelihoodObject {
* *
* @return a float array containing our genotype likelihoods, as negitive log likelihoods * @return a float array containing our genotype likelihoods, as negitive log likelihoods
*/ */
public float[] toFloatArray() { public double[] toDoubleArray() {
// make an array of floats // make an array of floats
float[] ft = new float[10]; double[] ft = new double[10];
int index = 0; int index = 0;
for (GENOTYPE T : GENOTYPE.values()) { for (GENOTYPE T : GENOTYPE.values()) {
ft[index] = this.likelihood.get(T).floatValue(); ft[index] = this.likelihood.get(T).floatValue();
@ -178,18 +178,19 @@ public class LikelihoodObject {
* @return a GenotypeLikelihoods object representing our data * @return a GenotypeLikelihoods object representing our data
*/ */
public GenotypeLikelihoods convert( SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase ) { public GenotypeLikelihoods convert( SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase ) {
float[] ft = toFloatArray(); double[] ft = toDoubleArray();
float[] db = new float[ft.length];
int index = 0;
if (this.mLikelihoodType == LIKELIHOOD_TYPE.NEGITIVE_LOG) { if (this.mLikelihoodType == LIKELIHOOD_TYPE.NEGITIVE_LOG) {
for (float f : ft) { for (;index < ft.length; index++) {
f = f * -1.0f; db[index] = ((float)ft[index] * -1.0f);
} }
} else if (this.mLikelihoodType == LIKELIHOOD_TYPE.RAW) { } else if (this.mLikelihoodType == LIKELIHOOD_TYPE.RAW) {
for (float f : ft) { for (;index < ft.length; index++) {
f = (float) Math.log(f); db[index] = (float) Math.log(ft[index]);
} }
} }
return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, ft); return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, db);
} }
/** /**

View File

@ -0,0 +1,147 @@
package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec;
import net.sf.samtools.util.BlockCompressedOutputStream;
import java.io.File;
import java.io.DataOutputStream;
import java.util.Iterator;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* an object for reading in GLF files
*/
public class GLFReader implements Iterator<GLFRecord> {
// our next record
private GLFRecord nextRecord = null;
// the glf magic number, which identifies a properly formatted GLF file
public static final short[] glfMagic = {'G', 'L', 'F', '\3'};
// our input codec
private final BinaryCodec inputBinaryCodec;
// our header string
private String headerStr;
// our reference name
private String referenceName;
// reference length
private int referenceLength;
GLFReader( File readFrom ) {
inputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(readFrom)));
inputBinaryCodec.setInputFileName(readFrom.getName());
// first verify that it's a valid GLF
for (short s: glfMagic) {
if (inputBinaryCodec.readUByte() != s) throw new StingException("Verification of GLF format failed: magic string doesn't match)");
}
// get the header string
headerStr = inputBinaryCodec.readLengthAndString(false);
// get the reference name
referenceName = inputBinaryCodec.readLengthAndString(true);
// get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it.
referenceLength = (int)inputBinaryCodec.readUInt();
// get the next record
nextRecord = next();
}
private SinglePointCall generateSPC(char refBase, BinaryCodec inputBinaryCodec) {
int offset = (int)inputBinaryCodec.readUInt();
long depth = inputBinaryCodec.readUInt();
short min_lk = (short)((depth & 0x00000000ff000000) >> 24);
int readDepth = (int)(depth & 0x0000000000ffffff);
short rmsMapping = inputBinaryCodec.readUByte();
double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length];
for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) {
lkValues[x] = inputBinaryCodec.readUByte();
}
return new SinglePointCall(refBase,offset,readDepth,rmsMapping,lkValues);
}
private VariableLengthCall generateVLC(char refBase, BinaryCodec inputBinaryCodec) {
int offset = (int)inputBinaryCodec.readUInt();
int depth = (int)inputBinaryCodec.readUInt();
short min_lk = (short)((depth & 0x00000000ff000000) >> 24);
int readDepth = (depth & 0x0000000000ffffff);
short rmsMapping = inputBinaryCodec.readUByte();
short lkHom1 = inputBinaryCodec.readUByte();
short lkHom2 = inputBinaryCodec.readUByte();
short lkHet = inputBinaryCodec.readUByte();
int indelLen1 = (int)inputBinaryCodec.readShort();
int indelLen2 = (int)inputBinaryCodec.readShort();
short[] indelSeq1 = new short[indelLen1];
short[] indelSeq2 = new short[indelLen2];
for (int x = 0; x < indelLen1; x++) {
indelSeq1[x] = inputBinaryCodec.readUByte();
}
for (int x = 0; x < indelLen2; x++) {
indelSeq2[x] = inputBinaryCodec.readUByte();
}
return new VariableLengthCall(refBase,offset,readDepth,rmsMapping,lkHom1,lkHom2,lkHet,indelSeq1,indelSeq2);
}
@Override
public boolean hasNext() {
return (nextRecord != null);
}
@Override
public GLFRecord next() {
short firstBase = inputBinaryCodec.readUByte();
byte recordType = (byte)(firstBase & 0x00f0 >> 4);
char refBase = (char)(firstBase & 0x000f);
GLFRecord ret = nextRecord;
if (recordType == 1) {
nextRecord = generateSPC(refBase, inputBinaryCodec);
}
else if (recordType == 2) {
nextRecord = generateVLC(refBase, inputBinaryCodec);
}
else if (recordType == 0){
nextRecord = null;
}
return ret;
}
@Override
public void remove() {
throw new StingException("I'm Sorry Dave, I can't let you do that (also GLFReader doesn't support remove()).");
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.genotype.glf; package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec; import net.sf.samtools.util.BinaryCodec;
import org.broadinstitute.sting.utils.StingException;
/* /*
@ -216,5 +217,20 @@ abstract class GLFRecord {
return (d > 255.0) ? (byte)255 : (byte)Math.round(d); return (d > 255.0) ? (byte)255 : (byte)Math.round(d);
} }
/**
* find the minimum value in a set of doubles
* @param vals
* @return
*/
protected static double findMin(double vals[]) {
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
double min = vals[0];
for (double d: vals) {
if (d < min) min = d;
}
return min;
}
} }

View File

@ -89,7 +89,7 @@ public class GLFWriter implements GenotypeWriter {
SinglePointCall call = new SinglePointCall(refBase, genomicLoc, SinglePointCall call = new SinglePointCall(refBase, genomicLoc,
readDepth, readDepth,
(short) rmsMapQ, (short) rmsMapQ,
lhValues); lhValues.toDoubleArray());
call.write(this.outputBinaryCodec); call.write(this.outputBinaryCodec);
} }
@ -127,13 +127,10 @@ public class GLFWriter implements GenotypeWriter {
VariableLengthCall call = new VariableLengthCall(refBase, VariableLengthCall call = new VariableLengthCall(refBase,
genomicLoc, genomicLoc,
readDepth, readDepth,
lowestLikelihood,
(short) rmsMapQ, (short) rmsMapQ,
firstHomZyg.getLikelihood(), firstHomZyg.getLikelihood(),
secondHomZyg.getLikelihood(), secondHomZyg.getLikelihood(),
hetLikelihood, hetLikelihood,
firstHomZyg.getLengthOfIndel(),
secondHomZyg.getLengthOfIndel(),
firstHomZyg.getIndelSequence(), firstHomZyg.getIndelSequence(),
secondHomZyg.getIndelSequence()); secondHomZyg.getIndelSequence());

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.utils.genotype.glf; package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec; import net.sf.samtools.util.BinaryCodec;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
/* /*
@ -38,11 +37,8 @@ import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
*/ */
class SinglePointCall extends GLFRecord { class SinglePointCall extends GLFRecord {
// our record type
private final RECORD_TYPE type = RECORD_TYPE.SINGLE;
// our likelihood object // our likelihood object
private LikelihoodObject likelihoods; private double likelihoods[];
/** /**
* create a single * create a single
@ -51,25 +47,24 @@ class SinglePointCall extends GLFRecord {
* @param offset the location, as an offset from the previous glf record * @param offset the location, as an offset from the previous glf record
* @param readDepth the read depth at the specified postion * @param readDepth the read depth at the specified postion
* @param rmsMapQ the root mean square of the mapping quality * @param rmsMapQ the root mean square of the mapping quality
* @param lhValues the LikelihoodObject, representing the genotype likelyhoods * @param likelihoods the Likelihoods
*/ */
SinglePointCall( char refBase, int offset, int readDepth, short rmsMapQ, LikelihoodObject lhValues ) { SinglePointCall( char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[] ) {
super(refBase, offset, (short) lhValues.getBestLikelihood(), readDepth, rmsMapQ); super(refBase, offset, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ);
likelihoods = lhValues; this.likelihoods = likelihoods;
} }
/** /**
* Write out the record to a binary codec * Write out the record to a binary codec
* *
* @param out * @param out the codec to write to
*/ */
void write( BinaryCodec out ) { void write( BinaryCodec out ) {
super.write(out); super.write(out);
short array[] = likelihoods.toByteArray(); for (double likelihood : likelihoods) {
for (int x = 0; x < array.length; x++) { out.writeUByte(GLFRecord.toCappedShort(likelihood));
out.writeUByte(array[x]);
} }
} }
@ -88,7 +83,7 @@ class SinglePointCall extends GLFRecord {
* @return number of bytes we represent * @return number of bytes we represent
*/ */
public int getByteSize() { public int getByteSize() {
return likelihoods.genoTypeCount + super.getByteSize(); return likelihoods.length + super.getByteSize();
} }
} }

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec; import net.sf.samtools.util.BinaryCodec;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -58,33 +57,27 @@ class VariableLengthCall extends GLFRecord {
* @param offset the location, as an offset from the previous glf record * @param offset the location, as an offset from the previous glf record
* @param readDepth the read depth at the specified postion * @param readDepth the read depth at the specified postion
* @param rmsMapQ the root mean square of the mapping quality * @param rmsMapQ the root mean square of the mapping quality
* @param minimumLikelihood the minimum likelihood value
* @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255 * @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255
* @param lkHom2 the negitive log likelihood of the second homozygous indel allele, from 0 to 255 * @param lkHom2 the negitive log likelihood of the second homozygous indel allele, from 0 to 255
* @param lkHet the negitive log likelihood of the heterozygote, from 0 to 255 * @param lkHet the negitive log likelihood of the heterozygote, from 0 to 255
* @param indelLen1 the length of the first indel allele
* @param indelLen2 the length of the second indel allele
* @param indelSeq1 the sequence for the first indel allele * @param indelSeq1 the sequence for the first indel allele
* @param indelSeq2 the sequence for the second indel allele * @param indelSeq2 the sequence for the second indel allele
*/ */
VariableLengthCall( char refBase, VariableLengthCall( char refBase,
long offset, long offset,
int readDepth, int readDepth,
double minimumLikelihood,
short rmsMapQ, short rmsMapQ,
double lkHom1, double lkHom1,
double lkHom2, double lkHom2,
double lkHet, double lkHet,
int indelLen1,
int indelLen2,
final short indelSeq1[], final short indelSeq1[],
final short indelSeq2[] ) { final short indelSeq2[] ) {
super(refBase, offset, GLFRecord.toCappedShort(minimumLikelihood), readDepth, rmsMapQ); super(refBase, offset, GLFRecord.toCappedShort(findMin(new double []{lkHom1,lkHom2,lkHet})), readDepth, rmsMapQ);
this.lkHom1 = GLFRecord.toCappedShort(lkHom1); this.lkHom1 = GLFRecord.toCappedShort(lkHom1);
this.lkHom2 = GLFRecord.toCappedShort(lkHom2); this.lkHom2 = GLFRecord.toCappedShort(lkHom2);
this.lkHet = GLFRecord.toCappedShort(lkHet); this.lkHet = GLFRecord.toCappedShort(lkHet);
this.indelLen1 = indelLen1; this.indelLen1 = indelSeq1.length;
this.indelLen2 = indelLen2; this.indelLen2 = indelSeq2.length;
this.indelSeq1 = indelSeq1; this.indelSeq1 = indelSeq1;
this.indelSeq2 = indelSeq2; this.indelSeq2 = indelSeq2;
size = 16 + indelSeq1.length + indelSeq2.length; size = 16 + indelSeq1.length + indelSeq2.length;
@ -103,11 +96,11 @@ class VariableLengthCall extends GLFRecord {
out.writeByte(lkHet); out.writeByte(lkHet);
out.writeShort(new Integer(indelLen1).shortValue()); out.writeShort(new Integer(indelLen1).shortValue());
out.writeShort(new Integer(indelLen2).shortValue()); out.writeShort(new Integer(indelLen2).shortValue());
for (int x = 0; x < indelSeq1.length; x++) { for (short anIndelSeq1 : indelSeq1) {
out.writeUByte(indelSeq1[x]); out.writeUByte(anIndelSeq1);
} }
for (int x = 0; x < indelSeq2.length; x++) { for (short anIndelSeq2 : indelSeq2) {
out.writeUByte(indelSeq2[x]); out.writeUByte(anIndelSeq2);
} }
} }
@ -120,4 +113,5 @@ class VariableLengthCall extends GLFRecord {
public int getByteSize() { public int getByteSize() {
return size + super.getByteSize(); return size + super.getByteSize();
} }
} }