an initial pass at the GLF reader, and some other genotype changes to phase out the LikelihoodObject I created.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1179 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-07-07 04:30:27 +00:00
parent 5735c87581
commit 2a86f2f833
6 changed files with 191 additions and 41 deletions

View File

@ -161,9 +161,9 @@ public class LikelihoodObject {
*
* @return a float array containing our genotype likelihoods, as negitive log likelihoods
*/
public float[] toFloatArray() {
public double[] toDoubleArray() {
// make an array of floats
float[] ft = new float[10];
double[] ft = new double[10];
int index = 0;
for (GENOTYPE T : GENOTYPE.values()) {
ft[index] = this.likelihood.get(T).floatValue();
@ -178,18 +178,19 @@ public class LikelihoodObject {
* @return a GenotypeLikelihoods object representing our data
*/
public GenotypeLikelihoods convert( SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase ) {
float[] ft = toFloatArray();
double[] ft = toDoubleArray();
float[] db = new float[ft.length];
int index = 0;
if (this.mLikelihoodType == LIKELIHOOD_TYPE.NEGITIVE_LOG) {
for (float f : ft) {
f = f * -1.0f;
for (;index < ft.length; index++) {
db[index] = ((float)ft[index] * -1.0f);
}
} else if (this.mLikelihoodType == LIKELIHOOD_TYPE.RAW) {
for (float f : ft) {
f = (float) Math.log(f);
for (;index < ft.length; index++) {
db[index] = (float) Math.log(ft[index]);
}
}
return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, ft);
return new GenotypeLikelihoods(samHeader, seqIndex, seqPosition, refBase, db);
}
/**

View File

@ -0,0 +1,147 @@
package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec;
import net.sf.samtools.util.BlockCompressedOutputStream;
import java.io.File;
import java.io.DataOutputStream;
import java.util.Iterator;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* an object for reading in GLF files
*/
public class GLFReader implements Iterator<GLFRecord> {
// our next record
private GLFRecord nextRecord = null;
// the glf magic number, which identifies a properly formatted GLF file
public static final short[] glfMagic = {'G', 'L', 'F', '\3'};
// our input codec
private final BinaryCodec inputBinaryCodec;
// our header string
private String headerStr;
// our reference name
private String referenceName;
// reference length
private int referenceLength;
GLFReader( File readFrom ) {
inputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(readFrom)));
inputBinaryCodec.setInputFileName(readFrom.getName());
// first verify that it's a valid GLF
for (short s: glfMagic) {
if (inputBinaryCodec.readUByte() != s) throw new StingException("Verification of GLF format failed: magic string doesn't match)");
}
// get the header string
headerStr = inputBinaryCodec.readLengthAndString(false);
// get the reference name
referenceName = inputBinaryCodec.readLengthAndString(true);
// get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it.
referenceLength = (int)inputBinaryCodec.readUInt();
// get the next record
nextRecord = next();
}
private SinglePointCall generateSPC(char refBase, BinaryCodec inputBinaryCodec) {
int offset = (int)inputBinaryCodec.readUInt();
long depth = inputBinaryCodec.readUInt();
short min_lk = (short)((depth & 0x00000000ff000000) >> 24);
int readDepth = (int)(depth & 0x0000000000ffffff);
short rmsMapping = inputBinaryCodec.readUByte();
double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length];
for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) {
lkValues[x] = inputBinaryCodec.readUByte();
}
return new SinglePointCall(refBase,offset,readDepth,rmsMapping,lkValues);
}
private VariableLengthCall generateVLC(char refBase, BinaryCodec inputBinaryCodec) {
int offset = (int)inputBinaryCodec.readUInt();
int depth = (int)inputBinaryCodec.readUInt();
short min_lk = (short)((depth & 0x00000000ff000000) >> 24);
int readDepth = (depth & 0x0000000000ffffff);
short rmsMapping = inputBinaryCodec.readUByte();
short lkHom1 = inputBinaryCodec.readUByte();
short lkHom2 = inputBinaryCodec.readUByte();
short lkHet = inputBinaryCodec.readUByte();
int indelLen1 = (int)inputBinaryCodec.readShort();
int indelLen2 = (int)inputBinaryCodec.readShort();
short[] indelSeq1 = new short[indelLen1];
short[] indelSeq2 = new short[indelLen2];
for (int x = 0; x < indelLen1; x++) {
indelSeq1[x] = inputBinaryCodec.readUByte();
}
for (int x = 0; x < indelLen2; x++) {
indelSeq2[x] = inputBinaryCodec.readUByte();
}
return new VariableLengthCall(refBase,offset,readDepth,rmsMapping,lkHom1,lkHom2,lkHet,indelSeq1,indelSeq2);
}
@Override
public boolean hasNext() {
return (nextRecord != null);
}
@Override
public GLFRecord next() {
short firstBase = inputBinaryCodec.readUByte();
byte recordType = (byte)(firstBase & 0x00f0 >> 4);
char refBase = (char)(firstBase & 0x000f);
GLFRecord ret = nextRecord;
if (recordType == 1) {
nextRecord = generateSPC(refBase, inputBinaryCodec);
}
else if (recordType == 2) {
nextRecord = generateVLC(refBase, inputBinaryCodec);
}
else if (recordType == 0){
nextRecord = null;
}
return ret;
}
@Override
public void remove() {
throw new StingException("I'm Sorry Dave, I can't let you do that (also GLFReader doesn't support remove()).");
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec;
import org.broadinstitute.sting.utils.StingException;
/*
@ -216,5 +217,20 @@ abstract class GLFRecord {
return (d > 255.0) ? (byte)255 : (byte)Math.round(d);
}
/**
* find the minimum value in a set of doubles
* @param vals
* @return
*/
protected static double findMin(double vals[]) {
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
double min = vals[0];
for (double d: vals) {
if (d < min) min = d;
}
return min;
}
}

View File

@ -89,7 +89,7 @@ public class GLFWriter implements GenotypeWriter {
SinglePointCall call = new SinglePointCall(refBase, genomicLoc,
readDepth,
(short) rmsMapQ,
lhValues);
lhValues.toDoubleArray());
call.write(this.outputBinaryCodec);
}
@ -127,13 +127,10 @@ public class GLFWriter implements GenotypeWriter {
VariableLengthCall call = new VariableLengthCall(refBase,
genomicLoc,
readDepth,
lowestLikelihood,
(short) rmsMapQ,
firstHomZyg.getLikelihood(),
secondHomZyg.getLikelihood(),
hetLikelihood,
firstHomZyg.getLengthOfIndel(),
secondHomZyg.getLengthOfIndel(),
firstHomZyg.getIndelSequence(),
secondHomZyg.getIndelSequence());

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
/*
@ -38,11 +37,8 @@ import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
*/
class SinglePointCall extends GLFRecord {
// our record type
private final RECORD_TYPE type = RECORD_TYPE.SINGLE;
// our likelihood object
private LikelihoodObject likelihoods;
private double likelihoods[];
/**
* create a single
@ -51,25 +47,24 @@ class SinglePointCall extends GLFRecord {
* @param offset the location, as an offset from the previous glf record
* @param readDepth the read depth at the specified postion
* @param rmsMapQ the root mean square of the mapping quality
* @param lhValues the LikelihoodObject, representing the genotype likelyhoods
* @param likelihoods the Likelihoods
*/
SinglePointCall( char refBase, int offset, int readDepth, short rmsMapQ, LikelihoodObject lhValues ) {
super(refBase, offset, (short) lhValues.getBestLikelihood(), readDepth, rmsMapQ);
SinglePointCall( char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[] ) {
super(refBase, offset, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ);
likelihoods = lhValues;
this.likelihoods = likelihoods;
}
/**
* Write out the record to a binary codec
*
* @param out
* @param out the codec to write to
*/
void write( BinaryCodec out ) {
super.write(out);
short array[] = likelihoods.toByteArray();
for (int x = 0; x < array.length; x++) {
out.writeUByte(array[x]);
for (double likelihood : likelihoods) {
out.writeUByte(GLFRecord.toCappedShort(likelihood));
}
}
@ -88,7 +83,7 @@ class SinglePointCall extends GLFRecord {
* @return number of bytes we represent
*/
public int getByteSize() {
return likelihoods.genoTypeCount + super.getByteSize();
return likelihoods.length + super.getByteSize();
}
}

View File

@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec;
/*
* Copyright (c) 2009 The Broad Institute
*
@ -58,33 +57,27 @@ class VariableLengthCall extends GLFRecord {
* @param offset the location, as an offset from the previous glf record
* @param readDepth the read depth at the specified postion
* @param rmsMapQ the root mean square of the mapping quality
* @param minimumLikelihood the minimum likelihood value
* @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255
* @param lkHom2 the negitive log likelihood of the second homozygous indel allele, from 0 to 255
* @param lkHet the negitive log likelihood of the heterozygote, from 0 to 255
* @param indelLen1 the length of the first indel allele
* @param indelLen2 the length of the second indel allele
* @param indelSeq1 the sequence for the first indel allele
* @param indelSeq2 the sequence for the second indel allele
*/
VariableLengthCall( char refBase,
long offset,
int readDepth,
double minimumLikelihood,
short rmsMapQ,
double lkHom1,
double lkHom2,
double lkHet,
int indelLen1,
int indelLen2,
final short indelSeq1[],
final short indelSeq2[] ) {
super(refBase, offset, GLFRecord.toCappedShort(minimumLikelihood), readDepth, rmsMapQ);
super(refBase, offset, GLFRecord.toCappedShort(findMin(new double []{lkHom1,lkHom2,lkHet})), readDepth, rmsMapQ);
this.lkHom1 = GLFRecord.toCappedShort(lkHom1);
this.lkHom2 = GLFRecord.toCappedShort(lkHom2);
this.lkHet = GLFRecord.toCappedShort(lkHet);
this.indelLen1 = indelLen1;
this.indelLen2 = indelLen2;
this.indelLen1 = indelSeq1.length;
this.indelLen2 = indelSeq2.length;
this.indelSeq1 = indelSeq1;
this.indelSeq2 = indelSeq2;
size = 16 + indelSeq1.length + indelSeq2.length;
@ -103,11 +96,11 @@ class VariableLengthCall extends GLFRecord {
out.writeByte(lkHet);
out.writeShort(new Integer(indelLen1).shortValue());
out.writeShort(new Integer(indelLen2).shortValue());
for (int x = 0; x < indelSeq1.length; x++) {
out.writeUByte(indelSeq1[x]);
for (short anIndelSeq1 : indelSeq1) {
out.writeUByte(anIndelSeq1);
}
for (int x = 0; x < indelSeq2.length; x++) {
out.writeUByte(indelSeq2[x]);
for (short anIndelSeq2 : indelSeq2) {
out.writeUByte(anIndelSeq2);
}
}
@ -120,4 +113,5 @@ class VariableLengthCall extends GLFRecord {
public int getByteSize() {
return size + super.getByteSize();
}
}