GLF writing support
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@872 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
417f5b145e
commit
e712d69382
|
|
@ -2,271 +2,207 @@ package org.broadinstitute.sting.utils.glf;
|
|||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 13, 2009
|
||||
* Time: 3:36:18 PM
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date May 13, 2009
|
||||
* <p/>
|
||||
* Class GLFRecord
|
||||
* <p/>
|
||||
* The base record that's stored in the GLF format.
|
||||
* <p/>
|
||||
* Class RecordType
|
||||
* <p/>
|
||||
* The base record type for all GLF entries. Each record has a number of fields
|
||||
* common to the record set. This is also the source of the REF_BASE enumeration,
|
||||
* which represents the accepted FASTA nucleotide symbols and their assocated GLF
|
||||
* field values.
|
||||
*/
|
||||
public class GLFRecord {
|
||||
// our record list
|
||||
private final List<RecordType> records = new ArrayList<RecordType>();
|
||||
abstract class GLFRecord {
|
||||
|
||||
public static final byte[] glfMagic = {'G','L','F','\3'};
|
||||
private String headerText = "";
|
||||
private String referenceSequenceName = "";
|
||||
private long referenceSequenceLength = 0;
|
||||
// fields common to all records
|
||||
protected REF_BASE refBase;
|
||||
protected long offset = 1;
|
||||
protected short minimumLikelihood = 0;
|
||||
protected int readDepth = 0;
|
||||
protected short rmsMapQ = 0;
|
||||
|
||||
private int currentOffset = -1;
|
||||
/**
|
||||
* The public constructor for creating a GLF object
|
||||
* @param headerText the header text (currently unclear what the contents are)
|
||||
* @param referenceSequenceName the reference sequence name
|
||||
*/
|
||||
public GLFRecord(String headerText, String referenceSequenceName, int referenceSequenceLength) {
|
||||
this.headerText = headerText;
|
||||
this.referenceSequenceName = referenceSequenceName;
|
||||
this.referenceSequenceLength = referenceSequenceLength;
|
||||
}
|
||||
// the size of this base structure
|
||||
protected final int baseSize = 10;
|
||||
|
||||
public void addSNPCall(int genomicLoc, long read_depth, int rmsMapQ, LikelihoodObject lhValues) {
|
||||
if (currentOffset >= genomicLoc) {
|
||||
throw new IllegalArgumentException("The location supplied is less then a previous location");
|
||||
}
|
||||
/** the reference base enumeration, with their short (type) values for GLF */
|
||||
public enum REF_BASE {
|
||||
X((short) 0x00),
|
||||
A((short) 0x01),
|
||||
C((short) 0x02),
|
||||
M((short) 0x03),
|
||||
G((short) 0x04),
|
||||
R((short) 0x05),
|
||||
S((short) 0x06),
|
||||
V((short) 0x07),
|
||||
T((short) 0x08),
|
||||
W((short) 0x09),
|
||||
Y((short) 0x0A),
|
||||
H((short) 0x0B),
|
||||
K((short) 0x0C),
|
||||
D((short) 0x0D),
|
||||
B((short) 0x0E),
|
||||
N((short) 0x0F);
|
||||
|
||||
// make sure the read depth isn't too large
|
||||
if (read_depth < 0 || read_depth > 0x00FFFFFF) {
|
||||
throw new IllegalArgumentException("The read depth is too large; must lie in the range 0 to 0x00ffffff");
|
||||
}
|
||||
|
||||
// check that the rmsSquare is greater then 0, and will fit in a uint8
|
||||
if (rmsMapQ > 0x000000FF || rmsMapQ < 0) {
|
||||
throw new IllegalArgumentException("rms of mapping quality is too large; must lie in the range 0 to 0x000000ff");
|
||||
}
|
||||
|
||||
if (lhValues == null) {
|
||||
throw new IllegalArgumentException("likelihood object cannot be null");
|
||||
}
|
||||
|
||||
SinglePointCall call = new SinglePointCall(genomicLoc - currentOffset,
|
||||
read_depth,
|
||||
rmsMapQ,
|
||||
lhValues.toByteArray(),
|
||||
(short)lhValues.getMinimumValue());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the record to a binary codec object
|
||||
*
|
||||
* @param out
|
||||
*/
|
||||
public void write(BinaryCodec out) {
|
||||
out.writeBytes(glfMagic);
|
||||
out.writeString(headerText,true,true);
|
||||
out.writeString(referenceSequenceName,true,true);
|
||||
out.writeUInt(referenceSequenceLength);
|
||||
for (RecordType rec: records) {
|
||||
out.writeUByte(rec.getRecordType().getFieldValue());
|
||||
rec.write(out);
|
||||
}
|
||||
out.writeByte((byte)0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
interface RecordType {
|
||||
|
||||
enum RECORD_TYPE {
|
||||
VARIABLE((short)2),
|
||||
SINGLE((short)1);
|
||||
private final short fieldValue; // in kilograms
|
||||
RECORD_TYPE(short value) {
|
||||
|
||||
/**
|
||||
* private constructor, used by the enum class to makes each enum value
|
||||
* @param value the short values specified in the enum listing
|
||||
*/
|
||||
REF_BASE( short value ) {
|
||||
fieldValue = value;
|
||||
}
|
||||
public short getFieldValue() {
|
||||
|
||||
/**
|
||||
* static method from returning a REF_BASE given the character representation
|
||||
*
|
||||
* @param value the character representation of a REF_BASE
|
||||
*
|
||||
* @return the corresponding REF_BASE
|
||||
* @throws IllegalArgumentException if the value passed can't be converted
|
||||
*/
|
||||
public static REF_BASE toBase( char value ) {
|
||||
String str = String.valueOf(value).toUpperCase();
|
||||
for (int x = 0; x < REF_BASE.values().length; x++) {
|
||||
if (REF_BASE.values()[x].toString().equals(str)) {
|
||||
return REF_BASE.values()[x];
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Counldn't find matching reference base for " + str);
|
||||
}
|
||||
|
||||
/** @return the hex value of the given REF_BASE */
|
||||
public short getBaseHexValue() {
|
||||
return fieldValue;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** the record type enum, which enumerates the different records we can have in a GLF */
|
||||
enum RECORD_TYPE {
|
||||
SINGLE((short) 1),
|
||||
VARIABLE((short) 2);
|
||||
|
||||
private final short fieldValue; // in kilograms
|
||||
|
||||
RECORD_TYPE( short value ) {
|
||||
fieldValue = value;
|
||||
}
|
||||
|
||||
public short getReadTypeValue() {
|
||||
return fieldValue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* write the record out to a binary codec
|
||||
* @param out
|
||||
* Constructor, given the base a character reference base
|
||||
*
|
||||
* @param base the reference base in the reference
|
||||
* @param offset the offset from the beginning of the reference seq
|
||||
* @param minimumLikelihood it's minimum likelihood
|
||||
* @param readDepth the read depth at this position
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
*/
|
||||
public void write(BinaryCodec out);
|
||||
public GLFRecord( char base, long offset, short minimumLikelihood, int readDepth, short rmsMapQ ) {
|
||||
REF_BASE newBase = REF_BASE.toBase(base);
|
||||
validateInput(newBase, offset, minimumLikelihood, readDepth, rmsMapQ);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor, given the base a REF_BASE
|
||||
*
|
||||
* @param base the reference base in the reference
|
||||
* @param offset the offset from the beginning of the reference seq
|
||||
* @param minimumLikelihood it's minimum likelihood
|
||||
* @param readDepth the read depth at this position
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
*/
|
||||
GLFRecord( REF_BASE base, long offset, short minimumLikelihood, int readDepth, short rmsMapQ ) {
|
||||
validateInput(base, offset, minimumLikelihood, readDepth, rmsMapQ);
|
||||
}
|
||||
|
||||
/**
|
||||
* validate the input during construction, and store valid values
|
||||
*
|
||||
* @param base the reference base in the reference, as a REF_BASE
|
||||
* @param offset the offset from the beginning of the reference seq
|
||||
* @param minimumLikelihood it's minimum likelihood
|
||||
* @param readDepth the read depth at this position
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
*/
|
||||
private void validateInput( REF_BASE base, long offset, short minimumLikelihood, int readDepth, short rmsMapQ ) {
|
||||
this.refBase = base;
|
||||
if (offset > 4294967295L || offset < 0) {
|
||||
throw new IllegalArgumentException("Offset is out of bounds (0 to 0xffffffff) value passed = " + offset);
|
||||
}
|
||||
this.offset = offset;
|
||||
|
||||
if (minimumLikelihood > 255 || minimumLikelihood < 0) {
|
||||
throw new IllegalArgumentException("minimumLikelihood is out of bounds (0 to 0xffffffff) value passed = " + minimumLikelihood);
|
||||
}
|
||||
this.minimumLikelihood = minimumLikelihood;
|
||||
|
||||
if (readDepth > 16777215 || readDepth < 0) {
|
||||
throw new IllegalArgumentException("readDepth is out of bounds (0 to 0xffffff) value passed = " + readDepth);
|
||||
}
|
||||
this.readDepth = readDepth;
|
||||
|
||||
if (rmsMapQ > 255 || rmsMapQ < 0) {
|
||||
throw new IllegalArgumentException("rmsMapQ is out of bounds (0 to 0xff) value passed = " + rmsMapQ);
|
||||
}
|
||||
this.rmsMapQ = rmsMapQ;
|
||||
}
|
||||
|
||||
/**
|
||||
* write the this record to a binary codec output.
|
||||
*
|
||||
* @param out the binary codec to write to
|
||||
*/
|
||||
public void write( BinaryCodec out ) {
|
||||
out.writeUByte((short) ( this.getRecordType().getReadTypeValue() << 4 | ( refBase.getBaseHexValue() & 0x0f ) ));
|
||||
out.writeUInt(( (Long) offset ).intValue());
|
||||
out.writeUInt((new Long(readDepth).intValue()));
|
||||
out.writeUByte((short) rmsMapQ);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the record type
|
||||
* @return the record type
|
||||
*
|
||||
* @return the record type enumeration
|
||||
*/
|
||||
public RECORD_TYPE getRecordType();
|
||||
public abstract RECORD_TYPE getRecordType();
|
||||
|
||||
/**
|
||||
*
|
||||
* @return
|
||||
* Return the size of this record in bytes.
|
||||
*
|
||||
* @return the size of this record type, in bytes
|
||||
*/
|
||||
public int getByteSize();
|
||||
public abstract int getByteSize();
|
||||
}
|
||||
|
||||
// the second record type
|
||||
class VariableLengthCall implements RecordType {
|
||||
public int offset = 0;
|
||||
public int min_depth = 0;
|
||||
public byte rmsMapQ = 0;
|
||||
public byte lkHom1 = 0;
|
||||
public byte lkHom2 = 0;
|
||||
public byte lkHet = 0;
|
||||
public short indelLen1 = 0;
|
||||
public short indelLen2 = 0;
|
||||
public final byte indelSeq1[];
|
||||
public final byte indelSeq2[];
|
||||
|
||||
// our size, we're immutable (at least the size is)
|
||||
private final int size; // in bytes
|
||||
|
||||
/**
|
||||
* the BinaryCodec constructor
|
||||
*
|
||||
* @param in the binary codec to get data from
|
||||
*/
|
||||
VariableLengthCall(BinaryCodec in) {
|
||||
offset = in.readInt();
|
||||
min_depth = in.readInt();
|
||||
rmsMapQ = in.readByte();
|
||||
lkHom1 = in.readByte();
|
||||
lkHom2 = in.readByte();
|
||||
lkHet = in.readByte();
|
||||
indelLen1 = in.readShort();
|
||||
indelLen2 = in.readShort();
|
||||
indelSeq1 = new byte[indelLen1];
|
||||
indelSeq2 = new byte[indelLen2];
|
||||
this.size = 16 + indelLen1 + indelLen2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the record to a binary codec
|
||||
*
|
||||
* @param out
|
||||
*/
|
||||
public void write(BinaryCodec out) {
|
||||
out.writeInt(offset);
|
||||
out.writeInt(min_depth);
|
||||
out.writeByte(rmsMapQ);
|
||||
out.writeByte(lkHom1);
|
||||
out.writeByte(lkHom2);
|
||||
out.writeByte(lkHet);
|
||||
out.writeShort(indelLen1);
|
||||
out.writeShort(indelLen2);
|
||||
out.writeBytes(indelSeq1);
|
||||
out.writeBytes(indelSeq2);
|
||||
}
|
||||
|
||||
public RECORD_TYPE getRecordType() {
|
||||
return RECORD_TYPE.VARIABLE;
|
||||
}
|
||||
|
||||
/** @return */
|
||||
public int getByteSize() {
|
||||
return size;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// the first record type
|
||||
class SinglePointCall implements RecordType {
|
||||
// our likelyhood array size
|
||||
public static final int LIKELYHOOD_SIZE = 10;
|
||||
|
||||
// class fields
|
||||
public int offset = 0;
|
||||
public Long min_depth = 0L;
|
||||
public int rmsMapQ = 0;
|
||||
public final short lk[] = new short[LIKELYHOOD_SIZE];
|
||||
public short minimumLikelihood = 0;
|
||||
|
||||
// our size, we're immutable (the size at least)
|
||||
private final int byteSize; // in bytes
|
||||
|
||||
/**
|
||||
* The parameter constructor
|
||||
*
|
||||
* @param offset
|
||||
* @param min_depth
|
||||
* @param rmsMapQ
|
||||
* @param lk
|
||||
*/
|
||||
SinglePointCall(int offset, long min_depth, int rmsMapQ, short[] lk, short minimumLikelihood) {
|
||||
if (lk.length != LIKELYHOOD_SIZE) {
|
||||
throw new IllegalArgumentException("SinglePointCall: passed in likelyhood array size != LIKELYHOOD_SIZE");
|
||||
}
|
||||
this.offset = offset;
|
||||
this.min_depth = min_depth;
|
||||
this.rmsMapQ = rmsMapQ;
|
||||
this.minimumLikelihood = minimumLikelihood;
|
||||
System.arraycopy(lk, 0, this.lk, 0, LIKELYHOOD_SIZE);
|
||||
byteSize = 9 + lk.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* the BinaryCodec constructor
|
||||
*
|
||||
* @param in the binary codec to get data from
|
||||
*/
|
||||
SinglePointCall(BinaryCodec in) {
|
||||
offset = in.readInt();
|
||||
min_depth = Long.valueOf(in.readInt());
|
||||
rmsMapQ = in.readByte();
|
||||
for (int x = 0; x < LIKELYHOOD_SIZE; x++) {
|
||||
lk[x] = in.readUByte();
|
||||
}
|
||||
byteSize = 9 + lk.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the record to a binary codec
|
||||
*
|
||||
* @param out
|
||||
*/
|
||||
public void write(BinaryCodec out) {
|
||||
out.writeInt(offset);
|
||||
out.writeInt(min_depth.intValue());
|
||||
out.writeByte(rmsMapQ);
|
||||
for (int x = 0; x < LIKELYHOOD_SIZE; x++) {
|
||||
out.writeUByte(lk[x]);
|
||||
}
|
||||
}
|
||||
|
||||
public RECORD_TYPE getRecordType() {
|
||||
return RECORD_TYPE.SINGLE;
|
||||
}
|
||||
|
||||
/** @return */
|
||||
public int getByteSize() {
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
package org.broadinstitute.sting.utils.glf;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.io.File;
|
||||
import java.io.DataOutputStream;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 13, 2009
|
||||
* Time: 3:36:18 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
*/
|
||||
public class GLFWriter {
|
||||
// our output codec
|
||||
private final BinaryCodec outputBinaryCodec;
|
||||
|
||||
public static final short[] glfMagic = {'G', 'L', 'F', '\3'};
|
||||
private String headerText = "";
|
||||
private String referenceSequenceName = "";
|
||||
private long referenceSequenceLength = 0;
|
||||
|
||||
/**
|
||||
* The public constructor for creating a GLF object
|
||||
*
|
||||
* @param headerText the header text (currently unclear what the contents are)
|
||||
* @param referenceSequenceName the reference sequence name
|
||||
*/
|
||||
public GLFWriter( String headerText, String referenceSequenceName, int referenceSequenceLength, File writeTo ) {
|
||||
this.headerText = headerText;
|
||||
this.referenceSequenceName = referenceSequenceName;
|
||||
this.referenceSequenceLength = referenceSequenceLength;
|
||||
outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(writeTo)));
|
||||
outputBinaryCodec.setOutputFileName(writeTo.toString());
|
||||
this.writeHeader();
|
||||
}
|
||||
|
||||
/**
|
||||
* add a point genotype to the GLF writer
|
||||
*
|
||||
* @param refBase the reference base, as a char
|
||||
* @param genomicLoc the location, as an offset from the previous glf record
|
||||
* @param readDepth the read depth at the specified postion
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
* @param lhValues the LikelihoodObject, representing the genotype likelyhoods
|
||||
*/
|
||||
public void addPointCall( char refBase,
|
||||
int genomicLoc,
|
||||
int readDepth,
|
||||
short rmsMapQ,
|
||||
LikelihoodObject lhValues ) {
|
||||
|
||||
SinglePointCall call = new SinglePointCall(refBase, genomicLoc,
|
||||
readDepth,
|
||||
rmsMapQ,
|
||||
lhValues.toByteArray(),
|
||||
lhValues);
|
||||
call.write(this.outputBinaryCodec);
|
||||
}
|
||||
|
||||
/**
|
||||
* add a variable length (indel, deletion, etc) to the genotype writer
|
||||
*
|
||||
* @param refBase the reference base
|
||||
* @param genomicLoc the location, as an offset from the previous glf record
|
||||
* @param readDepth the read depth at the specified postion
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
* @param minimumLikelihood the minimum likelihood value
|
||||
* @param homozygProb1 the probability of the first homozygous indel allele
|
||||
* @param homozygProb2 the probability of the second homozygous indel allele
|
||||
* @param heterozygProb the probability of the heterozygote
|
||||
* @param indelLength1 the length of the first indel allele
|
||||
* @param indelLength2 the length of the second indel allele
|
||||
* @param indelSeq1 the sequence for the first indel allele
|
||||
* @param indelSeq2 the sequence for the second indel allele
|
||||
*/
|
||||
public void addVarLengthCall( char refBase,
|
||||
long genomicLoc,
|
||||
int readDepth,
|
||||
short rmsMapQ,
|
||||
short minimumLikelihood,
|
||||
short homozygProb1,
|
||||
short homozygProb2,
|
||||
short heterozygProb,
|
||||
int indelLength1,
|
||||
int indelLength2,
|
||||
char[] indelSeq1,
|
||||
char[] indelSeq2 ) {
|
||||
|
||||
short[] indexSeqEq1 = new short[indelSeq1.length];
|
||||
short[] indexSeqEq2 = new short[indelSeq2.length];
|
||||
for (int x = 0; x < indelSeq1.length; x++) {
|
||||
indexSeqEq1[x] = new Integer(0x00ff & indelSeq1[x]).shortValue();
|
||||
}
|
||||
for (int x = 0; x < indelSeq2.length; x++) {
|
||||
indexSeqEq2[x] = new Integer(0x00ff & indelSeq2[x]).shortValue();
|
||||
}
|
||||
|
||||
VariableLengthCall call = new VariableLengthCall(refBase,
|
||||
genomicLoc,
|
||||
readDepth,
|
||||
minimumLikelihood,
|
||||
rmsMapQ,
|
||||
homozygProb1,
|
||||
homozygProb2,
|
||||
heterozygProb,
|
||||
indelLength1,
|
||||
indelLength2,
|
||||
indexSeqEq1,
|
||||
indexSeqEq2);
|
||||
|
||||
call.write(this.outputBinaryCodec);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* add a GLF record to the output file
|
||||
* @param rec the GLF record to write.
|
||||
*/
|
||||
public void addGLFRecord( GLFRecord rec ) {
|
||||
rec.write(this.outputBinaryCodec);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the header information for the GLF file
|
||||
*
|
||||
*/
|
||||
private void writeHeader() {
|
||||
for (int x = 0; x < glfMagic.length; x++) {
|
||||
outputBinaryCodec.writeByte(glfMagic[x]);
|
||||
}
|
||||
if (!(headerText.equals(""))) {
|
||||
outputBinaryCodec.writeString(headerText, true, true);
|
||||
} else {
|
||||
outputBinaryCodec.writeInt(0);
|
||||
}
|
||||
outputBinaryCodec.writeString(referenceSequenceName, true, true);
|
||||
outputBinaryCodec.writeUInt(referenceSequenceLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* close the file
|
||||
*
|
||||
*/
|
||||
public void close() {
|
||||
outputBinaryCodec.writeByte((byte) 0);
|
||||
outputBinaryCodec.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -46,19 +46,23 @@ public class LikelihoodObject {
|
|||
AA, AT, AC, AG, CC, CT, CG, GG, GT, TT
|
||||
}
|
||||
|
||||
// the associated probabilities for each genotype
|
||||
// the associated negitive log likelihoods for each genotype
|
||||
final protected HashMap<GENOTYPE, Integer> likelihood = new HashMap<GENOTYPE, Integer>();
|
||||
|
||||
/** create a blank likelihood object */
|
||||
public LikelihoodObject() {
|
||||
for (GENOTYPE type : GENOTYPE.values()) {
|
||||
likelihood.put(type, 0);
|
||||
likelihood.put(type, 255);
|
||||
}
|
||||
}
|
||||
|
||||
// since the default case it to start with all infinity, we can choose any random base
|
||||
GENOTYPE greatest = GENOTYPE.AA;
|
||||
|
||||
|
||||
/**
|
||||
* create a likelyhood object, given an array of genotype scores in GLFv3 ordering
|
||||
* @param values
|
||||
* @param values an array of int's from 0 to 255, representing the negitive log likelihoods.
|
||||
*/
|
||||
public LikelihoodObject(int[] values) {
|
||||
if (values.length != GENOTYPE.values().length) {
|
||||
|
|
@ -77,22 +81,25 @@ public class LikelihoodObject {
|
|||
/**
|
||||
* set the likelihood, given it's probability and the genotype
|
||||
* @param type the genotype
|
||||
* @param likelyhood the likelihood as a float between 0 and 1, which is converted to a byte
|
||||
* @param lh the likelihood as a double between 0 and 1, which is converted to a byte
|
||||
*/
|
||||
public void setLikelihood(GENOTYPE type, float likelyhood) {
|
||||
likelihood.put(type,(int)Math.round(likelyhood*255.0));
|
||||
public void setLikelihood(GENOTYPE type, int lh) {
|
||||
if (lh < 0 || lh > 255) {
|
||||
throw new IllegalArgumentException("supplied likelihood must be between 0 and 255");
|
||||
}
|
||||
likelihood.put(type,lh);
|
||||
if (lh < likelihood.get(this.greatest)) {
|
||||
this.greatest = type;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* find the minimum likelihood value stored in the set
|
||||
* find the minimum likelihood value stored in the set. This represents the most likely genotype,
|
||||
* since genotypes are represented as negitive log likeihoods
|
||||
* @return
|
||||
*/
|
||||
public int getMinimumValue() {
|
||||
int minimum = Integer.MAX_VALUE;
|
||||
for (int i : likelihood.values()) {
|
||||
if (i < minimum) { minimum = i;}
|
||||
}
|
||||
return minimum;
|
||||
return likelihood.get(this.greatest);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,94 @@
|
|||
package org.broadinstitute.sting.utils.glf;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class SinglePointCall
|
||||
*
|
||||
* This class represents a single point geneotype call in GLF vernacular
|
||||
**/
|
||||
class SinglePointCall extends GLFRecord {
|
||||
|
||||
// our likelyhood array size
|
||||
public static final int LIKELYHOOD_SIZE = 10;
|
||||
|
||||
// our record type
|
||||
private final RECORD_TYPE type = RECORD_TYPE.SINGLE;
|
||||
|
||||
// our array of likelihoods
|
||||
public final short lk[] = new short[LIKELYHOOD_SIZE];
|
||||
|
||||
// our size, we're immutable (the size at least). In bytes.
|
||||
private final int byteSize;
|
||||
|
||||
|
||||
SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, short[] lk, LikelihoodObject minimumLikelihood ) {
|
||||
super(refBase,offset,(short)minimumLikelihood.getMinimumValue(),readDepth,rmsMapQ);
|
||||
|
||||
if (lk.length != LIKELYHOOD_SIZE) {
|
||||
throw new IllegalArgumentException("SinglePointCall: passed in likelyhood array size != LIKELYHOOD_SIZE");
|
||||
}
|
||||
|
||||
System.arraycopy(lk, 0, this.lk, 0, LIKELYHOOD_SIZE);
|
||||
byteSize = 9 + lk.length;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Write out the record to a binary codec
|
||||
*
|
||||
* @param out
|
||||
*/
|
||||
public void write( BinaryCodec out ) {
|
||||
super.write(out);
|
||||
for (int x = 0; x < LIKELYHOOD_SIZE; x++) {
|
||||
out.writeUByte(lk[x]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* return the record type we represent, in this case SINGLE
|
||||
* @return RECORD_TYPE.SINGLE
|
||||
*/
|
||||
public RECORD_TYPE getRecordType() {
|
||||
return RECORD_TYPE.SINGLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* return our size in bytes
|
||||
* @return number of bytes we represent
|
||||
*/
|
||||
public int getByteSize() {
|
||||
return byteSize;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
package org.broadinstitute.sting.utils.glf;
|
||||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VariableLengthCall
|
||||
* <p/>
|
||||
* This class represents variable length genotype calls in the GLF format.
|
||||
* Currently a lot of parameters need to be provided, but we may be able to thin
|
||||
* those down as we understand what we have to specify and what we can infer.
|
||||
*/
|
||||
class VariableLengthCall extends GLFRecord {
|
||||
// our fields, corresponding to the glf spec
|
||||
private short lkHom1 = 0;
|
||||
private short lkHom2 = 0;
|
||||
private short lkHet = 0;
|
||||
private int indelLen1 = 0;
|
||||
private int indelLen2 = 0;
|
||||
private final short indelSeq1[];
|
||||
private final short indelSeq2[];
|
||||
|
||||
// our size, which is immutable, in bytes
|
||||
private final int size;
|
||||
|
||||
|
||||
/** the default constructor */
|
||||
VariableLengthCall( char refBase,
|
||||
long offset,
|
||||
int readDepth,
|
||||
short minimumLikelihood,
|
||||
short rmsMapQ,
|
||||
short lkHom1,
|
||||
short lkHom2,
|
||||
short lkHet,
|
||||
int indelLen1,
|
||||
int indelLen2,
|
||||
final short indelSeq1[],
|
||||
final short indelSeq2[] ) {
|
||||
super(refBase, offset, minimumLikelihood, readDepth, rmsMapQ);
|
||||
this.lkHom1 = lkHom1;
|
||||
this.lkHom2 = lkHom2;
|
||||
this.lkHet = lkHet;
|
||||
this.indelLen1 = indelLen1;
|
||||
this.indelLen2 = indelLen2;
|
||||
this.indelSeq1 = indelSeq1;
|
||||
this.indelSeq2 = indelSeq2;
|
||||
size = 16 + indelSeq1.length + indelSeq2.length;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the record to a binary codec
|
||||
*
|
||||
* @param out the binary codec to write to
|
||||
*/
|
||||
public void write( BinaryCodec out ) {
|
||||
super.write(out);
|
||||
out.writeByte(lkHom1);
|
||||
out.writeByte(lkHom2);
|
||||
out.writeByte(lkHet);
|
||||
out.writeShort(new Integer(indelLen1).shortValue());
|
||||
out.writeShort(new Integer(indelLen2).shortValue());
|
||||
for (int x = 0; x < indelSeq1.length; x++) {
|
||||
out.writeUByte(indelSeq1[x]);
|
||||
}
|
||||
for (int x = 0; x < indelSeq2.length; x++) {
|
||||
out.writeUByte(indelSeq2[x]);
|
||||
}
|
||||
}
|
||||
|
||||
/** @return RECORD_TYPE.VARIABLE */
|
||||
public RECORD_TYPE getRecordType() {
|
||||
return RECORD_TYPE.VARIABLE;
|
||||
}
|
||||
|
||||
/** @return */
|
||||
public int getByteSize() {
|
||||
return size;
|
||||
}
|
||||
}
|
||||
|
|
@ -2,9 +2,13 @@ package org.broadinstitute.sting.utils.glf;
|
|||
|
||||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/*
|
||||
|
|
@ -39,39 +43,57 @@ import java.io.File;
|
|||
* <p/>
|
||||
* Tests for the GLFRecord class
|
||||
*/
|
||||
public class GLFRecordTest {
|
||||
public class GLFWriterTest extends BaseTest {
|
||||
|
||||
/** some made up values that we use to generate the GLF */
|
||||
private final String header = "header";
|
||||
private final String referenceSequenceName = "refSeq";
|
||||
private final String header = "";
|
||||
private final String referenceSequenceName = "chr1";
|
||||
private final int refLength = 1000;
|
||||
File writeTo = new File("testGLF.glf");
|
||||
|
||||
private GLFRecord rec;
|
||||
private GLFWriter rec;
|
||||
|
||||
@Before
|
||||
public void before() {
|
||||
rec = new GLFRecord(header, referenceSequenceName, refLength);
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* make a fake snp
|
||||
*
|
||||
* @param genotype the genotype, 0-15 (AA, AT, AA, ... GG)
|
||||
*/
|
||||
private void addFakeSNP(int genotype, int location) {
|
||||
private void addFakeSNP( int genotype, int location ) {
|
||||
LikelihoodObject obj = new LikelihoodObject();
|
||||
obj.setLikelihood(LikelihoodObject.GENOTYPE.values()[genotype],0.5f);
|
||||
rec.addSNPCall(location,10,10,obj);
|
||||
obj.setLikelihood(LikelihoodObject.GENOTYPE.values()[genotype], 128);
|
||||
int ran = (int) Math.floor(Math.random() * 4.0);
|
||||
char let = 'A';
|
||||
switch (ran) {
|
||||
case 0:
|
||||
let = 'T';
|
||||
break;
|
||||
case 1:
|
||||
let = 'C';
|
||||
break;
|
||||
case 2:
|
||||
let = 'G';
|
||||
break;
|
||||
}
|
||||
try {
|
||||
rec.addPointCall(let, location, 10, (short) 10, obj);
|
||||
} catch (IllegalArgumentException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void basicWrite() {
|
||||
File writeTo = new File("testGLF.glf");
|
||||
BinaryCodec codec = new BinaryCodec(writeTo, true);
|
||||
rec = new GLFWriter(header, referenceSequenceName, refLength, writeTo);
|
||||
for (int x = 0; x < 100; x++) {
|
||||
addFakeSNP(0,x);
|
||||
addFakeSNP((int) Math.round(Math.random() * 9), 1);
|
||||
}
|
||||
rec.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -90,7 +90,7 @@ public class LikelihoodObjectTest extends BaseTest {
|
|||
public void testSetLikelihood() {
|
||||
mLO = new LikelihoodObject();
|
||||
for (LikelihoodObject.GENOTYPE t : LikelihoodObject.GENOTYPE.values()) {
|
||||
mLO.setLikelihood(t,0.5f);
|
||||
mLO.setLikelihood(t,128);
|
||||
}
|
||||
assertTrue(mLO.likelihood.size() == LikelihoodObject.GENOTYPE.values().length);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue