From 5546aa44169a2ccd97831a2320e1d63d9c62e1ed Mon Sep 17 00:00:00 2001 From: aaron Date: Mon, 22 Feb 2010 22:27:39 +0000 Subject: [PATCH] adding code to deal with the off-spec situation where our minimum likelihood is above the GLF max of 255. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2871 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/genotype/glf/GLFRecord.java | 35 ++++++++++--------- .../utils/genotype/glf/GLFSingleCall.java | 28 ++++++++++++--- .../genotype/glf/GLFVariableLengthCall.java | 19 +++++++--- .../UnifiedGenotyperIntegrationTest.java | 2 +- .../utils/genotype/glf/GLFWriterTest.java | 35 +++++++++++++++++++ 5 files changed, 94 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java index 3089aae3c..2b7f61cd6 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java @@ -47,7 +47,6 @@ public abstract class GLFRecord { protected String contig; protected REF_BASE refBase; protected long position = 1; - protected short minimumLikelihood = 0; protected int readDepth = 0; protected short rmsMapQ = 0; @@ -126,7 +125,7 @@ public abstract class GLFRecord { SINGLE((short) 1), VARIABLE((short) 2); - private final short fieldValue; // in kilograms + private final short fieldValue; // a place to store the type RECORD_TYPE(short value) { fieldValue = value; @@ -144,13 +143,12 @@ public abstract class GLFRecord { * @param contig the contig string * @param base the reference base in the reference * @param position the distance from the beginning of the reference seq - * @param minimumLikelihood it's minimum likelihood * @param readDepth the read depth at this position * @param rmsMapQ the root mean square of the mapping quality */ - public GLFRecord(String contig, char base, long position, short minimumLikelihood, int readDepth, short rmsMapQ) { + public GLFRecord(String contig, char base, long position, int readDepth, short rmsMapQ) { REF_BASE newBase = REF_BASE.toBase(base); - validateInput(contig, newBase, position, minimumLikelihood, readDepth, rmsMapQ); + validateInput(contig, newBase, position, readDepth, rmsMapQ); } /** @@ -159,12 +157,11 @@ public abstract class GLFRecord { * @param contig the contig string * @param base the reference base in the reference * @param position the distance from the beginning of the reference seq - * @param minimumLikelihood it's minimum likelihood * @param readDepth the read depth at this position * @param rmsMapQ the root mean square of the mapping quality */ - GLFRecord(String contig, REF_BASE base, long position, short minimumLikelihood, int readDepth, short rmsMapQ) { - validateInput(contig, base, position, minimumLikelihood, readDepth, rmsMapQ); + GLFRecord(String contig, REF_BASE base, long position, int readDepth, short rmsMapQ) { + validateInput(contig, base, position, readDepth, rmsMapQ); } /** @@ -173,11 +170,10 @@ public abstract class GLFRecord { * @param chromosome the reference contig, as a String * @param base the reference base in the reference, as a REF_BASE * @param position the distance from the beginning of the reference seq - * @param minimumLikelihood it's minimum likelihood * @param readDepth the read depth at this position * @param rmsMapQ the root mean square of the mapping quality */ - private void validateInput(String chromosome, REF_BASE base, long position, double minimumLikelihood, int readDepth, short rmsMapQ) { + private void validateInput(String chromosome, REF_BASE base, long position, int readDepth, short rmsMapQ) { // add any validation to the contig string here this.contig = chromosome; @@ -188,10 +184,10 @@ public abstract class GLFRecord { } this.position = position; - if (minimumLikelihood > 255 || minimumLikelihood < 0) { - throw new IllegalArgumentException("minimumLikelihood is out of bounds (0 to 0xffffffff) value passed = " + minimumLikelihood); - } - this.minimumLikelihood = GLFRecord.toCappedShort(minimumLikelihood); +// if (minimumLikelihood > 255 || minimumLikelihood < 0) { +// throw new IllegalArgumentException("minimumLikelihood is out of bounds (0 to 0xffffffff) value passed = " + minimumLikelihood); +// } +// this.minimumLikelihood = GLFRecord.toCappedShort(minimumLikelihood); if (readDepth > 16777215 || readDepth < 0) { throw new IllegalArgumentException("readDepth is out of bounds (0 to 0xffffff) value passed = " + readDepth); @@ -218,7 +214,7 @@ public abstract class GLFRecord { short bite = ((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); out.writeUInt(((Long) (offset)).intValue()); // we have to subtract one, we're an offset - long write = (long) ((long) (readDepth & 0xffffff) | (long) (this.minimumLikelihood & 0xff) << 24); + long write = (long) ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24); out.writeUInt(write); out.writeUByte((short) rmsMapQ); } @@ -276,7 +272,7 @@ public abstract class GLFRecord { } public short getMinimumLikelihood() { - return minimumLikelihood; + return calculateMinLikelihood(); } public int getReadDepth() { @@ -290,5 +286,12 @@ public abstract class GLFRecord { public String getContig() { return this.contig; } + + /** + * this method had to be abstracted so that the underlying records could set the minimum likelihood (ML) in the event + * that the ML is above 255. In this case the records need to scale the value appropriately, and warn the users. + * @return a short of the minimum likelihood. + */ + protected abstract short calculateMinLikelihood(); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java index e1f880d24..dd3dfbb70 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.genotype.glf; import net.sf.samtools.util.BinaryCodec; +import org.broadinstitute.sting.utils.Utils; /* @@ -39,7 +40,7 @@ public class GLFSingleCall extends GLFRecord { // our likelihoods array private double likelihoods[]; - + private double minLikelihood; /** * create a single * @@ -51,7 +52,8 @@ public class GLFSingleCall extends GLFRecord { * @param likelihoods the Likelihoods */ public GLFSingleCall(String contig, char refBase, int position, int readDepth, short rmsMapQ, double likelihoods[]) { - super(contig, refBase, position, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ); + super(contig, refBase, position, readDepth, rmsMapQ); + minLikelihood = GLFRecord.findMin(likelihoods); this.likelihoods = likelihoods; } @@ -64,8 +66,6 @@ public class GLFSingleCall extends GLFRecord { void write(BinaryCodec out, GLFRecord lastRec) { super.write(out, lastRec); short[] adjusted = new short[likelihoods.length]; - // find the minimum likelihood as a double - double minLikelihood = GLFRecord.findMin(likelihoods); // we want to scale our values for (int x = 0; x < likelihoods.length; x++) { @@ -98,7 +98,27 @@ public class GLFSingleCall extends GLFRecord { return likelihoods.length + super.getByteSize(); } + /** + * this method had to be abstracted so that the underlying records could set the minimum likelihood (ML) in the event + * that the ML is above 255. In this case the records need to scale their likelihood values appropriately, and warn the user. + * + * @return a short of the minimum likelihood. + */ + @Override + protected short calculateMinLikelihood() { + if (minLikelihood > 255.0) { + double scale = minLikelihood - 255.0; + this.minLikelihood = 255.0; + for (int x = 0; x < this.likelihoods.length; x++) + this.likelihoods[x] = this.likelihoods[x] - scale; + Utils.warnUser("GLFRecord: Locus " + this.getContig() + ":" + this.position + " had it's likelihood information scaled, the original likelihood values are unrecoverable"); + } + return toCappedShort(minLikelihood); + } + public double[] getLikelihoods() { return likelihoods; } + + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java index 491d9ff30..2276c13d6 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java @@ -45,7 +45,7 @@ public class GLFVariableLengthCall extends GLFRecord { private int indelLen2 = 0; private final short indelSeq1[]; private final short indelSeq2[]; - + private short minlikelihood; // our size, which is immutable, in bytes private final int size; @@ -57,7 +57,7 @@ public class GLFVariableLengthCall extends GLFRecord { * @param refBase the reference base * @param offset the location, as an offset from the previous glf record * @param offset the location, as an offset from the previous glf record - + s * @param readDepth the read depth at the specified postion * @param rmsMapQ the root mean square of the mapping quality * @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255 @@ -78,7 +78,7 @@ public class GLFVariableLengthCall extends GLFRecord { final short indelSeq1[], int indelTwoLength, final short indelSeq2[]) { - super(contig, refBase, offset, GLFRecord.toCappedShort(findMin(new double[]{lkHom1, lkHom2, lkHet})), readDepth, rmsMapQ); + super(contig, refBase, offset, readDepth, rmsMapQ); this.lkHom1 = GLFRecord.toCappedShort(lkHom1); this.lkHom2 = GLFRecord.toCappedShort(lkHom2); this.lkHet = GLFRecord.toCappedShort(lkHet); @@ -87,7 +87,7 @@ public class GLFVariableLengthCall extends GLFRecord { this.indelSeq1 = indelSeq1; this.indelSeq2 = indelSeq2; size = 16 + indelSeq1.length + indelSeq2.length; - + this.minlikelihood = GLFRecord.toCappedShort(findMin(new double[]{lkHom1, lkHom2, lkHet})); } /** @@ -120,6 +120,17 @@ public class GLFVariableLengthCall extends GLFRecord { return size + super.getByteSize(); } + /** + * this method had to be abstracted so that the underlying records could set the minimum likelihood (ML) in the event + * that the ML is above 255. In this case the records need to scale the value appropriately, and warn the users. + * + * @return a short of the minimum likelihood. + */ + @Override + protected short calculateMinLikelihood() { + return minlikelihood; + } + public short getLkHom1() { return lkHom1; } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 39d697a73..4484bce00 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOtherFormat() { HashMap e = new HashMap(); - e.put( "GLF", "d5d1c5ea8d42712d5509cd1c9c38359d" ); + e.put( "GLF", "ddb1074b6f4a0fd1e15e4381476f1055" ); e.put( "GELI_BINARY", "764a0fed1b3cf089230fd91f3be9c2df" ); for ( Map.Entry entry : e.entrySet() ) { diff --git a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterTest.java b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterTest.java index 0f3b9f5d3..64ffb919c 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFWriterTest.java @@ -93,6 +93,23 @@ public class GLFWriterTest extends BaseTest { return new FakeGenotype(location, genotypes[bestGenotype], ref, SIGNIFICANCE, lk); } + /** + * create a fake genotype with a minimum likelihood greater than 255 + * @param bestGenotype the best genotype, as an index into the array of values + * @param location the location we're creating the genotype at + * @param ref the reference base + * @return a FakeGenotype (a fake genotype) + */ + private FakeGenotype createGreaterThan255MinimumGenotype(int bestGenotype, GenomeLoc location, char ref) { + double lk[] = new double[GENOTYPE_COUNT]; + for (int x = 0; x < GENOTYPE_COUNT; x++) { + lk[x] = -355.0 - (double) x; // they'll all be unique like a snowflake + } + lk[bestGenotype] = -256.0; // lets make the best way better + + return new FakeGenotype(location, genotypes[bestGenotype], ref, SIGNIFICANCE, lk); + } + /** * can we actually write a file? @@ -113,6 +130,24 @@ public class GLFWriterTest extends BaseTest { } + /** + * can we actually write a file? + */ + @Test + public void basicWriteGreaterMinimumLikelihood() { + File writeTo = new File("testGLF2.glf"); + writeTo.deleteOnExit(); + + rec = new GLFWriter(writeTo); + ((GLFWriter)rec).writeHeader(header); + for (int x = 0; x < 5; x++) { + GenomeLoc loc = GenomeLocParser.createGenomeLoc(1, x + 1); + Genotype type = createGreaterThan255MinimumGenotype(x % 10, loc, 'A'); + rec.addGenotypeCall(type); + } + rec.close(); + + } /** * write a bunch of fake records a GLF file, and then read it back from the