changes for corrected GLF likelihood output, along with better tests

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1754 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-01 20:45:05 +00:00
parent 2309d19f6f
commit e885cc4b21
10 changed files with 214 additions and 79 deletions

View File

@ -18,7 +18,7 @@ import java.util.List;
* The single sample implementation of the genotype interface, which contains * The single sample implementation of the genotype interface, which contains
* extra information for the various genotype outputs * extra information for the various genotype outputs
*/ */
public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, LikelihoodsBacked { public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, LikelihoodsBacked, PosteriorsBacked {
private final char mRefBase; private final char mRefBase;
private final GenotypeLikelihoods mGenotypeLikelihoods; private final GenotypeLikelihoods mGenotypeLikelihoods;
@ -279,11 +279,19 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
* @return * @return
*/ */
public double[] getLikelihoods() { public double[] getLikelihoods() {
// TODO: this is wrong, obviously, but we've kept it for now to stay backward compatible with previous calls return this.mGenotypeLikelihoods.getLikelihoods();
return this.mGenotypeLikelihoods.getPosteriors();
} }
/**
* get the likelihood information for this
*
* @return
*/
@Override
public double[] getPosteriors() {
return this.mGenotypeLikelihoods.getPosteriors();
}
} }

View File

@ -110,7 +110,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
// a method to support getting the geli string, since the AlleleFrequencyEstimate is going away // a method to support getting the geli string, since the AlleleFrequencyEstimate is going away
public String toGeliString (Genotype locus) { public String toGeliString (Genotype locus) {
double likelihoods[]; double posteriors[];
int readDepth = -1; int readDepth = -1;
double nextVrsBest = 0; double nextVrsBest = 0;
double nextVrsRef = 0; double nextVrsRef = 0;
@ -121,17 +121,17 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
readDepth = ((ReadBacked)locus).getReadCount(); readDepth = ((ReadBacked)locus).getReadCount();
} }
if (!(locus instanceof GenotypesBacked)) { if (!(locus instanceof GenotypesBacked)) {
likelihoods = new double[10]; posteriors = new double[10];
Arrays.fill(likelihoods, 0.0); Arrays.fill(posteriors, 0.0);
} else { } else {
likelihoods = ((LikelihoodsBacked) locus).getLikelihoods(); posteriors = ((PosteriorsBacked) locus).getPosteriors();
double[] lks; double[] lks;
lks = Arrays.copyOf(likelihoods,likelihoods.length); lks = Arrays.copyOf(posteriors,posteriors.length);
Arrays.sort(lks); Arrays.sort(lks);
nextVrsBest = lks[9] - lks[8]; nextVrsBest = lks[9] - lks[8];
if (ref != 'X') { if (ref != 'X') {
int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal()); int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal());
nextVrsRef = lks[9] - likelihoods[index]; nextVrsRef = lks[9] - posteriors[index];
} }
} }
// we have to calcuate our own // we have to calcuate our own
@ -145,16 +145,16 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
locus.getBases(), locus.getBases(),
nextVrsRef, nextVrsRef,
nextVrsBest, nextVrsBest,
likelihoods[0], posteriors[0],
likelihoods[1], posteriors[1],
likelihoods[2], posteriors[2],
likelihoods[3], posteriors[3],
likelihoods[4], posteriors[4],
likelihoods[5], posteriors[5],
likelihoods[6], posteriors[6],
likelihoods[7], posteriors[7],
likelihoods[8], posteriors[8],
likelihoods[9])); posteriors[9]));
} }
} }

View File

@ -0,0 +1,18 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author aaron
* <p/>
* Interface PosteriorsBacked
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public interface PosteriorsBacked {
/**
* get the likelihood information for this
* @return
*/
public double[] getPosteriors();
}

View File

@ -98,15 +98,15 @@ public class GeliAdapter implements GenotypeWriter {
*/ */
@Override @Override
public void addGenotypeCall(Genotype locus) { public void addGenotypeCall(Genotype locus) {
double likelihoods[]; double posteriors[];
int readDepth = -1; int readDepth = -1;
double nextVrsBest = 0; double nextVrsBest = 0;
double nextVrsRef = 0; double nextVrsRef = 0;
if (!(locus instanceof GenotypesBacked)) { if (!(locus instanceof GenotypesBacked)) {
likelihoods = new double[10]; posteriors = new double[10];
Arrays.fill(likelihoods, Double.MIN_VALUE); Arrays.fill(posteriors, Double.MIN_VALUE);
} else { } else {
likelihoods = ((LikelihoodsBacked) locus).getLikelihoods(); posteriors = ((PosteriorsBacked) locus).getPosteriors();
} }
char ref = locus.getReference(); char ref = locus.getReference();

View File

@ -50,7 +50,7 @@ public class GeliTextWriter implements GenotypeWriter {
* @param locus the locus to add * @param locus the locus to add
*/ */
public void addGenotypeCall(Genotype locus) { public void addGenotypeCall(Genotype locus) {
double likelihoods[]; double posteriors[];
int readDepth = -1; int readDepth = -1;
double nextVrsBest = 0; double nextVrsBest = 0;
double nextVrsRef = 0; double nextVrsRef = 0;
@ -61,17 +61,17 @@ public class GeliTextWriter implements GenotypeWriter {
readDepth = ((ReadBacked)locus).getReadCount(); readDepth = ((ReadBacked)locus).getReadCount();
} }
if (!(locus instanceof GenotypesBacked)) { if (!(locus instanceof GenotypesBacked)) {
likelihoods = new double[10]; posteriors = new double[10];
Arrays.fill(likelihoods, 0.0); Arrays.fill(posteriors, 0.0);
} else { } else {
likelihoods = ((LikelihoodsBacked) locus).getLikelihoods(); posteriors = ((PosteriorsBacked) locus).getPosteriors();
double[] lks; double[] lks;
lks = Arrays.copyOf(likelihoods,likelihoods.length); lks = Arrays.copyOf(posteriors,posteriors.length);
Arrays.sort(lks); Arrays.sort(lks);
nextVrsBest = lks[9] - lks[8]; nextVrsBest = lks[9] - lks[8];
if (ref != 'X') { if (ref != 'X') {
int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal()); int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal());
nextVrsRef = lks[9] - likelihoods[index]; nextVrsRef = lks[9] - posteriors[index];
} }
} }
// we have to calcuate our own // we have to calcuate our own
@ -85,16 +85,16 @@ public class GeliTextWriter implements GenotypeWriter {
locus.getBases(), locus.getBases(),
nextVrsRef, nextVrsRef,
nextVrsBest, nextVrsBest,
likelihoods[0], posteriors[0],
likelihoods[1], posteriors[1],
likelihoods[2], posteriors[2],
likelihoods[3], posteriors[3],
likelihoods[4], posteriors[4],
likelihoods[5], posteriors[5],
likelihoods[6], posteriors[6],
likelihoods[7], posteriors[7],
likelihoods[8], posteriors[8],
likelihoods[9])); posteriors[9]));
} }
/** /**

View File

@ -108,7 +108,7 @@ public class GLFReader implements Iterator<GLFRecord> {
short rmsMapping = inputBinaryCodec.readUByte(); short rmsMapping = inputBinaryCodec.readUByte();
double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length]; double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length];
for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) { for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) {
lkValues[x] = inputBinaryCodec.readUByte(); lkValues[x] = (inputBinaryCodec.readUByte() / GLFRecord.LIKELIHOOD_SCALE_FACTOR + min_lk);
} }
return new SinglePointCall(refBase, offset, readDepth, rmsMapping, lkValues); return new SinglePointCall(refBase, offset, readDepth, rmsMapping, lkValues);
} }

View File

@ -40,7 +40,7 @@ import org.broadinstitute.sting.utils.StingException;
* field values. * field values.
*/ */
public abstract class GLFRecord { public abstract class GLFRecord {
public final static double LIKELIHOOD_SCALE_FACTOR = 1; public final static double LIKELIHOOD_SCALE_FACTOR = 10;
// fields common to all records // fields common to all records
@ -247,9 +247,9 @@ public abstract class GLFRecord {
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in"); if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
double min = vals[0]; double min = vals[0];
for (double d : vals) { for (double d : vals)
if (d < min) min = d; if (d < min) min = d;
}
return GLFRecord.toCappedShort(min); return GLFRecord.toCappedShort(min);
} }

View File

@ -113,7 +113,7 @@ public class GLFWriter implements GenotypeWriter {
// get likelihood information if available // get likelihood information if available
LikelihoodObject obj; LikelihoodObject obj;
if (locus instanceof GenotypesBacked) { if (locus instanceof LikelihoodsBacked) {
obj = new LikelihoodObject(((LikelihoodsBacked) locus).getLikelihoods(), LikelihoodObject.LIKELIHOOD_TYPE.LOG); obj = new LikelihoodObject(((LikelihoodsBacked) locus).getLikelihoods(), LikelihoodObject.LIKELIHOOD_TYPE.LOG);
} else { } else {
double values[] = new double[10]; double values[] = new double[10];
@ -122,12 +122,14 @@ public class GLFWriter implements GenotypeWriter {
} }
obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); // transform! ... to negitive log likelihoods obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); // transform! ... to negitive log likelihoods
double rms = -1; double rms = 0;
int readCount = 0;
// calculate the RMS mapping qualities and the read depth // calculate the RMS mapping qualities and the read depth
if (locus instanceof ReadBacked) { if (locus instanceof ReadBacked) {
rms = calculateRMS(((ReadBacked)locus).getReads()); rms = calculateRMS(((ReadBacked)locus).getReads());
readCount = ((ReadBacked)locus).getReadCount();
} }
this.addGenotypeCall(GenomeLocParser.getContigInfo(locus.getLocation().getContig()),(int)locus.getLocation().getStart(),(float)rms,ref,((ReadBacked)locus).getReadCount(),obj); this.addGenotypeCall(GenomeLocParser.getContigInfo(locus.getLocation().getContig()),(int)locus.getLocation().getStart(),(float)rms,ref,readCount,obj);
} }

View File

@ -62,9 +62,14 @@ public class SinglePointCall extends GLFRecord {
*/ */
void write(BinaryCodec out) { void write(BinaryCodec out) {
super.write(out); super.write(out);
short[] adjusted = new short[likelihoods.length];
// we want to scale our values
for (int x = 0; x < likelihoods.length; x++) {
adjusted[x] = GLFRecord.toCappedShort(LIKELIHOOD_SCALE_FACTOR * (Math.round(likelihoods[x]) - this.minimumLikelihood));
}
try { try {
for (double likelihood : likelihoods) { for (short value : adjusted) {
out.writeUByte(GLFRecord.toCappedShort(likelihood)); out.writeUByte(value);
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();

View File

@ -1,12 +1,23 @@
package org.broadinstitute.sting.utils.genotype.glf; package org.broadinstitute.sting.utils.genotype.glf;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import org.broadinstitute.sting.utils.genotype.LikelihoodsBacked;
import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/* /*
@ -47,51 +58,142 @@ public class GLFWriterTest extends BaseTest {
private final String header = ""; private final String header = "";
private final String referenceSequenceName = "chr1"; private final String referenceSequenceName = "chr1";
private final int refLength = 1000; private final int refLength = 1000;
File writeTo = new File("testGLF.glf"); private static final int GENOTYPE_COUNT = 10;
private GenotypeWriter rec; private GenotypeWriter rec;
protected static final String[] genotypes = {"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT"};
private static IndexedFastaSequenceFile seq;
protected final static double SIGNIFICANCE = 5.1;
@Before @Before
public void before() { public void before() {
} }
/** @BeforeClass
* make a fake snp public static void beforeTests() {
* try {
* @param genotype the genotype, 0-15 (AA, AT, AA, ... GG) seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
*/ } catch (FileNotFoundException e) {
private void addFakeSNP( int genotype, int location ) { throw new StingException("unable to load the sequence dictionary");
LikelihoodObject obj = new LikelihoodObject();
obj.setLikelihood(LikelihoodObject.GENOTYPE.values()[genotype], 128);
int ran = (int) Math.floor(Math.random() * 4.0);
char let = 'A';
switch (ran) {
case 0:
let = 'T';
break;
case 1:
let = 'C';
break;
case 2:
let = 'G';
break;
} }
/*try { GenomeLocParser.setupRefContigOrdering(seq);
rec.addPointCall(let, location, 10, (short) 10, obj);
} catch (IllegalArgumentException e) { }
e.printStackTrace();
} */
private FakeGenotype createGenotype(int bestGenotype, GenomeLoc location, char ref) {
double lk[] = new double[GENOTYPE_COUNT];
for (int x = 0; x < GENOTYPE_COUNT; x++) {
lk[x] = -15.0 - (double) x; // they'll all be unique like a snowflake
}
lk[bestGenotype] = -10.0; // lets make the best way better
return new FakeGenotype(location, genotypes[bestGenotype], ref, SIGNIFICANCE, lk);
}
@Test
public void basicWrite() {
File writeTo = new File("testGLF.glf");
writeTo.deleteOnExit();
rec = new GLFWriter(header, writeTo);
for (int x = 0; x < 100; x++) {
GenomeLoc loc = GenomeLocParser.createGenomeLoc(1, x + 1);
Genotype type = createGenotype(x % 10, loc, 'A');
rec.addGenotypeCall(type);
}
rec.close();
} }
@Test @Test
public void basicWrite() { public void basicWriteThenRead() {
File writeTo = new File("testGLF2.glf");
//writeTo.deleteOnExit();
List<FakeGenotype> types = new ArrayList<FakeGenotype>();
rec = new GLFWriter(header, writeTo); rec = new GLFWriter(header, writeTo);
for (int x = 0; x < 100; x++) { for (int x = 0; x < 100; x++) {
addFakeSNP((int) Math.round(Math.random() * 9), 1); GenomeLoc loc = GenomeLocParser.createGenomeLoc(1, x + 1);
FakeGenotype type = createGenotype(x % 10, loc, 'A');
types.add(type);
rec.addGenotypeCall(type);
} }
rec.close(); rec.close();
GLFReader reader = new GLFReader(writeTo);
int count = 0;
while (reader.hasNext()) {
GLFRecord rec = reader.next();
Assert.assertTrue(types.get(count).compareTo(FakeGenotype.toFakeGenotype((SinglePointCall) rec, reader.getReferenceName(), reader.getCurrentLocation())) == 0);
count++;
}
}
}
class FakeGenotype extends BasicGenotype implements LikelihoodsBacked, Comparable<FakeGenotype> {
private double[] likelihoods;
/**
* create a basic genotype, given the following fields
*
* @param location the genomic location
* @param genotype the genotype, as a string, where ploidy = string.length
* @param ref the reference base as a char
* @param negLog10PError the confidence score
*/
public FakeGenotype(GenomeLoc location, String genotype, char ref, double negLog10PError, double likelihoods[]) {
super(location, genotype, ref, negLog10PError);
this.likelihoods = likelihoods;
}
/**
* get the likelihood information for this
*
* @return
*/
@Override
public double[] getLikelihoods() {
return likelihoods;
}
@Override
public int compareTo(FakeGenotype that) {
if (this.getLocation().compareTo(that.getLocation()) != 0) {
System.err.println("Location's aren't equal; this = " + this.getLocation() + " that = " + that.getLocation());
return this.getLocation().compareTo(that.getLocation());
}
if (!this.getBases().equals(that.getBases())) {
System.err.println("getBases's aren't equal; this = " + this.getBases() + " that = " + that.getBases());
return -1;
}
for (int x = 0; x < this.likelihoods.length; x++) {
if (this.likelihoods[x] != that.getLikelihoods()[x]) {
System.err.println("likelihoods' aren't equal; this = " + this.likelihoods[x] + " that = " + that.getLikelihoods()[x]);
return -1;
}
}
return 0;
}
public static FakeGenotype toFakeGenotype(SinglePointCall record, String contig, int postition) {
double likelihoods[] = record.getLikelihoods();
char ref = record.getRefBase().toChar();
double significance = GLFWriterTest.SIGNIFICANCE;
int minIndex = 0;
for (int i = 0; i < likelihoods.length; i++) {
if (likelihoods[i] < likelihoods[minIndex]) minIndex = i;
}
for (int i = 0; i < likelihoods.length; i++) {
likelihoods[i] = likelihoods[i] * -1;
}
String genotype = GLFWriterTest.genotypes[minIndex];
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig, postition);
return new FakeGenotype(loc, genotype, ref, significance, likelihoods);
} }
} }