changes for corrected GLF likelihood output, along with better tests

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1754 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-10-01 20:45:05 +00:00
parent 2309d19f6f
commit e885cc4b21
10 changed files with 214 additions and 79 deletions

View File

@ -18,7 +18,7 @@ import java.util.List;
* The single sample implementation of the genotype interface, which contains
* extra information for the various genotype outputs
*/
public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, LikelihoodsBacked {
public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, LikelihoodsBacked, PosteriorsBacked {
private final char mRefBase;
private final GenotypeLikelihoods mGenotypeLikelihoods;
@ -279,11 +279,19 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
* @return
*/
public double[] getLikelihoods() {
// TODO: this is wrong, obviously, but we've kept it for now to stay backward compatible with previous calls
return this.mGenotypeLikelihoods.getPosteriors();
return this.mGenotypeLikelihoods.getLikelihoods();
}
/**
* get the likelihood information for this
*
* @return
*/
@Override
public double[] getPosteriors() {
return this.mGenotypeLikelihoods.getPosteriors();
}
}

View File

@ -110,7 +110,7 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
// a method to support getting the geli string, since the AlleleFrequencyEstimate is going away
public String toGeliString (Genotype locus) {
double likelihoods[];
double posteriors[];
int readDepth = -1;
double nextVrsBest = 0;
double nextVrsRef = 0;
@ -121,17 +121,17 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
readDepth = ((ReadBacked)locus).getReadCount();
}
if (!(locus instanceof GenotypesBacked)) {
likelihoods = new double[10];
Arrays.fill(likelihoods, 0.0);
posteriors = new double[10];
Arrays.fill(posteriors, 0.0);
} else {
likelihoods = ((LikelihoodsBacked) locus).getLikelihoods();
posteriors = ((PosteriorsBacked) locus).getPosteriors();
double[] lks;
lks = Arrays.copyOf(likelihoods,likelihoods.length);
lks = Arrays.copyOf(posteriors,posteriors.length);
Arrays.sort(lks);
nextVrsBest = lks[9] - lks[8];
if (ref != 'X') {
int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal());
nextVrsRef = lks[9] - likelihoods[index];
nextVrsRef = lks[9] - posteriors[index];
}
}
// we have to calcuate our own
@ -145,16 +145,16 @@ public class CoverageEvalWalker extends LocusWalker<List<String>, String> {
locus.getBases(),
nextVrsRef,
nextVrsBest,
likelihoods[0],
likelihoods[1],
likelihoods[2],
likelihoods[3],
likelihoods[4],
likelihoods[5],
likelihoods[6],
likelihoods[7],
likelihoods[8],
likelihoods[9]));
posteriors[0],
posteriors[1],
posteriors[2],
posteriors[3],
posteriors[4],
posteriors[5],
posteriors[6],
posteriors[7],
posteriors[8],
posteriors[9]));
}
}

View File

@ -0,0 +1,18 @@
package org.broadinstitute.sting.utils.genotype;
/**
* @author aaron
* <p/>
* Interface PosteriorsBacked
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public interface PosteriorsBacked {
/**
* get the likelihood information for this
* @return
*/
public double[] getPosteriors();
}

View File

@ -98,15 +98,15 @@ public class GeliAdapter implements GenotypeWriter {
*/
@Override
public void addGenotypeCall(Genotype locus) {
double likelihoods[];
double posteriors[];
int readDepth = -1;
double nextVrsBest = 0;
double nextVrsRef = 0;
if (!(locus instanceof GenotypesBacked)) {
likelihoods = new double[10];
Arrays.fill(likelihoods, Double.MIN_VALUE);
posteriors = new double[10];
Arrays.fill(posteriors, Double.MIN_VALUE);
} else {
likelihoods = ((LikelihoodsBacked) locus).getLikelihoods();
posteriors = ((PosteriorsBacked) locus).getPosteriors();
}
char ref = locus.getReference();

View File

@ -50,7 +50,7 @@ public class GeliTextWriter implements GenotypeWriter {
* @param locus the locus to add
*/
public void addGenotypeCall(Genotype locus) {
double likelihoods[];
double posteriors[];
int readDepth = -1;
double nextVrsBest = 0;
double nextVrsRef = 0;
@ -61,17 +61,17 @@ public class GeliTextWriter implements GenotypeWriter {
readDepth = ((ReadBacked)locus).getReadCount();
}
if (!(locus instanceof GenotypesBacked)) {
likelihoods = new double[10];
Arrays.fill(likelihoods, 0.0);
posteriors = new double[10];
Arrays.fill(posteriors, 0.0);
} else {
likelihoods = ((LikelihoodsBacked) locus).getLikelihoods();
posteriors = ((PosteriorsBacked) locus).getPosteriors();
double[] lks;
lks = Arrays.copyOf(likelihoods,likelihoods.length);
lks = Arrays.copyOf(posteriors,posteriors.length);
Arrays.sort(lks);
nextVrsBest = lks[9] - lks[8];
if (ref != 'X') {
int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal());
nextVrsRef = lks[9] - likelihoods[index];
nextVrsRef = lks[9] - posteriors[index];
}
}
// we have to calcuate our own
@ -85,16 +85,16 @@ public class GeliTextWriter implements GenotypeWriter {
locus.getBases(),
nextVrsRef,
nextVrsBest,
likelihoods[0],
likelihoods[1],
likelihoods[2],
likelihoods[3],
likelihoods[4],
likelihoods[5],
likelihoods[6],
likelihoods[7],
likelihoods[8],
likelihoods[9]));
posteriors[0],
posteriors[1],
posteriors[2],
posteriors[3],
posteriors[4],
posteriors[5],
posteriors[6],
posteriors[7],
posteriors[8],
posteriors[9]));
}
/**

View File

@ -108,7 +108,7 @@ public class GLFReader implements Iterator<GLFRecord> {
short rmsMapping = inputBinaryCodec.readUByte();
double[] lkValues = new double[LikelihoodObject.GENOTYPE.values().length];
for (int x = 0; x < LikelihoodObject.GENOTYPE.values().length; x++) {
lkValues[x] = inputBinaryCodec.readUByte();
lkValues[x] = (inputBinaryCodec.readUByte() / GLFRecord.LIKELIHOOD_SCALE_FACTOR + min_lk);
}
return new SinglePointCall(refBase, offset, readDepth, rmsMapping, lkValues);
}

View File

@ -40,7 +40,7 @@ import org.broadinstitute.sting.utils.StingException;
* field values.
*/
public abstract class GLFRecord {
public final static double LIKELIHOOD_SCALE_FACTOR = 1;
public final static double LIKELIHOOD_SCALE_FACTOR = 10;
// fields common to all records
@ -247,9 +247,9 @@ public abstract class GLFRecord {
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
double min = vals[0];
for (double d : vals) {
for (double d : vals)
if (d < min) min = d;
}
return GLFRecord.toCappedShort(min);
}

View File

@ -113,7 +113,7 @@ public class GLFWriter implements GenotypeWriter {
// get likelihood information if available
LikelihoodObject obj;
if (locus instanceof GenotypesBacked) {
if (locus instanceof LikelihoodsBacked) {
obj = new LikelihoodObject(((LikelihoodsBacked) locus).getLikelihoods(), LikelihoodObject.LIKELIHOOD_TYPE.LOG);
} else {
double values[] = new double[10];
@ -122,12 +122,14 @@ public class GLFWriter implements GenotypeWriter {
}
obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); // transform! ... to negitive log likelihoods
double rms = -1;
double rms = 0;
int readCount = 0;
// calculate the RMS mapping qualities and the read depth
if (locus instanceof ReadBacked) {
rms = calculateRMS(((ReadBacked)locus).getReads());
readCount = ((ReadBacked)locus).getReadCount();
}
this.addGenotypeCall(GenomeLocParser.getContigInfo(locus.getLocation().getContig()),(int)locus.getLocation().getStart(),(float)rms,ref,((ReadBacked)locus).getReadCount(),obj);
this.addGenotypeCall(GenomeLocParser.getContigInfo(locus.getLocation().getContig()),(int)locus.getLocation().getStart(),(float)rms,ref,readCount,obj);
}

View File

@ -62,10 +62,15 @@ public class SinglePointCall extends GLFRecord {
*/
void write(BinaryCodec out) {
super.write(out);
try {
for (double likelihood : likelihoods) {
out.writeUByte(GLFRecord.toCappedShort(likelihood));
short[] adjusted = new short[likelihoods.length];
// we want to scale our values
for (int x = 0; x < likelihoods.length; x++) {
adjusted[x] = GLFRecord.toCappedShort(LIKELIHOOD_SCALE_FACTOR * (Math.round(likelihoods[x]) - this.minimumLikelihood));
}
try {
for (short value : adjusted) {
out.writeUByte(value);
}
} catch (Exception e) {
e.printStackTrace();
}

View File

@ -1,12 +1,23 @@
package org.broadinstitute.sting.utils.genotype.glf;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.LikelihoodsBacked;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/*
@ -47,51 +58,142 @@ public class GLFWriterTest extends BaseTest {
private final String header = "";
private final String referenceSequenceName = "chr1";
private final int refLength = 1000;
File writeTo = new File("testGLF.glf");
private static final int GENOTYPE_COUNT = 10;
private GenotypeWriter rec;
protected static final String[] genotypes = {"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT"};
private static IndexedFastaSequenceFile seq;
protected final static double SIGNIFICANCE = 5.1;
@Before
public void before() {
}
/**
* make a fake snp
*
* @param genotype the genotype, 0-15 (AA, AT, AA, ... GG)
*/
private void addFakeSNP( int genotype, int location ) {
LikelihoodObject obj = new LikelihoodObject();
obj.setLikelihood(LikelihoodObject.GENOTYPE.values()[genotype], 128);
int ran = (int) Math.floor(Math.random() * 4.0);
char let = 'A';
switch (ran) {
case 0:
let = 'T';
break;
case 1:
let = 'C';
break;
case 2:
let = 'G';
break;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
/*try {
rec.addPointCall(let, location, 10, (short) 10, obj);
} catch (IllegalArgumentException e) {
e.printStackTrace();
} */
GenomeLocParser.setupRefContigOrdering(seq);
}
private FakeGenotype createGenotype(int bestGenotype, GenomeLoc location, char ref) {
double lk[] = new double[GENOTYPE_COUNT];
for (int x = 0; x < GENOTYPE_COUNT; x++) {
lk[x] = -15.0 - (double) x; // they'll all be unique like a snowflake
}
lk[bestGenotype] = -10.0; // lets make the best way better
return new FakeGenotype(location, genotypes[bestGenotype], ref, SIGNIFICANCE, lk);
}
@Test
public void basicWrite() {
File writeTo = new File("testGLF.glf");
writeTo.deleteOnExit();
rec = new GLFWriter(header, writeTo);
for (int x = 0; x < 100; x++) {
GenomeLoc loc = GenomeLocParser.createGenomeLoc(1, x + 1);
Genotype type = createGenotype(x % 10, loc, 'A');
rec.addGenotypeCall(type);
}
rec.close();
}
@Test
public void basicWrite() {
public void basicWriteThenRead() {
File writeTo = new File("testGLF2.glf");
//writeTo.deleteOnExit();
List<FakeGenotype> types = new ArrayList<FakeGenotype>();
rec = new GLFWriter(header, writeTo);
for (int x = 0; x < 100; x++) {
addFakeSNP((int) Math.round(Math.random() * 9), 1);
GenomeLoc loc = GenomeLocParser.createGenomeLoc(1, x + 1);
FakeGenotype type = createGenotype(x % 10, loc, 'A');
types.add(type);
rec.addGenotypeCall(type);
}
rec.close();
GLFReader reader = new GLFReader(writeTo);
int count = 0;
while (reader.hasNext()) {
GLFRecord rec = reader.next();
Assert.assertTrue(types.get(count).compareTo(FakeGenotype.toFakeGenotype((SinglePointCall) rec, reader.getReferenceName(), reader.getCurrentLocation())) == 0);
count++;
}
}
}
class FakeGenotype extends BasicGenotype implements LikelihoodsBacked, Comparable<FakeGenotype> {
private double[] likelihoods;
/**
* create a basic genotype, given the following fields
*
* @param location the genomic location
* @param genotype the genotype, as a string, where ploidy = string.length
* @param ref the reference base as a char
* @param negLog10PError the confidence score
*/
public FakeGenotype(GenomeLoc location, String genotype, char ref, double negLog10PError, double likelihoods[]) {
super(location, genotype, ref, negLog10PError);
this.likelihoods = likelihoods;
}
/**
* get the likelihood information for this
*
* @return
*/
@Override
public double[] getLikelihoods() {
return likelihoods;
}
@Override
public int compareTo(FakeGenotype that) {
if (this.getLocation().compareTo(that.getLocation()) != 0) {
System.err.println("Location's aren't equal; this = " + this.getLocation() + " that = " + that.getLocation());
return this.getLocation().compareTo(that.getLocation());
}
if (!this.getBases().equals(that.getBases())) {
System.err.println("getBases's aren't equal; this = " + this.getBases() + " that = " + that.getBases());
return -1;
}
for (int x = 0; x < this.likelihoods.length; x++) {
if (this.likelihoods[x] != that.getLikelihoods()[x]) {
System.err.println("likelihoods' aren't equal; this = " + this.likelihoods[x] + " that = " + that.getLikelihoods()[x]);
return -1;
}
}
return 0;
}
public static FakeGenotype toFakeGenotype(SinglePointCall record, String contig, int postition) {
double likelihoods[] = record.getLikelihoods();
char ref = record.getRefBase().toChar();
double significance = GLFWriterTest.SIGNIFICANCE;
int minIndex = 0;
for (int i = 0; i < likelihoods.length; i++) {
if (likelihoods[i] < likelihoods[minIndex]) minIndex = i;
}
for (int i = 0; i < likelihoods.length; i++) {
likelihoods[i] = likelihoods[i] * -1;
}
String genotype = GLFWriterTest.genotypes[minIndex];
GenomeLoc loc = GenomeLocParser.createGenomeLoc(contig, postition);
return new FakeGenotype(loc, genotype, ref, significance, likelihoods);
}
}