adding the GenotypeLikelihoodsWalker, which generates GLF genotype likelihoods that are pretty much identical to the samtools calls.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1235 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-07-14 16:57:18 +00:00
parent 99f9cd84ed
commit 01fc8da270
7 changed files with 137 additions and 26 deletions

View File

@ -0,0 +1,125 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import java.io.File;
import java.util.List;
/**
*
* @author aaron
*
* Class LikelihoodWalker
*
* a simple walker to calculate the genotype likelihoods
*/
@ReadFilters(ZeroMappingQualityReadFilter.class)
public class GenotypeLikelihoodsWalker extends LocusWalker<LikelihoodWrapper, GenotypeWriter> {
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = true) public File VARIANTS_FILE;
@Argument(fullName = "metrics_out", shortName = "metout", doc = "File to which metrics should be written", required = false) public File METRICS_FILE = new File("/dev/stderr");
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File to which metrics should be written", required = false) public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF;
@Override
public LikelihoodWrapper map(RefMetaDataTracker tracker, char ref, LocusContext context) {
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String bases = pileup.getBases();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
double rmsSum = 0.0;
// Handle single-base polymorphisms.
GenotypeLikelihoods G = new GenotypeLikelihoods();
for (int i = 0; i < reads.size(); i++) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
rmsSum += (read.getMappingQuality() * read.getMappingQuality());
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
}
// our return
LikelihoodWrapper wrap = new LikelihoodWrapper();
return wrapLikelihoods(ref, context, reads, rmsSum, G, wrap);
}
/**
* wrap the likelihood values in with the other data we'll need
* @param ref the ref base
* @param context the locus context
* @param reads the reads
* @param rmsSum the rms square total (not the actual rms yet)
* @param g the genotypeLikelihoods
* @param wrap the object to place the values into
* @return a likelihood wrapper
*/
private LikelihoodWrapper wrapLikelihoods(char ref, LocusContext context, List<SAMRecord> reads, double rmsSum, GenotypeLikelihoods g, LikelihoodWrapper wrap) {
wrap.obj = new LikelihoodObject();
wrap.obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.LOG);
for (int x = 0; x < GenotypeLikelihoods.genotypes.length; x++) {
wrap.obj.setLikelihood(LikelihoodObject.GENOTYPE.valueOf(GenotypeLikelihoods.genotypes[x]), g.likelihoods[x]*10.0);
}
wrap.obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG);
wrap.loc = GenomeLocParser.getContigInfo(context.getContig());
wrap.readDepth = reads.size();
float rms = (float)(Math.sqrt(rmsSum/reads.size()));
wrap.rms = (rms > 255) ? 255 : rms;
wrap.ref = ref;
wrap.position = context.getLocation().getStart();
return wrap;
}
/**
* Provide an initial value for reduce computations.
*
* @return Initial value of reduce.
*/
@Override
public GenotypeWriter reduceInit() {
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getEngine().getSAMHeader(),VARIANTS_FILE);
}
/**
* Reduces a single map with the accumulator provided as the ReduceType.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
*
* @return accumulator with result of the map taken into account.
*/
@Override
public GenotypeWriter reduce(LikelihoodWrapper value, GenotypeWriter sum) {
sum.addGenotypeCall(value.loc,(int)value.position,value.rms,value.ref,value.readDepth,value.obj);
return sum;
}
/** Close the variant file. */
public void onTraversalDone(GenotypeWriter result) {
result.close();
}
}
class LikelihoodWrapper {
public LikelihoodObject obj;
public SAMSequenceRecord loc;
public long position;
public float rms;
public char ref;
public int readDepth;
}

View File

@ -9,7 +9,6 @@ import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.playground.utils.AlleleMetrics;
import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods;
@ -21,8 +20,8 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import java.io.File;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.List;
@ReadFilters(ZeroMappingQualityReadFilter.class)
@ -106,10 +105,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
/** Initialize the walker with some sensible defaults */
public void initialize() {
metricsOut = new AlleleMetrics(METRICS_FILE, LOD_THRESHOLD);
if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) {
GenotypeLikelihoods.toGLFGenotypePattern();
metricsOut = new AlleleMetrics(METRICS_FILE, LOD_THRESHOLD);
mGenotypeWriter = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.GLF, GenomeAnalysisEngine.instance.getEngine().getSAMHeader(), VARIANTS_FILE);
} else {
try {

View File

@ -1,8 +1,8 @@
package org.broadinstitute.sting.playground.utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.QualityUtils;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
@ -408,17 +408,4 @@ public class GenotypeLikelihoods {
public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; }
public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; }
// TODO: this is bad, but the formats disagree now.
public static void toGLFGenotypePattern() {
genotypes[0] = "AA";
genotypes[1] = "AT";
genotypes[2] = "AC";
genotypes[3] = "AG";
genotypes[4] = "CC";
genotypes[5] = "CT";
genotypes[6] = "CG";
genotypes[7] = "GG";
genotypes[8] = "GT";
genotypes[9] = "TT";
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.utils.genotype;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
import java.io.File;
@ -32,7 +33,8 @@ public class GenotypeWriterFactory {
return new GLFWriter(header.toString(), destination);
case GELI:
return new GeliAdapter(destination, header);
default:
throw new StingException("Genotype writer " + format.toString() + " is not implemented");
}
return null;
}
}

View File

@ -48,7 +48,7 @@ public class LikelihoodObject {
// our possible genotypes, in order according to GLFv3
public enum GENOTYPE {
AA, AT, AC, AG, CC, CT, CG, GG, GT, TT
AA, AC, AG, AT, CC, CG, CT, GG, GT, TT
}
// our pileup of bases

View File

@ -194,7 +194,7 @@ abstract class GLFRecord {
short bite = ((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f)));
out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f)));
out.writeUInt(((Long) offset).intValue());
int write = ((new Long(readDepth).intValue()) | this.minimumLikelihood << 24);
int write = ((new Long(readDepth).intValue() & 0xffffff) | this.minimumLikelihood & 0xff << 24);
out.writeUInt(write);
out.writeUByte((short) rmsMapQ);
}
@ -233,14 +233,14 @@ abstract class GLFRecord {
*
* @return
*/
protected static double findMin(double vals[]) {
protected static short findMin(double vals[]) {
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
double min = vals[0];
for (double d : vals) {
if (d < min) min = d;
}
return min;
return (min > 255) ? 255 : (short)min;
}
}

View File

@ -55,7 +55,7 @@ public class GLFWriter implements GenotypeWriter {
private long referenceSequenceLength = 0;
// the last position written
private int lastPos = 0;
private int lastPos = 1;
/**
* The public constructor for creating a GLF object