Remodeled version of the UnifiedGenotyper.

We currently get identical lods and slods as MultiSampleCaller (except slods for ref calls, as I discussed with Jared) and are a bit faster in my few test cases.  Single-sample mode still emulates SSG.
The remaining to-do items:
1. more testing still needed
2. we currently only output lods/slods, but I need to emit actual calls
3. stubs are in place for Mark's proposed version of the EM calculation and now I need to add the actual code.
More check-ins coming soon...


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1821 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-10-13 20:27:01 +00:00
parent b28446acac
commit 96b8499a31
9 changed files with 488 additions and 324 deletions

View File

@@ -1,23 +1,37 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import java.util.List;
import java.util.*;
public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel {
public class AllMAFsGenotypeCalculationModel extends GenotypeCalculationModel {
protected AllMAFsGenotypeCalculationModel() {}
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) {
double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles
// This is just a stub...
// TODO --- implement me
// In this alternative approach we calculate the likelihoods of all possible
// MAFs (1-N) and choose the most likely one. We can probably be smart and
// avoid calculating MAFs that we know can't be the most likely...
return alleleFrequencies;
}
protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
return null;
}
protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
return null;
}
protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
}
protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
return null;
}
}

View File

@@ -4,224 +4,119 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.*;
public class EMGenotypeCalculationModel extends GenotypeCalculationModel {
public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel {
// We need to set a limit on the EM iterations in case something flukey goes on
private static final int MAX_EM_ITERATIONS = 6;
protected static final int MAX_EM_ITERATIONS = 6;
// We consider the EM stable when the MAF doesn't change more than 1/10N
private static final double EM_STABILITY_METRIC = 0.1;
protected static final double EM_STABILITY_METRIC = 0.1;
// keep track of some metrics about our calls
private CallMetrics callsMetrics = new CallMetrics();
protected CallMetrics callsMetrics = new CallMetrics();
protected EMGenotypeCalculationModel() {}
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
// keep track of the GenotypeLikelihoods for each sample, separated by strand
HashMap<String, UnifiedGenotypeLikelihoods> GLs = new HashMap<String, UnifiedGenotypeLikelihoods>();
// keep track of the total counts of each base in the pileup
// keep track of the context for each sample, overall and separated by strand
int[] baseCounts = new int[4];
int totalObservedBases = 0;
// First order of business: create the initial GenotypeLikelihoods objects.
// Also, confirm that there aren't too many deletions in the pileup.
int deletionsInPile = 0;
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i = 0; i < reads.size(); i++) {
// get the read and offset
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
// skip deletions
if ( offset == -1 ) {
// are there too many deletions in the pileup?
if ( ++deletionsInPile > maxDeletionsInPileup )
return null;
continue;
}
// get the base; don't use bad bases
char base = read.getReadString().charAt(offset);
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
if ( baseIndex == -1 )
continue;
// create the GL holder object if this is the first time we're seeing a base for this sample
String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out
String sample = read.getHeader().getReadGroup(readGroup).getSample();
UnifiedGenotypeLikelihoods myGLs = GLs.get(sample);
if ( myGLs == null ) {
myGLs = new UnifiedGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform, VERBOSE);
GLs.put(sample, myGLs);
}
// assign the base to the appropriate strand
myGLs.add(base, read, offset);
// update the base counts
baseCounts[baseIndex]++;
totalObservedBases++;
}
// for now, we need to special-case single sample mode
if ( samples.size() == 1 ) {
String sample = samples.iterator().next();
UnifiedGenotypeLikelihoods UGL = GLs.get(sample);
// if there were no good bases, the likelihoods object wouldn't exist
if ( UGL == null )
return null;
callsMetrics.nCalledBases++;
UGL.setPriors(priors);
GenotypeCall call = new GenotypeCall(sample,context.getLocation(), ref, UGL.getGenotypeLikelihoods(), new ReadBackedPileup(ref, context));
if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) {
double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
if ( confidence >= LOD_THRESHOLD ) {
callsMetrics.nConfidentCalls++;
out.addGenotypeCall(call);
} else {
callsMetrics.nNonConfidentCalls++;
}
}
return Arrays.asList(call);
}
callsMetrics.nCalledBases++;
// Next, we need to create initial allele frequencies.
// An intelligent guess would be the observed base ratios (plus some small number to account for sampling issues).
double[] alleleFrequencies = new double[4];
for (int i = 0; i < 4; i++)
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
for ( UnifiedGenotypeLikelihoods UGL : GLs.values() )
UGL.setPriors(AFPriors);
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, baseCounts);
if ( contexts == null )
return null;
// debugging output
for (int i = 0; i < 4; i++)
logger.debug("Base count and initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i] + ", " + alleleFrequencies[i]);
logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]);
// The EM loop:
// we want to continue until the calculation is stable, but we need some max on the number of iterations
int iterations = 0;
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)GLs.size());
boolean isStable = false;
// run the EM calculation
EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL);
double lod = overall.getPofD() - overall.getPofNull();
logger.debug("lod=" + lod);
while ( !isStable && iterations < MAX_EM_ITERATIONS ) {
// calculate strand score
EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD);
EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE);
double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
// calculate the posterior-weighted allele frequencies and modify the priors accordingly
double[] newAlleleFrequencies = getPosteriorWeightedFrequencies(GLs.values());
AFPriors = calculateAlleleFreqBasedPriors(newAlleleFrequencies);
for ( UnifiedGenotypeLikelihoods UGL : GLs.values() )
UGL.setPriors(AFPriors);
// TODO -- finish me...
// determine whether we're stable
double AF_delta = 0.0;
for (int i = 0; i < 4; i++) {
AF_delta += Math.abs(alleleFrequencies[i] - newAlleleFrequencies[i]);
logger.debug("Previous allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i] + ", vs. new frequency: " + newAlleleFrequencies[i]);
}
System.out.println(String.format("LOD=%f, SLOD=%f", lod, strandScore));
isStable = AF_delta < EM_STABILITY_VALUE;
iterations++;
alleleFrequencies = newAlleleFrequencies;
}
logger.debug("EM loop took " + iterations + " iterations");
if (true)
throw new RuntimeException("DEBUGGING");
// compute actual priors: theta / MAF
double pF = computeThetaOverMAF(ref, alleleFrequencies, GLs.size());
// the posteriors from the EM loop are the population likelihoods here
// TODO -- finish me
/*
ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods;
pD = Compute_pD(G, em_result.sample_weights);
pNull = Compute_pNull(contexts, em_result.sample_weights);
// Apply p(f).
double pVar = 0.0;
for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; }
double p0 = Math.log10(1 - pVar);
double pF;
double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods);
if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; }
else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); }
pD = pD + pF;
pNull = pNull + p0;
lod = pD - pNull;
return lod;
*/
// make a call
// List<GenotypeCall> calls
//return new GenotypeCall(context.getLocation(), ref,gl, pileup);
//out.addMultiSampleCall((Genotype)calls);
//out.addMultiSampleCall((Genotype)calls, GenotypeMetaData metadata);
return null;
}
double[] getPosteriorWeightedFrequencies(Collection<UnifiedGenotypeLikelihoods> UGLs) {
double[] newAlleleLikelihoods = new double[4];
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) {
for ( UnifiedGenotypeLikelihoods UGL : UGLs ) {
// get initial allele frequencies
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts);
for (int i = 0; i < alleleFrequencies.length; i++)
logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
// calculate the posterior weighted frequencies for this sample
double[] personalAllelePosteriors = new double[4];
double sum = 0;
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double posterior = Math.pow(10, UGL.getGenotypeLikelihoods().getPosterior(g));
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior;
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior;
sum += 2.0 * posterior;
}
// get the initial genotype likelihoods
HashMap<String, GenotypeLikelihoods> GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType);
// normalize
for (int i = 0; i < 4; i++)
personalAllelePosteriors[i] /= sum;
for (int i = 0; i < 4; i++)
newAlleleLikelihoods[i] += personalAllelePosteriors[i];
}
callsMetrics.nCalledBases++;
// The EM loop:
// we want to continue until the calculation is stable, but we need some max on the number of iterations
int iterations = 0;
boolean EM_IS_STABLE;
// normalize
double sum = 0;
for (int i = 0; i < 4; i++)
sum += newAlleleLikelihoods[i];
for (int i = 0; i < 4; i++)
newAlleleLikelihoods[i] /= sum;
do {
double[] newAlleleFrequencies = calculateAlleleFrequencyPosteriors(GLs);
for (int i = 0; i < alleleFrequencies.length; i++)
logger.debug("New allele frequency for i=" + i + ": " + newAlleleFrequencies[i]);
return newAlleleLikelihoods;
applyAlleleFrequencyToGenotypeLikelihoods(GLs, newAlleleFrequencies);
EM_IS_STABLE = isStable(alleleFrequencies, newAlleleFrequencies, contexts.size());
alleleFrequencies = newAlleleFrequencies;
} while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE );
logger.debug("EM loop took " + iterations + " iterations");
for ( String sample : GLs.keySet() )
logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString());
return computePofF(ref, GLs, alleleFrequencies, contexts.size());
}
private DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts);
protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
protected abstract EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext);
protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext);
// determine whether we're stable
double AF_delta = 0.0;
for (int i = 0; i < oldAlleleFrequencies.length; i++)
AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
return (AF_delta < EM_STABILITY_VALUE);
}
protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
// convert to log-space
double[] log10Freqs = new double[4];
for (int i = 0; i < 4; i++)
@@ -240,25 +135,66 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel {
return new DiploidGenotypePriors(alleleFreqPriors);
}
private double computeThetaOverMAF(char ref, double[] alleleFrequencies, int samplesAtLocus) {
double MAF;
Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies);
if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) )
MAF = alleleFrequencies[sortedIndexes[3]];
else
MAF = alleleFrequencies[sortedIndexes[2]];
/**
* Create the mapping from sample to alignment context; also, fill in the base counts along the way.
* Returns null iff there are too many deletions at this position.
*/
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context, int[] baseCounts) {
int expectedChrs = (int)(2.0 * (double)samplesAtLocus * MAF);
HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
for (int i = 0; i < 4; i++)
baseCounts[i] = 0;
// TODO -- we need to use the priors from the UnifiedArgumentCollection
return DiploidGenotypePriors.HUMAN_HETEROZYGOSITY / (double)expectedChrs;
int deletionsInPile = 0;
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i = 0; i < reads.size(); i++) {
// get the read and offset
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
// find the sample; special case for pools
String sample;
if ( POOLED_INPUT ) {
sample = "POOL";
} else {
String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out
sample = read.getHeader().getReadGroup(readGroup).getSample();
}
// create a new context object if this is the first time we're seeing a read for this sample
AlignmentContextBySample myContext = contexts.get(sample);
if ( myContext == null ) {
myContext = new AlignmentContextBySample(context.getLocation());
contexts.put(sample, myContext);
}
// check for deletions
if ( offset == -1 ) {
// are there too many deletions in the pileup?
if ( ++deletionsInPile > maxDeletionsInPileup )
return null;
} else {
// add bad bases to the context (for DoC calculations), but don't count them
int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset));
if ( baseIndex != -1 )
baseCounts[baseIndex]++;
}
// add the read to this sample's context
myContext.add(read, offset);
}
return contexts;
}
/**
* A class to keep track of some basic metrics about our calls
*/
private class CallMetrics {
protected class CallMetrics {
long nConfidentCalls = 0;
long nNonConfidentCalls = 0;
long nCalledBases = 0;
@@ -266,8 +202,96 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel {
CallMetrics() {}
public String toString() {
return String.format("SSG: %d confident and %d non-confident calls were made at %d bases",
return String.format("UG: %d confident and %d non-confident calls were made at %d bases",
nConfidentCalls, nNonConfidentCalls, nCalledBases);
}
}
/**
* A class to keep track of the EM output
*/
protected class EMOutput {
private double pD, pNull, pF;
private HashMap<String, GenotypeLikelihoods> GLs;
EMOutput(double pD, double pNull, double pF, HashMap<String, GenotypeLikelihoods> GLs) {
this.pD = pD;
this.pNull = pNull;
this.pF = pF;
this.GLs = GLs;
}
public double getPofD() { return pD; }
public double getPofNull() { return pNull; }
public double getPofF() { return pF; }
public HashMap<String, GenotypeLikelihoods> getGenotypeLikelihoods() { return GLs; }
}
/**
* A class to keep track of the alignment context observed for a given sample.
* we currently store the overall context and strand-stratified sets,
* but any arbitrary stratification could be added.
*/
protected enum StratifiedContext { OVERALL, FORWARD, REVERSE }
protected class AlignmentContextBySample {
private AlignmentContext overall = null;
private AlignmentContext forward = null;
private AlignmentContext reverse = null;
private GenomeLoc loc;
private ArrayList<SAMRecord> allReads = new ArrayList<SAMRecord>();
private ArrayList<SAMRecord> forwardReads = new ArrayList<SAMRecord>();
private ArrayList<SAMRecord> reverseReads = new ArrayList<SAMRecord>();
private ArrayList<Integer> allOffsets = new ArrayList<Integer>();
private ArrayList<Integer> forwardOffsets = new ArrayList<Integer>();
private ArrayList<Integer> reverseOffsets = new ArrayList<Integer>();
AlignmentContextBySample(GenomeLoc loc) {
this.loc = loc;
}
public AlignmentContext getContext(StratifiedContext context) {
switch ( context ) {
case OVERALL: return getOverallContext();
case FORWARD: return getForwardContext();
case REVERSE: return getReverseContext();
}
return null;
}
private AlignmentContext getOverallContext() {
if ( overall == null )
overall = new AlignmentContext(loc, allReads, allOffsets);
return overall;
}
private AlignmentContext getForwardContext() {
if ( forward == null )
forward = new AlignmentContext(loc, forwardReads, forwardOffsets);
return forward;
}
private AlignmentContext getReverseContext() {
if ( reverse == null )
reverse = new AlignmentContext(loc, reverseReads, reverseOffsets);
return reverse;
}
public void add(SAMRecord read, int offset) {
if ( read.getReadNegativeStrandFlag() ) {
reverseReads.add(read);
reverseOffsets.add(offset);
} else {
forwardReads.add(read);
forwardOffsets.add(offset);
}
allReads.add(read);
allOffsets.add(offset);
}
}
}

View File

@@ -14,16 +14,18 @@ import java.util.*;
public abstract class GenotypeCalculationModel implements Cloneable {
public enum Model {
EM,
ALL_MAFS
EM_POINT_ESTIMATE,
EM_ALL_MAFS
}
protected BaseMismatchModel baseModel;
protected Set<String> samples;
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
protected GenotypeWriter out;
protected Logger logger;
protected double heterozygosity;
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
protected boolean GENOTYPE_MODE;
protected boolean POOLED_INPUT;
protected double LOD_THRESHOLD;
protected int maxDeletionsInPileup;
protected boolean VERBOSE;
@@ -37,34 +39,26 @@ public abstract class GenotypeCalculationModel implements Cloneable {
/**
* Initialize the GenotypeCalculationModel object
* Assumes that out is not null
* @param baseModel base model to use
* @param samples samples in input bam
* @param platform default platform
* @param out output writer
* @param out logger
* @param genotypeMode genotyping
* @param lod lod threshold
* @param maxDeletions max deletions to tolerate
* @param verbose verbose flag
* @param logger logger
* @param UAC unified arg collection
*/
protected void initialize(BaseMismatchModel baseModel,
Set<String> samples,
EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
GenotypeWriter out,
Logger logger,
boolean genotypeMode,
double lod,
int maxDeletions,
boolean verbose) {
this.baseModel = baseModel;
protected void initialize(Set<String> samples,
GenotypeWriter out,
Logger logger,
UnifiedArgumentCollection UAC) {
this.samples = samples;
defaultPlatform = platform;
this.out = out;
this.logger = logger;
GENOTYPE_MODE = genotypeMode;
LOD_THRESHOLD = lod;
maxDeletionsInPileup = maxDeletions;
VERBOSE = verbose;
baseModel = UAC.baseModel;
heterozygosity = UAC.heterozygosity;
defaultPlatform = UAC.defaultPlatform;
GENOTYPE_MODE = UAC.GENOTYPE;
POOLED_INPUT = UAC.POOLED;
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
maxDeletionsInPileup = UAC.MAX_DELETIONS;
VERBOSE = UAC.VERBOSE;
}
/**
@@ -74,12 +68,14 @@ public abstract class GenotypeCalculationModel implements Cloneable {
*/
protected Object clone() throws CloneNotSupportedException {
GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
gcm.baseModel = baseModel;
gcm.samples = new HashSet<String>(samples);
gcm.defaultPlatform = defaultPlatform;
gcm.out = out;
gcm.logger = logger;
gcm.baseModel = baseModel;
gcm.heterozygosity = heterozygosity;
gcm.defaultPlatform = defaultPlatform;
gcm.GENOTYPE_MODE = GENOTYPE_MODE;
gcm.POOLED_INPUT = POOLED_INPUT;
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
gcm.VERBOSE = VERBOSE;
@@ -88,6 +84,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
/**
* Must be overridden by concrete subclasses
* @param tracker rod data
* @param ref reference base
* @param context alignment context
* @param priors priors to use for GL

View File

@@ -52,22 +52,22 @@ public class GenotypeCalculationModelFactory {
* @param logger logger
* @return model
*/
public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC,
Set<String> samples,
public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
GenotypeWriter out,
Logger logger) {
Logger logger,
UnifiedArgumentCollection UAC) {
GenotypeCalculationModel gcm;
switch ( UAC.genotypeModel ) {
case EM:
gcm = new EMGenotypeCalculationModel();
case EM_POINT_ESTIMATE:
gcm = new PointEstimateGenotypeCalculationModel();
break;
case ALL_MAFS:
case EM_ALL_MAFS:
gcm = new AllMAFsGenotypeCalculationModel();
break;
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
}
gcm.initialize(UAC.baseModel, samples, UAC.defaultPlatform, out, logger, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
gcm.initialize(samples, out, logger, UAC);
return gcm;
}
}

View File

@@ -121,7 +121,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return get the one minus error value
*/
@Override
public double getNegLog10PError() {
return Math.abs(mGenotypeLikelihoods.getPosterior(getBestGenotype()) - mGenotypeLikelihoods.getPosterior(getNextBest()));
}
@@ -149,7 +148,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return the bases, as a string
*/
@Override
public String getBases() {
return getBestGenotype().toString();
}
@@ -159,7 +157,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return the ploidy value
*/
@Override
public int getPloidy() {
return 2;
}
@@ -169,7 +166,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return true if we're homozygous, false otherwise
*/
@Override
public boolean isHom() {
return getBestGenotype().isHom();
}
@@ -179,7 +175,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return true if we're het, false otherwise
*/
@Override
public boolean isHet() {
return !isHom();
}
@@ -190,7 +185,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return position on the genome wrapped in GenomeLoc object
*/
@Override
public GenomeLoc getLocation() {
return this.mLocation;
}
@@ -200,7 +194,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return true is a SNP
*/
@Override
public boolean isPointGenotype() {
return true;
}
@@ -212,7 +205,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return true if we're a variant
*/
@Override
public boolean isVariant(char ref) {
return !Utils.dupString(this.getReference(), 2).equals(getBestGenotype().toString());
}
@@ -287,13 +279,11 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return
*/
@Override
public double[] getPosteriors() {
return this.mGenotypeLikelihoods.getPosteriors();
}
/** @return returns the sample name for this genotype */
@Override
public String getSampleName() {
return this.mSampleName;
}
@@ -303,7 +293,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
*
* @return a string, representing the genotyping value
*/
@Override
public String getFilteringValue() {
return "0";
}

View File

@@ -0,0 +1,204 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.*;
public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculationModel {
protected PointEstimateGenotypeCalculationModel() {}
// overload this method so we can special-case the single sample
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
// we don't actually want to run EM for single samples
if ( samples.size() == 1 ) {
// split the context (so we can get forward/reverse contexts for free)
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, new int[4]);
if ( contexts == null )
return null;
// get the context for the sample
String sample = samples.iterator().next();
AlignmentContextBySample sampleContext = contexts.get(sample);
// if there were no good bases, the context wouldn't exist
if ( sampleContext == null )
return null;
callsMetrics.nCalledBases++;
// create the genotype call object
Pair<ReadBackedPileup, GenotypeLikelihoods> discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL);
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, discoveryGL.second, discoveryGL.first);
if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) {
double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
if ( confidence >= LOD_THRESHOLD ) {
callsMetrics.nConfidentCalls++;
out.addGenotypeCall(call);
} else {
callsMetrics.nNonConfidentCalls++;
}
}
return Arrays.asList(call);
}
return super.calculateGenotype(tracker, ref, context, priors);
}
private Pair<ReadBackedPileup, GenotypeLikelihoods> getSingleSampleLikelihoods(char ref, AlignmentContextBySample sampleContext, DiploidGenotypePriors priors, StratifiedContext contextType) {
// create the pileup
AlignmentContext myContext = sampleContext.getContext(contextType);
ReadBackedPileup pileup = new ReadBackedPileup(ref, myContext);
pileup.setIncludeDeletionsInPileupString(true);
// create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
}
protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) {
// An intelligent guess would be the observed base ratios
// (plus some small number to account for sampling issues).
int totalObservedBases = 0;
for (int i = 0; i < 4; i++)
totalObservedBases += baseCounts[i];
double[] alleleFrequencies = new double[4];
for (int i = 0; i < 4; i++)
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
return alleleFrequencies;
}
protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
for ( String sample : contexts.keySet() ) {
AlignmentContextBySample context = contexts.get(sample);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
// create the GenotypeLikelihoods object
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform);
GL.setVerbose(VERBOSE);
GL.add(pileup, true);
GLs.put(sample, GL);
}
return GLs;
}
protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
double[] newAlleleLikelihoods = new double[4];
for ( GenotypeLikelihoods GL : GLs.values() ) {
double[] normalizedPosteriors = GL.getNormalizedPosteriors();
// calculate the posterior weighted frequencies for this sample
double[] personalAllelePosteriors = new double[4];
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double posterior = normalizedPosteriors[g.ordinal()] / 2.0; // each base gets half the probability
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior;
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior;
}
for (int i = 0; i < 4; i++)
newAlleleLikelihoods[i] += personalAllelePosteriors[i];
}
// normalize
double sum = 0;
for (int i = 0; i < 4; i++)
sum += newAlleleLikelihoods[i];
for (int i = 0; i < 4; i++)
newAlleleLikelihoods[i] /= sum;
return newAlleleLikelihoods;
}
protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
for ( GenotypeLikelihoods GL : GLs.values() )
GL.setPriors(AFPriors);
}
protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
// Combines per-sample genotype likelihoods into the final variant-vs-reference
// evidence (pD vs. pNull), folding in population-genetics priors derived from
// the estimated minor allele frequency (MAF). All quantities are log10-scaled.
// compute pD and pNull without allele frequencies
double pD = compute_pD(GLs);
double pNull = compute_pNull(ref, GLs);
logger.debug("Original pD=" + pD + ", pNull=" + pNull);
// compute p0
// p0 is the log10 prior of NO variant at this site; pVar accumulates the
// theta/i harmonic series over the samples in this context.
// NOTE(review): the sum runs i = 1 .. numSamplesInContext-1; confirm whether it
// should instead run over chromosome count (2N) per the Watterson estimator.
double pVar = 0.0;
for (int i = 1; i < numSamplesInContext; i++)
pVar += heterozygosity/(double)i;
double p0 = Math.log10(1.0 - pVar);
// compute actual priors: theta / MAF
// The MAF is the most frequent allele that is not the reference base:
// sortedIndexes[3] is the most frequent allele, sortedIndexes[2] the runner-up.
double MAF;
Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies);
if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) )
MAF = alleleFrequencies[sortedIndexes[3]];
else
MAF = alleleFrequencies[sortedIndexes[2]];
// compute pF
// pF is the log10 prior of a variant at this MAF; when fewer than one
// chromosome is expected to carry the allele, fall back to the no-variant prior.
double pF;
double expectedChromosomes = 2.0 * (double)numSamplesInContext * MAF;
if ( expectedChromosomes < 1.0 )
pF = p0;
else
pF = Math.log10(heterozygosity / expectedChromosomes);
logger.debug("p0=" + p0 + ", pF=" + pF);
// fold the priors into the data likelihoods (addition in log10 space is
// multiplication of probabilities)
pD += pF;
pNull += p0;
logger.debug("Final pD=" + pD + ", pNull=" + pNull);
return new EMOutput(pD, pNull, pF, GLs);
}
private double compute_pD(HashMap<String, GenotypeLikelihoods> GLs) {
    // Log10 probability of the observed data: for each sample, sum the
    // linear-space posteriors over all diploid genotypes, then accumulate
    // the per-sample log10 totals.
    double logProbData = 0.0;
    for ( GenotypeLikelihoods sampleLikelihoods : GLs.values() ) {
        double linearSum = 0.0;
        for ( DiploidGenotype genotype : DiploidGenotype.values() )
            linearSum += Math.pow(10, sampleLikelihoods.getPosterior(genotype));
        logProbData += Math.log10(linearSum);
    }
    return logProbData;
}
private double compute_pNull(char ref, HashMap<String, GenotypeLikelihoods> GLs) {
    // Build the null-model allele frequencies: essentially all of the mass on
    // the reference base, with a small error term (1e-6) spread evenly over
    // the three non-reference bases.
    double[] nullFrequencies = new double[4];
    for (int base = 0; base < 4; base++)
        nullFrequencies[base] = 1e-6/3.0;
    nullFrequencies[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6;
    DiploidGenotypePriors nullPriors = calculateAlleleFreqBasedPriors(nullFrequencies);

    // Clone each sample's likelihoods so the null priors can be applied without
    // disturbing the originals, then score the data under the null model.
    HashMap<String, GenotypeLikelihoods> nullLikelihoods = new HashMap<String, GenotypeLikelihoods>();
    try {
        for ( Map.Entry<String, GenotypeLikelihoods> entry : GLs.entrySet() ) {
            GenotypeLikelihoods copy = (GenotypeLikelihoods)entry.getValue().clone();
            copy.setPriors(nullPriors);
            nullLikelihoods.put(entry.getKey(), copy);
        }
    } catch (CloneNotSupportedException e) {
        throw new StingException("Clone() not supported for given GenotypeLikelihoods subclass?");
    }
    return compute_pD(nullLikelihoods);
}
}

View File

@ -26,29 +26,23 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import java.io.File;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Sep 29, 2009
* Time: 12:07:43 PM
* To change this template use File | Settings | File Templates.
*/
public class UnifiedArgumentCollection {
// control the various models to be used
@Argument(fullName = "genotypeModel", shortName = "gm", doc = "Genotype calculation model to employ -- EM is the default choice; the ALL_MAFS model is experimental", required = false)
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM;
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development", required = false)
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
@Argument(fullName = "baseModel", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
@Argument(fullName = "pooled", shortName = "pooled", doc = "Does the input bam represent pooled data (so that genotypes can't be called)?", required = false)
public boolean POOLED = false;
// control the output
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
@ -65,10 +59,10 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
@Argument(fullName = "maxDeletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
public Integer MAX_DELETIONS = 1;
@Argument(fullName = "maxCoverage", shortName = "maxCoverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
@Argument(fullName = "max_coverage", shortName = "coverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
public Integer MAX_READS_IN_PILEUP = 10000;

View File

@ -1,58 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import net.sf.samtools.SAMRecord;
/**
A wrapper around the GenotypeLikelihoods class for the UnifiedGenotyper.
This class incorporates both strand-based likelihoods and a combined likelihoods over both strands.
*/
public class UnifiedGenotypeLikelihoods {

    // Likelihoods computed from forward-strand reads only, reverse-strand reads
    // only, and from all reads combined (used for strand-bias / slod work).
    private GenotypeLikelihoods forwardStrandGL;
    private GenotypeLikelihoods reverseStrandGL;
    private GenotypeLikelihoods combinedGL;

    /**
     * Creates three GenotypeLikelihoods instances (forward, reverse, combined)
     * from the given base-mismatch model, priors, and default platform.
     */
    public UnifiedGenotypeLikelihoods(BaseMismatchModel baseModel, DiploidGenotypePriors priors, EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform, boolean VERBOSE) {
        forwardStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
        reverseStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
        combinedGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
        forwardStrandGL.setVerbose(VERBOSE);
        reverseStrandGL.setVerbose(VERBOSE);
        combinedGL.setVerbose(VERBOSE);
    }

    public GenotypeLikelihoods getForwardStrandGenotypeLikelihoods() { return forwardStrandGL; }

    public GenotypeLikelihoods getReverseStrandGenotypeLikelihoods() { return reverseStrandGL; }

    public GenotypeLikelihoods getGenotypeLikelihoods() { return combinedGL; }

    /**
     * Adds every aligned base in the pileup; reads whose offset is -1
     * (deletions at this locus) are skipped.
     */
    public void add(ReadBackedPileup pileup) {
        int numReads = pileup.getReads().size();
        for (int readIndex = 0; readIndex < numReads; readIndex++) {
            int offset = pileup.getOffsets().get(readIndex);
            // offset == -1 marks a deletion in this read; ignore it
            if ( offset == -1 )
                continue;
            SAMRecord read = pileup.getReads().get(readIndex);
            add(read.getReadString().charAt(offset), read, offset);
        }
    }

    /**
     * Adds one base to the appropriate strand-specific likelihoods object
     * and always to the combined one.
     */
    public void add(char base, SAMRecord read, int offset) {
        byte qual = read.getBaseQualities()[offset];
        if ( read.getReadNegativeStrandFlag() )
            reverseStrandGL.add(base, qual, read, offset);
        else
            forwardStrandGL.add(base, qual, read, offset);
        combinedGL.add(base, qual, read, offset);
    }

    /** Pushes the given priors into all three likelihoods objects. */
    public void setPriors(DiploidGenotypePriors priors) {
        forwardStrandGL.setPriors(priors);
        reverseStrandGL.setPriors(priors);
        combinedGL.setPriors(priors);
    }
}

View File

@ -55,8 +55,8 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
public File VARIANTS_FILE = null;
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false)
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used; default is VCF", required = false)
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
// the model used for calculating genotypes
@ -79,10 +79,10 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
return (BaseUtils.simpleBaseToBaseIndex(ref) != -1 && context.getReads().size() != 0);
}
/** Enable deletions in the pileup */
/** Enable deletions in the pileup **/
public boolean includeReadsWithDeletionAtLoci() { return true; }
/** Initialize the walker with some sensible defaults */
/** Initialize the samples, output, and genotype calculation model **/
public void initialize() {
// get all of the unique sample names
@ -104,7 +104,7 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper",
this.getToolkit().getArguments().referenceFile.getName());
gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer, logger);
gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, writer, logger, UAC);
}
/**