Remodeled version of the UnifiedGenotyper.
We currently get identical lods and slods as MultiSampleCaller (except slods for ref calls, as I discussed with Jared) and are a bit faster in my few test cases. Single-sample mode still emulates SSG. The remaining to do items: 1. more testing still needed 2. we currently only output lods/slods, but I need to emit actual calls 3. stubs are in place for Mark's proposed version of the EM calculation and now I need to add the actual code. More check-ins coming soon... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1821 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b28446acac
commit
96b8499a31
|
|
@ -1,23 +1,37 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel {
|
||||
|
||||
public class AllMAFsGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||
|
||||
protected AllMAFsGenotypeCalculationModel() {}
|
||||
|
||||
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||
protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) {
|
||||
double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles
|
||||
|
||||
// This is just a stub...
|
||||
// TODO --- implement me
|
||||
|
||||
// In this alternative approach we calculate the likelihoods of all possible
|
||||
// MAFs (1-N) and choose the most likely one. We can probably be smart and
|
||||
// avoid calculating MAFs that we know can't be the most likely...
|
||||
|
||||
return alleleFrequencies;
|
||||
}
|
||||
|
||||
protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
||||
return null;
|
||||
}
|
||||
|
||||
protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
|
||||
return null;
|
||||
}
|
||||
|
||||
protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
|
||||
|
||||
}
|
||||
|
||||
protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -4,224 +4,119 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class EMGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||
public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||
|
||||
// We need to set a limit on the EM iterations in case something flukey goes on
|
||||
private static final int MAX_EM_ITERATIONS = 6;
|
||||
protected static final int MAX_EM_ITERATIONS = 6;
|
||||
|
||||
// We consider the EM stable when the MAF doesn't change more than 1/10N
|
||||
private static final double EM_STABILITY_METRIC = 0.1;
|
||||
protected static final double EM_STABILITY_METRIC = 0.1;
|
||||
|
||||
// keep track of some metrics about our calls
|
||||
private CallMetrics callsMetrics = new CallMetrics();
|
||||
protected CallMetrics callsMetrics = new CallMetrics();
|
||||
|
||||
|
||||
protected EMGenotypeCalculationModel() {}
|
||||
|
||||
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||
|
||||
// keep track of the GenotypeLikelihoods for each sample, separated by strand
|
||||
HashMap<String, UnifiedGenotypeLikelihoods> GLs = new HashMap<String, UnifiedGenotypeLikelihoods>();
|
||||
|
||||
// keep track of the total counts of each base in the pileup
|
||||
// keep track of the context for each sample, overall and separated by strand
|
||||
int[] baseCounts = new int[4];
|
||||
int totalObservedBases = 0;
|
||||
|
||||
// First order of business: create the initial GenotypeLikelihoods objects.
|
||||
// Also, confirm that there aren't too many deletions in the pileup.
|
||||
int deletionsInPile = 0;
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
|
||||
// get the read and offset
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
// skip deletions
|
||||
if ( offset == -1 ) {
|
||||
// are there too many deletions in the pileup?
|
||||
if ( ++deletionsInPile > maxDeletionsInPileup )
|
||||
return null;
|
||||
continue;
|
||||
}
|
||||
|
||||
// get the base; don't use bad bases
|
||||
char base = read.getReadString().charAt(offset);
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( baseIndex == -1 )
|
||||
continue;
|
||||
|
||||
// create the GL holder object if this is the first time we're seeing a base for this sample
|
||||
String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out
|
||||
String sample = read.getHeader().getReadGroup(readGroup).getSample();
|
||||
UnifiedGenotypeLikelihoods myGLs = GLs.get(sample);
|
||||
if ( myGLs == null ) {
|
||||
myGLs = new UnifiedGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform, VERBOSE);
|
||||
GLs.put(sample, myGLs);
|
||||
}
|
||||
|
||||
// assign the base to the appropriate strand
|
||||
myGLs.add(base, read, offset);
|
||||
|
||||
// update the base counts
|
||||
baseCounts[baseIndex]++;
|
||||
totalObservedBases++;
|
||||
}
|
||||
|
||||
// for now, we need to special-case single sample mode
|
||||
if ( samples.size() == 1 ) {
|
||||
String sample = samples.iterator().next();
|
||||
UnifiedGenotypeLikelihoods UGL = GLs.get(sample);
|
||||
// if there were no good bases, the likelihoods object wouldn't exist
|
||||
if ( UGL == null )
|
||||
return null;
|
||||
|
||||
callsMetrics.nCalledBases++;
|
||||
UGL.setPriors(priors);
|
||||
GenotypeCall call = new GenotypeCall(sample,context.getLocation(), ref, UGL.getGenotypeLikelihoods(), new ReadBackedPileup(ref, context));
|
||||
|
||||
if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) {
|
||||
double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
|
||||
if ( confidence >= LOD_THRESHOLD ) {
|
||||
callsMetrics.nConfidentCalls++;
|
||||
out.addGenotypeCall(call);
|
||||
} else {
|
||||
callsMetrics.nNonConfidentCalls++;
|
||||
}
|
||||
}
|
||||
return Arrays.asList(call);
|
||||
}
|
||||
|
||||
callsMetrics.nCalledBases++;
|
||||
|
||||
// Next, we need to create initial allele frequencies.
|
||||
// An intelligent guess would be the observed base ratios (plus some small number to account for sampling issues).
|
||||
|
||||
double[] alleleFrequencies = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
|
||||
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
|
||||
|
||||
for ( UnifiedGenotypeLikelihoods UGL : GLs.values() )
|
||||
UGL.setPriors(AFPriors);
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, baseCounts);
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
// debugging output
|
||||
for (int i = 0; i < 4; i++)
|
||||
logger.debug("Base count and initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i] + ", " + alleleFrequencies[i]);
|
||||
logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]);
|
||||
|
||||
|
||||
// The EM loop:
|
||||
// we want to continue until the calculation is stable, but we need some max on the number of iterations
|
||||
int iterations = 0;
|
||||
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
|
||||
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)GLs.size());
|
||||
boolean isStable = false;
|
||||
// run the EM calculation
|
||||
EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL);
|
||||
double lod = overall.getPofD() - overall.getPofNull();
|
||||
logger.debug("lod=" + lod);
|
||||
|
||||
while ( !isStable && iterations < MAX_EM_ITERATIONS ) {
|
||||
// calculate strand score
|
||||
EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD);
|
||||
EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE);
|
||||
double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
|
||||
double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
|
||||
logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
|
||||
double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
|
||||
|
||||
// calculate the posterior-weighted allele frequencies and modify the priors accordingly
|
||||
double[] newAlleleFrequencies = getPosteriorWeightedFrequencies(GLs.values());
|
||||
AFPriors = calculateAlleleFreqBasedPriors(newAlleleFrequencies);
|
||||
for ( UnifiedGenotypeLikelihoods UGL : GLs.values() )
|
||||
UGL.setPriors(AFPriors);
|
||||
// TODO -- finish me...
|
||||
|
||||
// determine whether we're stable
|
||||
double AF_delta = 0.0;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
AF_delta += Math.abs(alleleFrequencies[i] - newAlleleFrequencies[i]);
|
||||
logger.debug("Previous allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i] + ", vs. new frequency: " + newAlleleFrequencies[i]);
|
||||
}
|
||||
System.out.println(String.format("LOD=%f, SLOD=%f", lod, strandScore));
|
||||
|
||||
isStable = AF_delta < EM_STABILITY_VALUE;
|
||||
iterations++;
|
||||
|
||||
alleleFrequencies = newAlleleFrequencies;
|
||||
}
|
||||
|
||||
logger.debug("EM loop took " + iterations + " iterations");
|
||||
|
||||
if (true)
|
||||
throw new RuntimeException("DEBUGGING");
|
||||
|
||||
// compute actual priors: theta / MAF
|
||||
double pF = computeThetaOverMAF(ref, alleleFrequencies, GLs.size());
|
||||
|
||||
// the posteriors from the EM loop are the population likelihoods here
|
||||
// TODO -- finish me
|
||||
|
||||
|
||||
/*
|
||||
ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods;
|
||||
pD = Compute_pD(G, em_result.sample_weights);
|
||||
pNull = Compute_pNull(contexts, em_result.sample_weights);
|
||||
|
||||
// Apply p(f).
|
||||
double pVar = 0.0;
|
||||
for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; }
|
||||
|
||||
double p0 = Math.log10(1 - pVar);
|
||||
double pF;
|
||||
|
||||
double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods);
|
||||
|
||||
if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; }
|
||||
else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); }
|
||||
|
||||
pD = pD + pF;
|
||||
pNull = pNull + p0;
|
||||
|
||||
lod = pD - pNull;
|
||||
return lod;
|
||||
|
||||
*/
|
||||
|
||||
// make a call
|
||||
// List<GenotypeCall> calls
|
||||
//return new GenotypeCall(context.getLocation(), ref,gl, pileup);
|
||||
//out.addMultiSampleCall((Genotype)calls);
|
||||
//out.addMultiSampleCall((Genotype)calls, GenotypeMetaData metadata);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
double[] getPosteriorWeightedFrequencies(Collection<UnifiedGenotypeLikelihoods> UGLs) {
|
||||
double[] newAlleleLikelihoods = new double[4];
|
||||
public EMOutput runEM(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) {
|
||||
|
||||
for ( UnifiedGenotypeLikelihoods UGL : UGLs ) {
|
||||
// get initial allele frequencies
|
||||
double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts);
|
||||
for (int i = 0; i < alleleFrequencies.length; i++)
|
||||
logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]);
|
||||
|
||||
// calculate the posterior weighted frequencies for this sample
|
||||
double[] personalAllelePosteriors = new double[4];
|
||||
double sum = 0;
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double posterior = Math.pow(10, UGL.getGenotypeLikelihoods().getPosterior(g));
|
||||
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior;
|
||||
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior;
|
||||
sum += 2.0 * posterior;
|
||||
}
|
||||
// get the initial genotype likelihoods
|
||||
HashMap<String, GenotypeLikelihoods> GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType);
|
||||
|
||||
// normalize
|
||||
for (int i = 0; i < 4; i++)
|
||||
personalAllelePosteriors[i] /= sum;
|
||||
for (int i = 0; i < 4; i++)
|
||||
newAlleleLikelihoods[i] += personalAllelePosteriors[i];
|
||||
}
|
||||
callsMetrics.nCalledBases++;
|
||||
|
||||
// The EM loop:
|
||||
// we want to continue until the calculation is stable, but we need some max on the number of iterations
|
||||
int iterations = 0;
|
||||
boolean EM_IS_STABLE;
|
||||
|
||||
// normalize
|
||||
double sum = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
sum += newAlleleLikelihoods[i];
|
||||
for (int i = 0; i < 4; i++)
|
||||
newAlleleLikelihoods[i] /= sum;
|
||||
do {
|
||||
double[] newAlleleFrequencies = calculateAlleleFrequencyPosteriors(GLs);
|
||||
for (int i = 0; i < alleleFrequencies.length; i++)
|
||||
logger.debug("New allele frequency for i=" + i + ": " + newAlleleFrequencies[i]);
|
||||
|
||||
return newAlleleLikelihoods;
|
||||
applyAlleleFrequencyToGenotypeLikelihoods(GLs, newAlleleFrequencies);
|
||||
|
||||
EM_IS_STABLE = isStable(alleleFrequencies, newAlleleFrequencies, contexts.size());
|
||||
|
||||
alleleFrequencies = newAlleleFrequencies;
|
||||
|
||||
} while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE );
|
||||
|
||||
logger.debug("EM loop took " + iterations + " iterations");
|
||||
for ( String sample : GLs.keySet() )
|
||||
logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString());
|
||||
|
||||
return computePofF(ref, GLs, alleleFrequencies, contexts.size());
|
||||
}
|
||||
|
||||
private DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
|
||||
protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts);
|
||||
protected abstract HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType);
|
||||
protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs);
|
||||
protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies);
|
||||
protected abstract EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext);
|
||||
|
||||
|
||||
protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) {
|
||||
// We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N
|
||||
double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext);
|
||||
|
||||
// determine whether we're stable
|
||||
double AF_delta = 0.0;
|
||||
for (int i = 0; i < oldAlleleFrequencies.length; i++)
|
||||
AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]);
|
||||
|
||||
return (AF_delta < EM_STABILITY_VALUE);
|
||||
}
|
||||
|
||||
protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) {
|
||||
// convert to log-space
|
||||
double[] log10Freqs = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
|
|
@ -240,25 +135,66 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel {
|
|||
return new DiploidGenotypePriors(alleleFreqPriors);
|
||||
}
|
||||
|
||||
private double computeThetaOverMAF(char ref, double[] alleleFrequencies, int samplesAtLocus) {
|
||||
double MAF;
|
||||
Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies);
|
||||
if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) )
|
||||
MAF = alleleFrequencies[sortedIndexes[3]];
|
||||
else
|
||||
MAF = alleleFrequencies[sortedIndexes[2]];
|
||||
/**
|
||||
* Create the mapping from sample to alignment context; also, fill in the base counts along the way.
|
||||
* Returns null iff there are too many deletions at this position.
|
||||
*/
|
||||
protected HashMap<String, AlignmentContextBySample> splitContextBySample(AlignmentContext context, int[] baseCounts) {
|
||||
|
||||
int expectedChrs = (int)(2.0 * (double)samplesAtLocus * MAF);
|
||||
HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
|
||||
for (int i = 0; i < 4; i++)
|
||||
baseCounts[i] = 0;
|
||||
|
||||
// TODO -- we need to use the priors from the UnifiedArgumentCollection
|
||||
return DiploidGenotypePriors.HUMAN_HETEROZYGOSITY / (double)expectedChrs;
|
||||
int deletionsInPile = 0;
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
|
||||
// get the read and offset
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
// find the sample; special case for pools
|
||||
String sample;
|
||||
if ( POOLED_INPUT ) {
|
||||
sample = "POOL";
|
||||
} else {
|
||||
String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out
|
||||
sample = read.getHeader().getReadGroup(readGroup).getSample();
|
||||
}
|
||||
|
||||
// create a new context object if this is the first time we're seeing a read for this sample
|
||||
AlignmentContextBySample myContext = contexts.get(sample);
|
||||
if ( myContext == null ) {
|
||||
myContext = new AlignmentContextBySample(context.getLocation());
|
||||
contexts.put(sample, myContext);
|
||||
}
|
||||
|
||||
// check for deletions
|
||||
if ( offset == -1 ) {
|
||||
// are there too many deletions in the pileup?
|
||||
if ( ++deletionsInPile > maxDeletionsInPileup )
|
||||
return null;
|
||||
} else {
|
||||
// add bad bases to the context (for DoC calculations), but don't count them
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset));
|
||||
if ( baseIndex != -1 )
|
||||
baseCounts[baseIndex]++;
|
||||
}
|
||||
|
||||
// add the read to this sample's context
|
||||
myContext.add(read, offset);
|
||||
}
|
||||
|
||||
return contexts;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A class to keep track of some basic metrics about our calls
|
||||
*/
|
||||
private class CallMetrics {
|
||||
protected class CallMetrics {
|
||||
long nConfidentCalls = 0;
|
||||
long nNonConfidentCalls = 0;
|
||||
long nCalledBases = 0;
|
||||
|
|
@ -266,8 +202,96 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel {
|
|||
CallMetrics() {}
|
||||
|
||||
public String toString() {
|
||||
return String.format("SSG: %d confident and %d non-confident calls were made at %d bases",
|
||||
return String.format("UG: %d confident and %d non-confident calls were made at %d bases",
|
||||
nConfidentCalls, nNonConfidentCalls, nCalledBases);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A class to keep track of the EM output
|
||||
*/
|
||||
protected class EMOutput {
|
||||
private double pD, pNull, pF;
|
||||
private HashMap<String, GenotypeLikelihoods> GLs;
|
||||
|
||||
EMOutput(double pD, double pNull, double pF, HashMap<String, GenotypeLikelihoods> GLs) {
|
||||
this.pD = pD;
|
||||
this.pNull = pNull;
|
||||
this.pF = pF;
|
||||
this.GLs = GLs;
|
||||
}
|
||||
|
||||
public double getPofD() { return pD; }
|
||||
public double getPofNull() { return pNull; }
|
||||
public double getPofF() { return pF; }
|
||||
public HashMap<String, GenotypeLikelihoods> getGenotypeLikelihoods() { return GLs; }
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A class to keep track of the alignment context observed for a given sample.
|
||||
* we currently store the overall context and strand-stratified sets,
|
||||
* but any arbitrary stratification could be added.
|
||||
*/
|
||||
protected enum StratifiedContext { OVERALL, FORWARD, REVERSE }
|
||||
protected class AlignmentContextBySample {
|
||||
|
||||
private AlignmentContext overall = null;
|
||||
private AlignmentContext forward = null;
|
||||
private AlignmentContext reverse = null;
|
||||
private GenomeLoc loc;
|
||||
|
||||
private ArrayList<SAMRecord> allReads = new ArrayList<SAMRecord>();
|
||||
private ArrayList<SAMRecord> forwardReads = new ArrayList<SAMRecord>();
|
||||
private ArrayList<SAMRecord> reverseReads = new ArrayList<SAMRecord>();
|
||||
|
||||
private ArrayList<Integer> allOffsets = new ArrayList<Integer>();
|
||||
private ArrayList<Integer> forwardOffsets = new ArrayList<Integer>();
|
||||
private ArrayList<Integer> reverseOffsets = new ArrayList<Integer>();
|
||||
|
||||
|
||||
AlignmentContextBySample(GenomeLoc loc) {
|
||||
this.loc = loc;
|
||||
}
|
||||
|
||||
public AlignmentContext getContext(StratifiedContext context) {
|
||||
switch ( context ) {
|
||||
case OVERALL: return getOverallContext();
|
||||
case FORWARD: return getForwardContext();
|
||||
case REVERSE: return getReverseContext();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private AlignmentContext getOverallContext() {
|
||||
if ( overall == null )
|
||||
overall = new AlignmentContext(loc, allReads, allOffsets);
|
||||
return overall;
|
||||
}
|
||||
|
||||
private AlignmentContext getForwardContext() {
|
||||
if ( forward == null )
|
||||
forward = new AlignmentContext(loc, forwardReads, forwardOffsets);
|
||||
return forward;
|
||||
}
|
||||
|
||||
private AlignmentContext getReverseContext() {
|
||||
if ( reverse == null )
|
||||
reverse = new AlignmentContext(loc, reverseReads, reverseOffsets);
|
||||
return reverse;
|
||||
}
|
||||
|
||||
public void add(SAMRecord read, int offset) {
|
||||
if ( read.getReadNegativeStrandFlag() ) {
|
||||
reverseReads.add(read);
|
||||
reverseOffsets.add(offset);
|
||||
} else {
|
||||
forwardReads.add(read);
|
||||
forwardOffsets.add(offset);
|
||||
}
|
||||
allReads.add(read);
|
||||
allOffsets.add(offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -14,16 +14,18 @@ import java.util.*;
|
|||
public abstract class GenotypeCalculationModel implements Cloneable {
|
||||
|
||||
public enum Model {
|
||||
EM,
|
||||
ALL_MAFS
|
||||
EM_POINT_ESTIMATE,
|
||||
EM_ALL_MAFS
|
||||
}
|
||||
|
||||
protected BaseMismatchModel baseModel;
|
||||
protected Set<String> samples;
|
||||
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
||||
protected GenotypeWriter out;
|
||||
protected Logger logger;
|
||||
protected double heterozygosity;
|
||||
protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
|
||||
protected boolean GENOTYPE_MODE;
|
||||
protected boolean POOLED_INPUT;
|
||||
protected double LOD_THRESHOLD;
|
||||
protected int maxDeletionsInPileup;
|
||||
protected boolean VERBOSE;
|
||||
|
|
@ -37,34 +39,26 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
/**
|
||||
* Initialize the GenotypeCalculationModel object
|
||||
* Assumes that out is not null
|
||||
* @param baseModel base model to use
|
||||
* @param samples samples in input bam
|
||||
* @param platform default platform
|
||||
* @param out output writer
|
||||
* @param out logger
|
||||
* @param genotypeMode genotyping
|
||||
* @param lod lod threshold
|
||||
* @param maxDeletions max deletions to tolerate
|
||||
* @param verbose verbose flag
|
||||
* @param logger logger
|
||||
* @param UAC unified arg collection
|
||||
*/
|
||||
protected void initialize(BaseMismatchModel baseModel,
|
||||
Set<String> samples,
|
||||
EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform,
|
||||
GenotypeWriter out,
|
||||
Logger logger,
|
||||
boolean genotypeMode,
|
||||
double lod,
|
||||
int maxDeletions,
|
||||
boolean verbose) {
|
||||
this.baseModel = baseModel;
|
||||
protected void initialize(Set<String> samples,
|
||||
GenotypeWriter out,
|
||||
Logger logger,
|
||||
UnifiedArgumentCollection UAC) {
|
||||
this.samples = samples;
|
||||
defaultPlatform = platform;
|
||||
this.out = out;
|
||||
this.logger = logger;
|
||||
GENOTYPE_MODE = genotypeMode;
|
||||
LOD_THRESHOLD = lod;
|
||||
maxDeletionsInPileup = maxDeletions;
|
||||
VERBOSE = verbose;
|
||||
baseModel = UAC.baseModel;
|
||||
heterozygosity = UAC.heterozygosity;
|
||||
defaultPlatform = UAC.defaultPlatform;
|
||||
GENOTYPE_MODE = UAC.GENOTYPE;
|
||||
POOLED_INPUT = UAC.POOLED;
|
||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||
VERBOSE = UAC.VERBOSE;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -74,12 +68,14 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone();
|
||||
gcm.baseModel = baseModel;
|
||||
gcm.samples = new HashSet<String>(samples);
|
||||
gcm.defaultPlatform = defaultPlatform;
|
||||
gcm.out = out;
|
||||
gcm.logger = logger;
|
||||
gcm.baseModel = baseModel;
|
||||
gcm.heterozygosity = heterozygosity;
|
||||
gcm.defaultPlatform = defaultPlatform;
|
||||
gcm.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||
gcm.POOLED_INPUT = POOLED_INPUT;
|
||||
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
|
||||
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
|
||||
gcm.VERBOSE = VERBOSE;
|
||||
|
|
@ -88,6 +84,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param tracker rod data
|
||||
* @param ref reference base
|
||||
* @param context alignment context
|
||||
* @param priors priors to use for GL
|
||||
|
|
|
|||
|
|
@ -52,22 +52,22 @@ public class GenotypeCalculationModelFactory {
|
|||
* @param logger logger
|
||||
* @return model
|
||||
*/
|
||||
public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC,
|
||||
Set<String> samples,
|
||||
public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
|
||||
GenotypeWriter out,
|
||||
Logger logger) {
|
||||
Logger logger,
|
||||
UnifiedArgumentCollection UAC) {
|
||||
GenotypeCalculationModel gcm;
|
||||
switch ( UAC.genotypeModel ) {
|
||||
case EM:
|
||||
gcm = new EMGenotypeCalculationModel();
|
||||
case EM_POINT_ESTIMATE:
|
||||
gcm = new PointEstimateGenotypeCalculationModel();
|
||||
break;
|
||||
case ALL_MAFS:
|
||||
case EM_ALL_MAFS:
|
||||
gcm = new AllMAFsGenotypeCalculationModel();
|
||||
break;
|
||||
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
|
||||
}
|
||||
|
||||
gcm.initialize(UAC.baseModel, samples, UAC.defaultPlatform, out, logger, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE);
|
||||
gcm.initialize(samples, out, logger, UAC);
|
||||
return gcm;
|
||||
}
|
||||
}
|
||||
|
|
@ -121,7 +121,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return get the one minus error value
|
||||
*/
|
||||
@Override
|
||||
public double getNegLog10PError() {
|
||||
return Math.abs(mGenotypeLikelihoods.getPosterior(getBestGenotype()) - mGenotypeLikelihoods.getPosterior(getNextBest()));
|
||||
}
|
||||
|
|
@ -149,7 +148,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return the bases, as a string
|
||||
*/
|
||||
@Override
|
||||
public String getBases() {
|
||||
return getBestGenotype().toString();
|
||||
}
|
||||
|
|
@ -159,7 +157,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return the ploidy value
|
||||
*/
|
||||
@Override
|
||||
public int getPloidy() {
|
||||
return 2;
|
||||
}
|
||||
|
|
@ -169,7 +166,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return true if we're homozygous, false otherwise
|
||||
*/
|
||||
@Override
|
||||
public boolean isHom() {
|
||||
return getBestGenotype().isHom();
|
||||
}
|
||||
|
|
@ -179,7 +175,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return true if we're het, false otherwise
|
||||
*/
|
||||
@Override
|
||||
public boolean isHet() {
|
||||
return !isHom();
|
||||
}
|
||||
|
|
@ -190,7 +185,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return position on the genome wrapped in GenomeLoc object
|
||||
*/
|
||||
@Override
|
||||
public GenomeLoc getLocation() {
|
||||
return this.mLocation;
|
||||
}
|
||||
|
|
@ -200,7 +194,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return true is a SNP
|
||||
*/
|
||||
@Override
|
||||
public boolean isPointGenotype() {
|
||||
return true;
|
||||
}
|
||||
|
|
@ -212,7 +205,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return true if we're a variant
|
||||
*/
|
||||
@Override
|
||||
public boolean isVariant(char ref) {
|
||||
return !Utils.dupString(this.getReference(), 2).equals(getBestGenotype().toString());
|
||||
}
|
||||
|
|
@ -287,13 +279,11 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public double[] getPosteriors() {
|
||||
return this.mGenotypeLikelihoods.getPosteriors();
|
||||
}
|
||||
|
||||
/** @return returns the sample name for this genotype */
|
||||
@Override
|
||||
public String getSampleName() {
|
||||
return this.mSampleName;
|
||||
}
|
||||
|
|
@ -303,7 +293,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like
|
|||
*
|
||||
* @return a string, representing the genotyping value
|
||||
*/
|
||||
@Override
|
||||
public String getFilteringValue() {
|
||||
return "0";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,204 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculationModel {
|
||||
|
||||
protected PointEstimateGenotypeCalculationModel() {}
|
||||
|
||||
// overload this method so we can special-case the single sample
|
||||
public List<GenotypeCall> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||
|
||||
// we don't actually want to run EM for single samples
|
||||
if ( samples.size() == 1 ) {
|
||||
|
||||
// split the context (so we can get forward/reverse contexts for free)
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context, new int[4]);
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
// get the context for the sample
|
||||
String sample = samples.iterator().next();
|
||||
AlignmentContextBySample sampleContext = contexts.get(sample);
|
||||
|
||||
// if there were no good bases, the context wouldn't exist
|
||||
if ( sampleContext == null )
|
||||
return null;
|
||||
|
||||
callsMetrics.nCalledBases++;
|
||||
|
||||
// create the genotype call object
|
||||
Pair<ReadBackedPileup, GenotypeLikelihoods> discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL);
|
||||
GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, discoveryGL.second, discoveryGL.first);
|
||||
|
||||
if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) {
|
||||
double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError());
|
||||
if ( confidence >= LOD_THRESHOLD ) {
|
||||
callsMetrics.nConfidentCalls++;
|
||||
out.addGenotypeCall(call);
|
||||
} else {
|
||||
callsMetrics.nNonConfidentCalls++;
|
||||
}
|
||||
}
|
||||
return Arrays.asList(call);
|
||||
}
|
||||
|
||||
return super.calculateGenotype(tracker, ref, context, priors);
|
||||
}
|
||||
|
||||
private Pair<ReadBackedPileup, GenotypeLikelihoods> getSingleSampleLikelihoods(char ref, AlignmentContextBySample sampleContext, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
||||
// create the pileup
|
||||
AlignmentContext myContext = sampleContext.getContext(contextType);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, myContext);
|
||||
pileup.setIncludeDeletionsInPileupString(true);
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
GL.setVerbose(VERBOSE);
|
||||
GL.add(pileup, true);
|
||||
return new Pair<ReadBackedPileup, GenotypeLikelihoods>(pileup, GL);
|
||||
}
|
||||
|
||||
protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) {
|
||||
// An intelligent guess would be the observed base ratios
|
||||
// (plus some small number to account for sampling issues).
|
||||
int totalObservedBases = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
totalObservedBases += baseCounts[i];
|
||||
|
||||
double[] alleleFrequencies = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases;
|
||||
return alleleFrequencies;
|
||||
}
|
||||
|
||||
protected HashMap<String, GenotypeLikelihoods> initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
||||
HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
|
||||
|
||||
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
|
||||
|
||||
for ( String sample : contexts.keySet() ) {
|
||||
AlignmentContextBySample context = contexts.get(sample);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform);
|
||||
GL.setVerbose(VERBOSE);
|
||||
GL.add(pileup, true);
|
||||
|
||||
GLs.put(sample, GL);
|
||||
}
|
||||
|
||||
return GLs;
|
||||
}
|
||||
|
||||
protected double[] calculateAlleleFrequencyPosteriors(HashMap<String, GenotypeLikelihoods> GLs) {
|
||||
double[] newAlleleLikelihoods = new double[4];
|
||||
|
||||
for ( GenotypeLikelihoods GL : GLs.values() ) {
|
||||
double[] normalizedPosteriors = GL.getNormalizedPosteriors();
|
||||
|
||||
// calculate the posterior weighted frequencies for this sample
|
||||
double[] personalAllelePosteriors = new double[4];
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double posterior = normalizedPosteriors[g.ordinal()] / 2.0; // each base gets half the probability
|
||||
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior;
|
||||
personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
newAlleleLikelihoods[i] += personalAllelePosteriors[i];
|
||||
}
|
||||
|
||||
// normalize
|
||||
double sum = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
sum += newAlleleLikelihoods[i];
|
||||
for (int i = 0; i < 4; i++)
|
||||
newAlleleLikelihoods[i] /= sum;
|
||||
|
||||
return newAlleleLikelihoods;
|
||||
}
|
||||
|
||||
protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies) {
|
||||
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies);
|
||||
for ( GenotypeLikelihoods GL : GLs.values() )
|
||||
GL.setPriors(AFPriors);
|
||||
}
|
||||
|
||||
protected EMOutput computePofF(char ref, HashMap<String, GenotypeLikelihoods> GLs, double[] alleleFrequencies, int numSamplesInContext) {
|
||||
|
||||
// compute pD and pNull without allele frequencies
|
||||
double pD = compute_pD(GLs);
|
||||
double pNull = compute_pNull(ref, GLs);
|
||||
logger.debug("Original pD=" + pD + ", pNull=" + pNull);
|
||||
|
||||
// compute p0
|
||||
double pVar = 0.0;
|
||||
for (int i = 1; i < numSamplesInContext; i++)
|
||||
pVar += heterozygosity/(double)i;
|
||||
double p0 = Math.log10(1.0 - pVar);
|
||||
|
||||
// compute actual priors: theta / MAF
|
||||
double MAF;
|
||||
Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies);
|
||||
if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) )
|
||||
MAF = alleleFrequencies[sortedIndexes[3]];
|
||||
else
|
||||
MAF = alleleFrequencies[sortedIndexes[2]];
|
||||
|
||||
// compute pF
|
||||
double pF;
|
||||
double expectedChromosomes = 2.0 * (double)numSamplesInContext * MAF;
|
||||
if ( expectedChromosomes < 1.0 )
|
||||
pF = p0;
|
||||
else
|
||||
pF = Math.log10(heterozygosity / expectedChromosomes);
|
||||
logger.debug("p0=" + p0 + ", pF=" + pF);
|
||||
|
||||
pD += pF;
|
||||
pNull += p0;
|
||||
logger.debug("Final pD=" + pD + ", pNull=" + pNull);
|
||||
|
||||
return new EMOutput(pD, pNull, pF, GLs);
|
||||
}
|
||||
|
||||
private double compute_pD(HashMap<String, GenotypeLikelihoods> GLs) {
|
||||
double pD = 0.0;
|
||||
for ( GenotypeLikelihoods GL : GLs.values() ) {
|
||||
double sum = 0.0;
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
sum += Math.pow(10, GL.getPosterior(g));
|
||||
}
|
||||
pD += Math.log10(sum);
|
||||
}
|
||||
return pD;
|
||||
}
|
||||
|
||||
private double compute_pNull(char ref, HashMap<String, GenotypeLikelihoods> GLs) {
|
||||
// compute null likelihoods
|
||||
double[] alleleLikelihoods = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
alleleLikelihoods[i] = 1e-6/3.0;
|
||||
alleleLikelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6;
|
||||
DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleLikelihoods);
|
||||
|
||||
HashMap<String, GenotypeLikelihoods> GL_null = new HashMap<String, GenotypeLikelihoods>();
|
||||
try {
|
||||
for ( String sample : GLs.keySet() ) {
|
||||
GenotypeLikelihoods GL = (GenotypeLikelihoods)GLs.get(sample).clone();
|
||||
GL.setPriors(AFPriors);
|
||||
GL_null.put(sample, GL);
|
||||
}
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new StingException("Clone() not supported for given GenotypeLikelihoods subclass?");
|
||||
}
|
||||
|
||||
return compute_pD(GL_null);
|
||||
}
|
||||
}
|
||||
|
|
@ -26,29 +26,23 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Sep 29, 2009
|
||||
* Time: 12:07:43 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotypeModel", shortName = "gm", doc = "Genotype calculation model to employ -- EM is the default choice; the ALL_MAFS model is experimental", required = false)
|
||||
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM;
|
||||
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development", required = false)
|
||||
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
||||
|
||||
@Argument(fullName = "baseModel", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
|
||||
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
@Argument(fullName = "pooled", shortName = "pooled", doc = "Does the input bam represent pooled data (so that genotypes can't be called)?", required = false)
|
||||
public boolean POOLED = false;
|
||||
|
||||
|
||||
// control the output
|
||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
||||
|
|
@ -65,10 +59,10 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
||||
|
||||
@Argument(fullName = "maxDeletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
|
||||
@Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
|
||||
public Integer MAX_DELETIONS = 1;
|
||||
|
||||
@Argument(fullName = "maxCoverage", shortName = "maxCoverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
|
||||
@Argument(fullName = "max_coverage", shortName = "coverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
|
||||
public Integer MAX_READS_IN_PILEUP = 10000;
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,58 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
||||
/**
|
||||
A wrapper around the GenotypeLikelihoods class for the UnifiedGenotyper.
|
||||
This class incorporates both strand-based likelihoods and a combined likelihoods over both strands.
|
||||
*/
|
||||
|
||||
public class UnifiedGenotypeLikelihoods {
|
||||
|
||||
private GenotypeLikelihoods forwardStrandGL;
|
||||
private GenotypeLikelihoods reverseStrandGL;
|
||||
private GenotypeLikelihoods combinedGL;
|
||||
|
||||
public UnifiedGenotypeLikelihoods(BaseMismatchModel baseModel, DiploidGenotypePriors priors, EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform, boolean VERBOSE) {
|
||||
forwardStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
forwardStrandGL.setVerbose(VERBOSE);
|
||||
reverseStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
reverseStrandGL.setVerbose(VERBOSE);
|
||||
combinedGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
combinedGL.setVerbose(VERBOSE);
|
||||
}
|
||||
|
||||
public GenotypeLikelihoods getForwardStrandGenotypeLikelihoods() { return forwardStrandGL; }
|
||||
public GenotypeLikelihoods getReverseStrandGenotypeLikelihoods() { return reverseStrandGL; }
|
||||
public GenotypeLikelihoods getGenotypeLikelihoods() { return combinedGL; }
|
||||
|
||||
public void add(ReadBackedPileup pileup) {
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
int offset = pileup.getOffsets().get(i);
|
||||
// ignore deletions
|
||||
if ( offset == -1 )
|
||||
continue;
|
||||
|
||||
SAMRecord read = pileup.getReads().get(i);
|
||||
char base = read.getReadString().charAt(offset);
|
||||
add(base, read, offset);
|
||||
}
|
||||
}
|
||||
|
||||
public void add(char base, SAMRecord read, int offset) {
|
||||
if ( !read.getReadNegativeStrandFlag() )
|
||||
forwardStrandGL.add(base, read.getBaseQualities()[offset], read, offset);
|
||||
else
|
||||
reverseStrandGL.add(base, read.getBaseQualities()[offset], read, offset);
|
||||
combinedGL.add(base, read.getBaseQualities()[offset], read, offset);
|
||||
}
|
||||
|
||||
public void setPriors(DiploidGenotypePriors priors) {
|
||||
forwardStrandGL.setPriors(priors);
|
||||
reverseStrandGL.setPriors(priors);
|
||||
combinedGL.setPriors(priors);
|
||||
}
|
||||
}
|
||||
|
|
@ -55,8 +55,8 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
|
|||
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
|
||||
public File VARIANTS_FILE = null;
|
||||
|
||||
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false)
|
||||
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used; default is VCF", required = false)
|
||||
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;
|
||||
|
||||
|
||||
// the model used for calculating genotypes
|
||||
|
|
@ -79,10 +79,10 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
|
|||
return (BaseUtils.simpleBaseToBaseIndex(ref) != -1 && context.getReads().size() != 0);
|
||||
}
|
||||
|
||||
/** Enable deletions in the pileup */
|
||||
/** Enable deletions in the pileup **/
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
/** Initialize the walker with some sensible defaults */
|
||||
/** Initialize the samples, output, and genotype calculation model **/
|
||||
public void initialize() {
|
||||
|
||||
// get all of the unique sample names
|
||||
|
|
@ -104,7 +104,7 @@ public class UnifiedGenotyper extends LocusWalker<List<GenotypeCall>, Integer> {
|
|||
writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper",
|
||||
this.getToolkit().getArguments().referenceFile.getName());
|
||||
|
||||
gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer, logger);
|
||||
gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, writer, logger, UAC);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue