Don't carry around an extra copy of the code for the Haplotype Caller

This commit is contained in:
Eric Banks 2011-12-09 11:09:40 -05:00
parent 442ceb6ad9
commit 64dad13e2d
2 changed files with 31 additions and 107 deletions

View File

@ -588,12 +588,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyLikelihoods likelihoods
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
public GenotypesContext assignGenotypes(VariantContext vc,
double[][] log10AlleleFrequencyPosteriors,
double[][] log10AlleleFrequencyLikelihoods,
int AFofMaxLikelihood) {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());

View File

@ -97,6 +97,7 @@ public class UnifiedGenotyperEngine {
// the standard filter to use for calls below the confidence threshold but above the emit threshold
private static final Set<String> filter = new HashSet<String>(1);
private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine;
@ -114,6 +115,7 @@ public class UnifiedGenotyperEngine {
@Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"})
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF;
genomeLocParser = toolkit.getGenomeLocParser();
this.samples = new TreeSet<String>(samples);
// note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ
this.UAC = UAC.clone();
@ -290,8 +292,13 @@ public class UnifiedGenotyperEngine {
return new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles).genotypes(genotypes).referenceBaseForIndel(refContext.getBase()).make();
}
// private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
return calculateGenotypes(null, null, null, null, vc, model);
}
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {
boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
@ -307,10 +314,13 @@ public class UnifiedGenotyperEngine {
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 )
if ( vc.getNSamples() == 0 ) {
if ( limitedContext )
return null;
return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ?
estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), false, 1.0) :
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyLikelihoods.get());
@ -349,25 +359,31 @@ public class UnifiedGenotyperEngine {
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
}
// strip out the alternate allele(s) if we're making a ref call
Set<Allele> myAlleles = new HashSet<Allele>(vc.getAlleles());
if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(1);
myAlleles.add(vc.getReference());
}
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyLikelihoods.get(), bestAFguess);
// print out stats if we have a writer
if ( verboseWriter != null )
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
// if the site was downsampled, record that fact
if ( rawContext.hasPileupBeenDownsampled() )
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) {
if ( UAC.COMPUTE_SLOD && !limitedContext && bestAFguess != 0 ) {
//final boolean DEBUG_SLOD = false;
// the overall lod
@ -412,26 +428,18 @@ public class UnifiedGenotyperEngine {
attributes.put("SB", strandScore);
}
GenomeLoc loc = refContext.getLocus();
GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
Set<Allele> myAlleles = new HashSet<Allele>(vc.getAlleles());
// strip out the alternate allele if it's a ref call
if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(1);
myAlleles.add(vc.getReference());
}
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
builder.genotypes(genotypes);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
builder.attributes(attributes);
builder.referenceBaseForIndel(refContext.getBase());
if ( !limitedContext )
builder.referenceBaseForIndel(refContext.getBase());
VariantContext vcCall = builder.make();
if ( annotationEngine != null ) {
if ( annotationEngine != null && !limitedContext ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
ReadBackedPileup pileup = null;
if (rawContext.hasExtendedEventPileup())
@ -446,91 +454,6 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
// A barebones entry point to the exact model when there is no tracker or stratified contexts available -- only GLs
public VariantCallContext calculateGenotypes(final VariantContext vc, final GenomeLoc loc, final GenotypeLikelihoodsCalculationModel.Model model) {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) {
logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument");
return null;
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 )
return null;
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyLikelihoods.get());
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[0]);
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[0]);
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
double phredScaledConfidence;
if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0];
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += log10AlleleFrequencyPosteriors.get()[0][i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return null;
}
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyLikelihoods.get(), bestAFguess);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
Set<Allele> myAlleles = new HashSet<Allele>(vc.getAlleles());
// strip out the alternate allele if it's a ref call
if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(1);
myAlleles.add(vc.getReference());
}
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles);
builder.genotypes(genotypes);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
builder.attributes(attributes);
builder.referenceBaseForIndel(vc.getReferenceBaseForIndel());
return new VariantCallContext(builder.make(), confidentlyCalled(phredScaledConfidence, PofF));
}
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
// TODO - temp fix until we can deal with extended events properly
// for indels, stop location is one more than ref allele length