Implementing reference confidence estimate in UGv2 as per UGv1

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4542 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-10-21 16:57:59 +00:00
parent cf9c9ae241
commit 225cf49128
2 changed files with 24 additions and 28 deletions

View File

@ -79,6 +79,9 @@ public class UnifiedGenotyperEngine {
// the priors object
private GenotypePriors genotypePriors;
// samples in input
private Set<String> samples = new TreeSet<String>();
// the various loggers and writers
private Logger logger = null;
private PrintStream verboseWriter = null;
@ -98,16 +101,18 @@ public class UnifiedGenotyperEngine {
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
// get the number of samples
// if we're supposed to assume a single sample, do so
int numSamples;
samples = new TreeSet<String>();
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
numSamples = 1;
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size();
initialize(toolkit, UAC, null, null, null, numSamples);
samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
initialize(toolkit, UAC, null, null, null, samples.size());
}
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
initialize(toolkit, UAC, logger, verboseWriter, engine, numSamples);
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
this.samples = new TreeSet<String>(samples);
initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size());
}
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
@ -153,7 +158,7 @@ public class UnifiedGenotyperEngine {
// estimate our confidence in a reference call and return
if ( GLs.size() == 0 )
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false);
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyPosteriors.get());
@ -194,7 +199,7 @@ public class UnifiedGenotyperEngine {
if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true);
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF);
}
// create the genotypes
@ -356,26 +361,19 @@ public class UnifiedGenotyperEngine {
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
// TODO: implement me
double P_of_ref = 1.0;
// use the AF=0 prob if it's calculated
//if ( ignoreCoveredSamples )
// P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];
double P_of_ref = initialPofRef;
// for each sample that we haven't examined yet
//for ( String sample : samples ) {
// boolean isCovered = contexts.containsKey(sample);
// if ( ignoreCoveredSamples && isCovered )
// continue;
for ( String sample : samples ) {
boolean isCovered = contexts.containsKey(sample);
if ( ignoreCoveredSamples && isCovered )
continue;
P_of_ref = 0.5;
// int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
// P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
//}
int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
}
return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}

View File

@ -78,9 +78,6 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// samples in input
private Set<String> samples = new TreeSet<String>();
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
@ -118,6 +115,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
public void initialize() {
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
Set<String> samples = new TreeSet<String>();
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
@ -128,7 +126,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples.size());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
// initialize the header
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;