Implementing reference confidence estimate in UGv2 as per UGv1

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4542 348d0f76-0448-11de-a6fe-93d51630548a
2010-10-21 16:57:59 +00:00 · 2010-10-21 16:57:59 +00:00 · 225cf49128
parent cf9c9ae241
commit 225cf49128
2 changed files with 24 additions and 28 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -79,6 +79,9 @@ public class UnifiedGenotyperEngine {
    // the priors object
    private GenotypePriors genotypePriors;

+    // samples in input
+    private Set<String> samples = new TreeSet<String>();
+
    // the various loggers and writers
    private Logger logger = null;
    private PrintStream verboseWriter = null;
@ -98,16 +101,18 @@ public class UnifiedGenotyperEngine {
    public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
        // get the number of samples
        // if we're supposed to assume a single sample, do so
-        int numSamples;
+        samples = new TreeSet<String>();
        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
-            numSamples = 1;
+            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
        else
-            numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size();
-        initialize(toolkit, UAC, null, null, null, numSamples);
+            samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
+
+        initialize(toolkit, UAC, null, null, null, samples.size());
    }

-    public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
-        initialize(toolkit, UAC, logger, verboseWriter, engine, numSamples);
+    public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
+        this.samples = new TreeSet<String>(samples);
+        initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size());
    }

    private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
@ -153,7 +158,7 @@ public class UnifiedGenotyperEngine {

        // estimate our confidence in a reference call and return
        if ( GLs.size() == 0 )
-            return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false);
+            return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);

        // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
        clearAFarray(log10AlleleFrequencyPosteriors.get());
@ -194,7 +199,7 @@ public class UnifiedGenotyperEngine {
        if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
            // technically, at this point our confidence in a reference call isn't accurately estimated
            //  because it didn't take into account samples with no data, so let's get a better estimate
-            return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true);
+            return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF);
        }

        // create the genotypes
@ -356,26 +361,19 @@ public class UnifiedGenotyperEngine {
            AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
    }

-    private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
+    private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {

-        // TODO: implement me
-
-        double P_of_ref = 1.0;
-
-        // use the AF=0 prob if it's calculated
-        //if ( ignoreCoveredSamples )
-        //    P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];
+        double P_of_ref = initialPofRef;

        // for each sample that we haven't examined yet
-        //for ( String sample : samples ) {
-        //    boolean isCovered = contexts.containsKey(sample);
-        //    if ( ignoreCoveredSamples && isCovered )
-        //        continue;
+        for ( String sample : samples ) {
+            boolean isCovered = contexts.containsKey(sample);
+            if ( ignoreCoveredSamples && isCovered )
+                continue;

-        P_of_ref = 0.5;
-        //    int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
-        //    P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
-        //}
+            int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
+            P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
+        }

        return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java
@ -78,9 +78,6 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
    // the annotation engine
    private VariantAnnotatorEngine annotationEngine;

-    // samples in input
-    private Set<String> samples = new TreeSet<String>();
-
    // enable deletions in the pileup
    public boolean includeReadsWithDeletionAtLoci() { return true; }

@ -118,6 +115,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
    public void initialize() {
        // get all of the unique sample names
        // if we're supposed to assume a single sample, do so
+        Set<String> samples = new TreeSet<String>();
        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
        else
@ -128,7 +126,7 @@ public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedG
            verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");

        annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
-        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples.size());
+        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);

        // initialize the header
        writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;