The previous version of the UG was always creating BAQ'd pileups for the underlying site QUAL calculation. This resulted in some slowdown in the code. But as far as I can tell, the code actually didn't apply the BAQ'd base quality anywhere when the BAQ field wasn't in the read, so this just saves us 20% of the runtime when BAQ isn't enabled from heading into the BAQ subsystem when we don't actually want to get the BAQ'd base qualities.

Fixed minor problem with WalkerTest for "" (for parameterization) md5s. Added an explicit integrationtest for BAQ NONE Now only creates the BAQ'd pileup, if the useBAQPileup parameter is provide in initializeAlternateAllele. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5891 348d0f76-0448-11de-a6fe-93d51630548a
2011-05-27 14:00:52 +00:00 · 2011-05-27 14:00:52 +00:00 · 8ed82e5a08
parent 136c8c7900
commit 8ed82e5a08
4 changed files with 77 additions and 50 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
@ -101,12 +101,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC

            if ( !vc.isBiallelic() ) {
                // for multi-allelic sites go back to the reads and find the most likely alternate allele
-                initializeBestAlternateAllele(refBase, contexts);
+                initializeBestAlternateAllele(refBase, contexts, useBAQedPileup);
            } else {
                bestAlternateAllele = vc.getAlternateAllele(0).getBases()[0];
            }
        } else {
-            initializeBestAlternateAllele(refBase, contexts);
+            initializeBestAlternateAllele(refBase, contexts, useBAQedPileup);
        }

        // if there are no non-ref bases...
@ -148,12 +148,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
        return refAllele;
    }

-    protected void initializeBestAlternateAllele(byte ref, Map<String, AlignmentContext> contexts) {
+    protected void initializeBestAlternateAllele(byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
        int[] qualCounts = new int[4];

        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
            // calculate the sum of quality scores for each base
-            ReadBackedPileup pileup = createBAQedPileup( sample.getValue().getBasePileup() );
+            ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup();
            for ( PileupElement p : pileup ) {
                // ignore deletions
                if ( p.isDeletion() || p.getQual() < UAC.MIN_BASE_QUALTY_SCORE )
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -37,14 +37,15 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.baq.BAQ;
 import org.broadinstitute.sting.utils.pileup.*;
 import org.broad.tribble.vcf.VCFConstants;
+import com.google.java.contract.*;

 import java.io.PrintStream;
 import java.util.*;

 public class UnifiedGenotyperEngine {
-
    public static final String LOW_QUAL_FILTER_NAME = "LowQual";

    public enum OUTPUT_MODE {
@ -54,10 +55,10 @@ public class UnifiedGenotyperEngine {
    }

    // the unified argument collection
-    private UnifiedArgumentCollection UAC = null;
+    private final UnifiedArgumentCollection UAC;

    // the annotation engine
-    private VariantAnnotatorEngine annotationEngine;
+    private final VariantAnnotatorEngine annotationEngine;

    // the model used for calculating genotypes
    private ThreadLocal<Map<GenotypeLikelihoodsCalculationModel.Model, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<GenotypeLikelihoodsCalculationModel.Model, GenotypeLikelihoodsCalculationModel>>();
@ -66,49 +67,52 @@ public class UnifiedGenotyperEngine {
    private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();

    // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
-    private double[] log10AlleleFrequencyPriorsSNPs;
-    private double[] log10AlleleFrequencyPriorsIndels;
+    private final double[] log10AlleleFrequencyPriorsSNPs;
+    private final double[] log10AlleleFrequencyPriorsIndels;

    // the allele frequency likelihoods (allocated once as an optimization)
    private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();

    // the priors object
-    private GenotypePriors genotypePriorsSNPs;
-    private GenotypePriors genotypePriorsIndels;
+    private final GenotypePriors genotypePriorsSNPs;
+    private final GenotypePriors genotypePriorsIndels;

    // samples in input
-    private Set<String> samples = new TreeSet<String>();
+    private final Set<String> samples;

    // the various loggers and writers
-    private Logger logger = null;
-    private PrintStream verboseWriter = null;
+    private final Logger logger;
+    private final PrintStream verboseWriter;

    // number of chromosomes (2 * samples) in input
-    private int N;
+    private final int N;

    // the standard filter to use for calls below the confidence threshold but above the emit threshold
    private static final Set<String> filter = new HashSet<String>(1);

+    private final boolean BAQEnabledOnCMDLine;

+
+
+    // ---------------------------------------------------------------------------------------------------------
+    //
+    // Public interface functions
+    //
+    // ---------------------------------------------------------------------------------------------------------
+    @Requires({"toolkit != null", "UAC != null"})
    public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
-        // get the number of samples
-        // if we're supposed to assume a single sample, do so
-        samples = new TreeSet<String>();
-        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
-            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
-        else
-            samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
-
-        final Logger logger = Logger.getLogger(UnifiedGenotyperEngine.class);
-        initialize(UAC, logger, null, null, samples.size());
+        this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null,
+                // get the number of samples
+                // if we're supposed to assume a single sample, do so
+                UAC.ASSUME_SINGLE_SAMPLE != null ?
+                        new TreeSet<String>(Arrays.asList(UAC.ASSUME_SINGLE_SAMPLE)) :
+                        SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()));
    }

+    @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"})
    public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
+        this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF;
        this.samples = new TreeSet<String>(samples);
-        initialize(UAC, logger, verboseWriter, engine, samples.size());
-    }
-
-    private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
        // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ
        this.UAC = UAC.clone();
        this.UAC.MIN_MAPPING_QUALTY_SCORE = Math.max(UAC.MIN_MAPPING_QUALTY_SCORE, UAC.MIN_BASE_QUALTY_SCORE);
@ -117,7 +121,7 @@ public class UnifiedGenotyperEngine {
        this.verboseWriter = verboseWriter;
        this.annotationEngine = engine;

-        N = 2 * numSamples;
+        N = 2 * this.samples.size();
        log10AlleleFrequencyPriorsSNPs = new double[N+1];
        log10AlleleFrequencyPriorsIndels = new double[N+1];
        computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP);
@ -178,6 +182,33 @@ public class UnifiedGenotyperEngine {
        return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model);
    }

+    /**
+     * Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants.
+     *
+     * @param tracker    the meta data tracker
+     * @param refContext the reference base
+     * @param rawContext contextual information around the locus
+     * @param vc         the GL-annotated variant context
+     * @return the VariantCallContext object
+     */
+    public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
+        final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext );
+        if( model == null ) {
+            return null;
+        }
+        Map<String, AlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model);
+        return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model);
+    }
+
+
+
+
+    // ---------------------------------------------------------------------------------------------------------
+    //
+    // Private implementation helpers
+    //
+    // ---------------------------------------------------------------------------------------------------------
+
    // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine
    private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, AlignmentContext> stratifiedContexts, AlignmentContextUtils.ReadOrientation type, Allele alternateAlleleToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) {

@ -188,7 +219,7 @@ public class UnifiedGenotyperEngine {

        Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();

-        Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup);
+        Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);

        if ( refAllele != null )
            return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
@ -268,24 +299,6 @@ public class UnifiedGenotyperEngine {
                null);
    }

-    /**
-     * Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants.
-     *
-     * @param tracker    the meta data tracker
-     * @param refContext the reference base
-     * @param rawContext contextual information around the locus
-     * @param vc         the GL-annotated variant context
-     * @return the VariantCallContext object
-     */
-    public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
-        final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext );
-        if( model == null ) {
-            return null;
-        }
-        Map<String, AlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model);
-        return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model);
-    }
-
    // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine
    private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) {

--- a/java/test/org/broadinstitute/sting/WalkerTest.java
+++ b/java/test/org/broadinstitute/sting/WalkerTest.java
@ -211,7 +211,7 @@ public class WalkerTest extends BaseTest {
            if ( md5 == null )
                throw new IllegalArgumentException("Null MD5 found in test " + name);
            if ( md5.equals("") ) // ok
-                ;
+                continue;
            if ( ! StringUtils.isAlphanumeric(md5) )
                throw new IllegalArgumentException("MD5 contains non-alphanumeric characters test " + name + " md5=" + md5);
            if ( md5.length() != exampleMD5.length() )
--- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@ -234,6 +234,20 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
        executeTest(String.format("test calling with BAQ"), spec);
    }

+    @Test
+    public void testCallingWithBAQOff() {
+        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                baseCommand +
+                        " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" +
+                        " -o %s" +
+                        " -L 1:10,000,000-10,100,000" +
+                        " -baq OFF",
+                1,
+                Arrays.asList("f9d92a81294569b9c918848932c1a0ca"));
+
+        executeTest(String.format("test calling with BAQ OFF"), spec);
+    }
+
    // --------------------------------------------------------------------------------------------------------------
    //
    // testing indel caller