From 8ed82e5a08006f20c0e93b5112943433f5811304 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 27 May 2011 14:00:52 +0000 Subject: [PATCH] The previous version of the UG was always creating BAQ'd pileups for the underlying site QUAL calculation. This resulted in some slowdown in the code. But as far as I can tell, the code actually didn't apply the BAQ'd base quality anywhere when the BAQ field wasn't in the read, so this just saves us 20% of the runtime when BAQ isn't enabled from heading into the BAQ subsystem when we don't actually want to get the BAQ'd base qualities. Fixed minor problem with WalkerTest for "" (for parameterization) md5s. Added an explicit integrationtest for BAQ NONE Now only creates the BAQ'd pileup, if the useBAQPileup parameter is provide in initializeAlternateAllele. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5891 348d0f76-0448-11de-a6fe-93d51630548a --- ...NPGenotypeLikelihoodsCalculationModel.java | 8 +- .../genotyper/UnifiedGenotyperEngine.java | 103 ++++++++++-------- .../org/broadinstitute/sting/WalkerTest.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 14 +++ 4 files changed, 77 insertions(+), 50 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 50015fd95..bdccf4dd8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -101,12 +101,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( !vc.isBiallelic() ) { // for multi-allelic sites go back to the reads and find the most likely alternate allele - initializeBestAlternateAllele(refBase, contexts); + initializeBestAlternateAllele(refBase, contexts, useBAQedPileup); } else { bestAlternateAllele = vc.getAlternateAllele(0).getBases()[0]; } } else { - initializeBestAlternateAllele(refBase, contexts); + initializeBestAlternateAllele(refBase, contexts, useBAQedPileup); } // if there are no non-ref bases... @@ -148,12 +148,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return refAllele; } - protected void initializeBestAlternateAllele(byte ref, Map contexts) { + protected void initializeBestAlternateAllele(byte ref, Map contexts, boolean useBAQedPileup) { int[] qualCounts = new int[4]; for ( Map.Entry sample : contexts.entrySet() ) { // calculate the sum of quality scores for each base - ReadBackedPileup pileup = createBAQedPileup( sample.getValue().getBasePileup() ); + ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup(); for ( PileupElement p : pileup ) { // ignore deletions if ( p.isDeletion() || p.getQual() < UAC.MIN_BASE_QUALTY_SCORE ) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 06b4aacd2..8d0c7eaf6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -37,14 +37,15 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.pileup.*; import org.broad.tribble.vcf.VCFConstants; +import com.google.java.contract.*; import java.io.PrintStream; import java.util.*; public class UnifiedGenotyperEngine { - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; public enum OUTPUT_MODE { @@ -54,10 +55,10 @@ public class UnifiedGenotyperEngine { } // the unified argument collection - private UnifiedArgumentCollection UAC = null; + private final UnifiedArgumentCollection UAC; // the annotation engine - private VariantAnnotatorEngine annotationEngine; + private final VariantAnnotatorEngine annotationEngine; // the model used for calculating genotypes private ThreadLocal> glcm = new ThreadLocal>(); @@ -66,49 +67,52 @@ public class UnifiedGenotyperEngine { private ThreadLocal afcm = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private double[] log10AlleleFrequencyPriorsSNPs; - private double[] log10AlleleFrequencyPriorsIndels; + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] log10AlleleFrequencyPriorsIndels; // the allele frequency likelihoods (allocated once as an optimization) private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); // the priors object - private GenotypePriors genotypePriorsSNPs; - private GenotypePriors genotypePriorsIndels; + private final GenotypePriors genotypePriorsSNPs; + private final GenotypePriors genotypePriorsIndels; // samples in input - private Set samples = new TreeSet(); + private final Set samples; // the various loggers and writers - private Logger logger = null; - private PrintStream verboseWriter = null; + private final Logger logger; + private final PrintStream verboseWriter; // number of chromosomes (2 * samples) in input - private int N; + private final int N; // the standard filter to use for calls below the confidence threshold but above the emit threshold private static final Set filter = new HashSet(1); + private final boolean BAQEnabledOnCMDLine; + + + // --------------------------------------------------------------------------------------------------------- + // + // Public interface functions + // + // --------------------------------------------------------------------------------------------------------- + @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - // get the number of samples - // if we're supposed to assume a single sample, do so - samples = new TreeSet(); - if ( UAC.ASSUME_SINGLE_SAMPLE != null ) - samples.add(UAC.ASSUME_SINGLE_SAMPLE); - else - samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); - - final Logger logger = Logger.getLogger(UnifiedGenotyperEngine.class); - initialize(UAC, logger, null, null, samples.size()); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, + // get the number of samples + // if we're supposed to assume a single sample, do so + UAC.ASSUME_SINGLE_SAMPLE != null ? + new TreeSet(Arrays.asList(UAC.ASSUME_SINGLE_SAMPLE)) : + SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader())); } + @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples) { + this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; this.samples = new TreeSet(samples); - initialize(UAC, logger, verboseWriter, engine, samples.size()); - } - - private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ this.UAC = UAC.clone(); this.UAC.MIN_MAPPING_QUALTY_SCORE = Math.max(UAC.MIN_MAPPING_QUALTY_SCORE, UAC.MIN_BASE_QUALTY_SCORE); @@ -117,7 +121,7 @@ public class UnifiedGenotyperEngine { this.verboseWriter = verboseWriter; this.annotationEngine = engine; - N = 2 * numSamples; + N = 2 * this.samples.size(); log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1]; computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP); @@ -178,6 +182,33 @@ public class UnifiedGenotyperEngine { return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); } + /** + * Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { + final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext ); + if( model == null ) { + return null; + } + Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); + } + + + + + // --------------------------------------------------------------------------------------------------------- + // + // Private implementation helpers + // + // --------------------------------------------------------------------------------------------------------- + // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, AlignmentContextUtils.ReadOrientation type, Allele alternateAlleleToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) { @@ -188,7 +219,7 @@ public class UnifiedGenotyperEngine { Map GLs = new HashMap(); - Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup); + Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); if ( refAllele != null ) return createVariantContextFromLikelihoods(refContext, refAllele, GLs); @@ -268,24 +299,6 @@ public class UnifiedGenotyperEngine { null); } - /** - * Compute genotypes at a given locus. Entry point for engine calls from UGCallVariants. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext ); - if( model == null ) { - return null; - } - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); - } - // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { diff --git a/java/test/org/broadinstitute/sting/WalkerTest.java b/java/test/org/broadinstitute/sting/WalkerTest.java index 1b8b4206a..822034720 100755 --- a/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/java/test/org/broadinstitute/sting/WalkerTest.java @@ -211,7 +211,7 @@ public class WalkerTest extends BaseTest { if ( md5 == null ) throw new IllegalArgumentException("Null MD5 found in test " + name); if ( md5.equals("") ) // ok - ; + continue; if ( ! StringUtils.isAlphanumeric(md5) ) throw new IllegalArgumentException("MD5 contains non-alphanumeric characters test " + name + " md5=" + md5); if ( md5.length() != exampleMD5.length() ) diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 082f23c69..fa2e9c687 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -234,6 +234,20 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest(String.format("test calling with BAQ"), spec); } + @Test + public void testCallingWithBAQOff() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + + " -o %s" + + " -L 1:10,000,000-10,100,000" + + " -baq OFF", + 1, + Arrays.asList("f9d92a81294569b9c918848932c1a0ca")); + + executeTest(String.format("test calling with BAQ OFF"), spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing indel caller