From b99e27bf9b71f52b5b9c9b2cfdff5816859939c0 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 13 Mar 2011 00:07:51 +0000 Subject: [PATCH] In the process of optimizing ProduceBeagleInputWalker, discovered that the GenotypeLikelihoods, the UG, and Genotype objects were using old-style GL tags internally, and then converting from Likelihoods -> GL String -> Likelihoods -> PL String throughout the GATK. It was both painful and led to convoluted code throughout the system. Removed everything but GL conversion -> PL in the GenotypeLikelihoods objects, and now all of the codes in UG now immediately provides GenotypeLikelihoods to the Genotype objects, which is converted straight to PL now. Resulted in a 30% speed up in ProduceBeagleLikelihoods, passes integration tests without any modifications, and likely speeds up writing any VCFs with likelihoods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5432 348d0f76-0448-11de-a6fe-93d51630548a --- .../beagle/ProduceBeagleInputWalker.java | 17 ++++++++---- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../genotyper/UnifiedGenotyperEngine.java | 26 ++++--------------- .../sting/utils/vcf/VCFUtils.java | 10 ++----- 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 7965eefb1..202227249 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -49,6 +49,8 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.File; import java.io.PrintStream; +import java.text.DecimalFormat; +import java.text.MessageFormat; import java.util.*; /** @@ -56,7 +58,6 @@ import java.util.*; */ @Requires(value={},referenceMetaData=@RMD(name=ProduceBeagleInputWalker.ROD_NAME, type=VariantContext.class)) public class ProduceBeagleInputWalker extends RodWalker { - public static final String ROD_NAME = "variant"; public static final String VALIDATION_ROD_NAME = "validation"; @@ -72,6 +73,11 @@ public class ProduceBeagleInputWalker extends RodWalker { @Argument(doc="VQSqual key", shortName = "vqskey", required=false) protected String VQSLOD_KEY = "VQSqual"; + @Hidden + @Argument(doc="REMOVE ME", shortName = "rcwpio", required=false) + private static final boolean REMAIN_COMPATIBLE_WITH_PREVIOUS_IO = false; + + // @Hidden // @Argument(doc="Include filtered records", shortName = "ifr", fullName = "IncludeFilteredRecords", required=false) // protected boolean includeFilteredRecords = false; @@ -267,15 +273,16 @@ public class ProduceBeagleInputWalker extends RodWalker { if ( VQSRCalibrator != null ) log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods); - double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); + double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); // see if we need to randomly mask out genotype in this position. // todo -- remove me after testing - if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS ) - for (double likeVal: normalizedLog10Likelihoods) + if ( REMAIN_COMPATIBLE_WITH_PREVIOUS_IO && (log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS) ) + for (double likeVal: normalizedLikelihoods) out.append(String.format("%.2f ",likeVal)); else - for (double likeVal: normalizedLog10Likelihoods) + for (double likeVal: normalizedLikelihoods) { out.append(String.format("%5.4f ",likeVal)); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 1b362f9eb..e2caf87c8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -157,7 +157,7 @@ public class UnifiedGenotyper extends LocusWalker stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); if ( stratifiedContexts == null ) return null; - VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse); - return GLsToPLs(vc); + return calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, alternateAlleleToUse); } private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type, Allele alternateAlleleToUse) { @@ -253,9 +250,10 @@ public class UnifiedGenotyperEngine { } HashMap attributes = new HashMap(); - GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); + //GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); + GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods()); attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); - attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods); + attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false)); } @@ -274,20 +272,6 @@ public class UnifiedGenotyperEngine { null); } - private static VariantContext GLsToPLs(VariantContext vc) { - if ( vc == null ) - return null; - - HashMap calls = new HashMap(); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - HashMap attributes = new HashMap(genotype.getValue().getAttributes()); - attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector())); - calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes)); - } - return VariantContext.modifyGenotypes(vc, calls); - } - /** * Compute genotypes at a given locus. * diff --git a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java index 82ae0273f..7955fdb10 100755 --- a/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/vcf/VCFUtils.java @@ -187,18 +187,12 @@ public class VCFUtils { * return a set of supported format lines; what we currently support for output in the genotype fields of a VCF * @return a set of VCF format lines */ - public static Set getSupportedHeaderStrings(String glType) { + public static Set getSupportedHeaderStrings() { Set result = new HashSet(); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - - if ( glType == VCFConstants.GENOTYPE_LIKELIHOODS_KEY ) - result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - else if ( glType == VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY ) - result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); - else - throw new ReviewedStingException("Unexpected GL type " + glType); + result.add(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); return result; }