From cbce3e3c83c72a8c7dff7b8fec00f00a2b419e83 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 28 Oct 2010 01:48:47 +0000 Subject: [PATCH] General support for both GL (log10) and PL (phred-scaled) genotype likelihoods. All walkers now use the Tribble GenotypeLikelihoods object for parsing VCFs with genotype likelihood fields. Please use GenotypeLikelihoods object from now on for seamless support for GL and PL tags. UGv2 now uses PL by default. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4589 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/beagle/ProduceBeagleInputWalker.java | 15 +++------------ .../gatk/walkers/genotyper/UnifiedGenotyper.java | 2 +- .../walkers/variantutils/LiftoverVariants.java | 10 ++++++++++ .../genotyper/ExactAFCalculationModel.java | 2 +- .../walkers/genotyper/GridSearchAFEstimation.java | 2 +- .../walkers/genotyper/UnifiedGenotyperV2.java | 2 +- .../org/broadinstitute/sting/utils/MathUtils.java | 3 ++- .../broadinstitute/sting/utils/vcf/VCFUtils.java | 13 +++++++++++-- 8 files changed, 30 insertions(+), 19 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index d64acdbac..7186e8036 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -200,17 +200,8 @@ public class ProduceBeagleInputWalker extends RodWalker { /** * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key */ - if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { - String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(","); - - double[] likeArray = new double[glArray.length]; - - // convert to double array so we can normalize - int k=0; - for (String gl : glArray) { - likeArray[k++] = Double.valueOf(gl); - } - + if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasLikelihoods()) { + double[] likeArray = genotype.getLikelihoods().getAsVector(); double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray); // see if we need to randomly mask out genotype in this position. Double d = generator.nextDouble(); @@ -234,7 +225,7 @@ public class ProduceBeagleInputWalker extends RodWalker { /** * otherwise, use the prior uniformly */ - else if (! isValidation && genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { + else if (! isValidation && genotype.isCalled() && ! genotype.hasLikelihoods() ) { // hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype // is confident. This is useful for Hapmap and 1KG release VCFs. double AA = (1.0-prior)/2.0; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 39dfadea5..03345b4a9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -143,7 +143,7 @@ public class UnifiedGenotyper extends LocusWalker { final Interval fromInterval = new Interval(vc.getChr(), vc.getStart(), vc.getStart(), false, String.format("%s:%d", vc.getChr(), vc.getStart())); final int length = vc.getEnd() - vc.getStart(); final Interval toInterval = liftOver.liftOver(fromInterval); + VariantContext originalVC = vc; if ( toInterval != null ) { // check whether the strand flips, and if so reverse complement everything @@ -95,6 +97,14 @@ public class LiftoverVariants extends RodWalker { } vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length)); + VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false); + + if ( VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { + logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", + originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), + originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0))); + } + writer.add(vc, ref.getBase()); successfulIntervals++; } else { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java index a3f42ec33..f31595f75 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -258,7 +258,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { attributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,String.format("%4.2f", 10*qual)); GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); - attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString()); + attributes.put(likelihoods.getKey(), likelihoods.getAsString()); calls.put(sample, new Genotype(sample, myAlleles, qual, null, attributes, false)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java index 5affc6da4..db3a86045 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java @@ -128,7 +128,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup())); GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); - attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString()); + attributes.put(likelihoods.getKey(), likelihoods.getAsString()); calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false)); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java index 8d9b53530..3d21a6b6e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java @@ -156,7 +156,7 @@ public class UnifiedGenotyperV2 extends LocusWalker getSupportedHeaderStrings() { + public static Set getSupportedHeaderStrings(String glType) { Set result = new HashSet(); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Genotype Quality")); result.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Read Depth (only filtered reads used for calling)")); - result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + + if ( glType == VCFConstants.GENOTYPE_LIKELIHOODS_KEY ) + result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + else if ( glType == VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY ) + result.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 3, VCFHeaderLineType.Float, "Normalized, Phred-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + else + throw new ReviewedStingException("Unexpected GL type " + glType); + return result; }