From aecaa6d38e4115450ed15f73cf1895e116a4a9e8 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Mon, 9 Nov 2015 12:48:40 -0500 Subject: [PATCH] Allow GenotypeGVCFs to emit ref sites. --- .../walkers/genotyper/GenotypingEngine.java | 12 ++- .../genotyper/UnifiedGenotypingEngine.java | 3 +- .../afcalc/DiploidExactAFCalculator.java | 3 +- .../genotyper/afcalc/ExactAFCalculator.java | 15 ++- .../walkers/variantutils/GenotypeGVCFs.java | 97 ++++++++++--------- .../GenotypeGVCFsIntegrationTest.java | 17 ++-- .../SelectVariantsIntegrationTest.java | 4 +- .../variant/GATKVariantContextUtils.java | 2 +- 8 files changed, 87 insertions(+), 66 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 91c27cdd0..451b49ab5 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -244,8 +244,11 @@ public abstract class GenotypingEngine genotypeLikelihoods = getGLs(vc.getGenotypes(), true); + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true, vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java index 7089cbb77..9a888f934 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java @@ -125,11 +125,22 @@ abstract class ExactAFCalculator extends AFCalculator { } /** - * Unpack GenotypesContext into arraylist of doubel values + * Unpack GenotypesContext into arraylist of double values * @param GLs Input genotype context * @return ArrayList of doubles corresponding to GL vectors */ protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { + return getGLs(GLs, includeDummy, false); + } + + /** + * Unpack GenotypesContext into arraylist of double values + * @param GLs Input genotype context + * @param keepUninformative Don't filter out uninformative genotype likelihoods (i.e. all log likelihoods near 0) + * This is useful for VariantContexts with a NON_REF allele + * @return ArrayList of doubles corresponding to GL vectors + */ + protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy, final boolean keepUninformative) { final ArrayList genotypeLikelihoods = new ArrayList<>(GLs.size() + 1); if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy @@ -137,7 +148,7 @@ abstract class ExactAFCalculator extends AFCalculator { if ( sample.hasLikelihoods() ) { final double[] gls = sample.getLikelihoods().getAsVector(); - if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL || keepUninformative ) genotypeLikelihoods.add(gls); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 681bb07c3..feb2fbb2e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -68,6 +68,7 @@ import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode; import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider; @@ -132,7 +133,6 @@ import java.util.*; @Reference(window=@Window(start=-10,stop=10)) @SuppressWarnings("unused") public class GenotypeGVCFs extends RodWalker implements AnnotatorCompatible, TreeReducible { - /** * The gVCF files to merge together */ @@ -212,15 +212,10 @@ public class GenotypeGVCFs extends RodWalker vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants); - final GATKVariantContextUtils.GenotypeMergeType mergeType; - if(uniquifySamples) { - mergeType = GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY; - } - else - mergeType = GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE; - + final GATKVariantContextUtils.GenotypeMergeType mergeType = uniquifySamples ? + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY : GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE; final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, mergeType)); - // create the annotation engine + annotationEngine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, Collections.emptyList(), this, toolkit); // create the genotyping engine @@ -255,15 +250,18 @@ public class GenotypeGVCFs extends RodWalker vcsAtThisLocus = tracker.getPrioritizedValue(variants, loc); + final Byte refBase = INCLUDE_NON_VARIANTS ? ref.getBase() : null; + final boolean removeNonRefSymbolicAllele = !INCLUDE_NON_VARIANTS; + final VariantContext combinedVC = ReferenceConfidenceVariantContextMerger.merge(vcsAtThisLocus, loc, + refBase, removeNonRefSymbolicAllele, uniquifySamples, annotationEngine); + return combinedVC == null ? null : regenotypeVC(tracker, ref, combinedVC); } /** @@ -275,65 +273,67 @@ public class GenotypeGVCFs extends RodWalker 0 ) { + result = genotypingEngine.calculateGenotypes(originalVC); + } + + if (result == null || (!isProperlyPolymorphic(result) && !INCLUDE_NON_VARIANTS)) { + return null; + } + + result = addGenotypingAnnotations(originalVC.getAttributes(), result); //At this point we should already have DP and AD annotated - VariantContext result = annotationEngine.finalizeAnnotations(rawResult, originalVC); + result = annotationEngine.finalizeAnnotations(result, originalVC); //do trimming after allele-specific annotation reduction or the mapping is difficult result = GATKVariantContextUtils.reverseTrimAlleles(result); - // if it turned monomorphic then we either need to ignore or fix such sites - boolean createRefGTs = false; - if ( result.isMonomorphicInSamples() ) { - if ( !INCLUDE_NON_VARIANTS ) - return null; - createRefGTs = true; - } // Re-annotate and fix/remove some of the original annotations. // Note that the order of these actions matters and is different for polymorphic and monomorphic sites. // For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later. // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine. // We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes. - if ( createRefGTs ) { + if (result.isPolymorphicInSamples()) { + result = annotationEngine.annotateContext(tracker, ref, null, result); + result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make(); + } else if (INCLUDE_NON_VARIANTS) { result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make(); result = annotationEngine.annotateContext(tracker, ref, null, result); } else { - result = annotationEngine.annotateContext(tracker, ref, null, result); - result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make(); + return null; } - return result; } /** - * Determines whether the provided VariantContext has real alternate alleles + * Determines whether the provided VariantContext has real alternate alleles. + * + * There is a bit of a hack to handle the case because it is not defined in htsjdk.Allele + * We check for this as a biallelic symbolic allele. * * @param vc the VariantContext to evaluate * @return true if it has proper alternate alleles, false otherwise */ private boolean isProperlyPolymorphic(final VariantContext vc) { - return ( vc != null && - !vc.getAlternateAlleles().isEmpty() && - (!vc.isBiallelic() || - (!vc.getAlternateAllele(0).equals(Allele.SPAN_DEL) && - !vc.getAlternateAllele(0).equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED)) - ) - ); + //obvious cases + if (vc == null || vc.getAlternateAlleles().isEmpty()) { + return false; + } else if (vc.isBiallelic()) { + return !(vc.getAlternateAllele(0).equals(Allele.SPAN_DEL) || + vc.getAlternateAllele(0).equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED) || + vc.isSymbolic()); + } else { + return true; + } } /** @@ -435,6 +435,9 @@ public class GenotypeGVCFs extends RodWalker