From 4da0d1300c76bdd8427090d5818ea03adce5538e Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 29 Oct 2015 16:01:47 -0400 Subject: [PATCH] adding fraction informative reads annotation. --- .../walkers/variantutils/GenotypeGVCFs.java | 27 ++-- .../GenotypeGVCFsIntegrationTest.java | 13 +- .../annotator/FractionInformativeReads.java | 115 ++++++++++++++++++ .../annotator/VariantAnnotatorEngine.java | 20 +-- .../gatk/utils/variant/GATKVCFConstants.java | 1 + .../utils/variant/GATKVCFHeaderLines.java | 7 +- 6 files changed, 158 insertions(+), 25 deletions(-) create mode 100644 public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FractionInformativeReads.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 3bff97484..c05d72445 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -55,29 +55,30 @@ import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.utils.genotyper.SampleList; -import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.genotyper.*; +import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; @@ -372,10 +373,10 @@ public class GenotypeGVCFs extends RodWalkerThe FractionInformativeReads annotation produces a single fraction for each site: sum(AD)/sum(DP). The sum in the numerator + * is over all the samples in the cohort and all the alleles in each sample. The sum in the denominator is over all the samples. + * + * + *

Caveats

+ * + * + *

Related annotations

+ * + */ + +public class FractionInformativeReads extends InfoFieldAnnotation implements ReducibleAnnotation { + @Override + public String getRawKeyName() { + return null; + } + + @Override + public List getKeyNames() { + return Collections.singletonList(GATKVCFConstants.FRACTION_INFORMATIVE_READS_KEY); + } + + @Override + public Map annotateRawData(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc, Map stratifiedPerReadAlleleLikelihoodMap) { + return null; + } + + @Override + public Map combineRawData(List allelesList, List listOfRawData) { + return null; + } + + @Override + public Map finalizeRawData(VariantContext vc, VariantContext originalVC) { + + int totalAD = 0; + for (final Genotype gt : vc.getGenotypes()){ + if(gt != null) { + if(gt.hasAD()) { + totalAD += MathUtils.sum(gt.getAD()); + continue; + } + // this is needed since the finalizing of HOM_REF genotypes comes after the finalizing of annotations. so the AD field is null at this point. + // TODO: this will become unneeded if the above statement is false in which case it can be safely removed. + if(gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) { + totalAD += Integer.parseInt((String) gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)); + } + } + } + final int depth = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + return Collections.singletonMap(GATKVCFConstants.FRACTION_INFORMATIVE_READS_KEY, (Object) (depth != 0 ? totalAD / (double) depth : 0)); + } + + @Override + public void calculateRawData(VariantContext vc, Map pralm, ReducibleAnnotationData rawAnnotations) { + + } + + @Override + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc, Map stratifiedPerReadAlleleLikelihoodMap) { + return null; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index 4af58114a..bddc020c9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -31,15 +31,15 @@ import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.*; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.util.*; @@ -351,13 +351,15 @@ public class VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( final InfoFieldAnnotation annotationType : requestedReducibleInfoAnnotations ) { + ReducibleAnnotation currentASannotation = (ReducibleAnnotation)annotationType; - final Map annotationsFromCurrentType = currentASannotation.finalizeRawData(vc, originalVC); - if ( annotationsFromCurrentType != null ) { - infoAnnotations.putAll(annotationsFromCurrentType); - //clean up raw annotation data after annotations are finalized - infoAnnotations.remove(currentASannotation.getRawKeyName()); - } + + final Map annotationsFromCurrentType = currentASannotation.finalizeRawData(vc, originalVC); + if ( annotationsFromCurrentType != null ) { + infoAnnotations.putAll(annotationsFromCurrentType); + //clean up raw annotation data after annotations are finalized + infoAnnotations.remove(currentASannotation.getRawKeyName()); + } } // generate a new annotated VC diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 4947d6195..ff04c2971 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -61,6 +61,7 @@ public final class GATKVCFConstants { public static final String EVENT_DISTANCE_MIN_KEY = "MIN_ED"; //M2 public static final String FISHER_STRAND_KEY = "FS"; public static final String AS_FISHER_STRAND_KEY = "AS_FS"; + public static final String FRACTION_INFORMATIVE_READS_KEY = "FractionInformativeReads"; public static final String AS_SB_TABLE_KEY = "AS_SB_TABLE"; public static final String GC_CONTENT_KEY = "GC"; public static final String GQ_MEAN_KEY = "GQ_MEAN"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index 70b7108c8..f0fa0724c 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -27,9 +27,10 @@ package org.broadinstitute.gatk.utils.variant; import htsjdk.variant.vcf.*; -import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*; +import java.util.HashMap; +import java.util.Map; -import java.util.*; +import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*; /** * This class contains the VCFHeaderLine definitions for the annotation keys in GATKVCFConstants. @@ -84,6 +85,7 @@ public class GATKVCFHeaderLines { addFormatLine(new VCFFormatHeaderLine(PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT.")); addFormatLine(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_ID_KEY, 1, VCFHeaderLineType.String, "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group")); addFormatLine(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_GT_KEY, 1, VCFHeaderLineType.String, "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another")); + addFormatLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_KEY, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); addFormatLine(new VCFFormatHeaderLine(REFERENCE_GENOTYPE_QUALITY, 1, VCFHeaderLineType.Integer, "Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)")); addFormatLine(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_KEY, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); @@ -141,6 +143,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(RAW_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Raw data for Mapping Quality Rank Sum")); addInfoLine(new VCFInfoHeaderLine(AS_RAW_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "Allele-specfic raw data for Mapping Quality Rank Sum")); addInfoLine(new VCFInfoHeaderLine(AS_MAP_QUAL_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific Mapping Quality Rank Sum")); + addInfoLine(new VCFInfoHeaderLine(FRACTION_INFORMATIVE_READS_KEY, 1, VCFHeaderLineType.Float, "The fraction of informative reads out of the total reads")); addInfoLine(new VCFInfoHeaderLine(MENDEL_VIOLATION_LR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); addInfoLine(new VCFInfoHeaderLine(HI_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "High confidence possible de novo mutation (GQ >= 20 for all trio members)=[comma-delimited list of child samples]"));