From 3b1ee17727691d64235257fddf6a2968ce63a40e Mon Sep 17 00:00:00 2001 From: Kristian Cibulskis Date: Wed, 15 Apr 2015 22:23:54 -0400 Subject: [PATCH] added "artifact detection mode" for PON creation added "str_contraction" artifact filter (improves specificity, especially in exomes) refactored out VCF constants and added descriptions added "artifact detection mode" for PON creation added "str_contraction" artifact filter (improves specificity, especially in exomes) added new dream evaluation markdown added results for SMC 4 fixed up documentation, moved location to /dsde/working/mutect/dream_smc, and checked in scala script added "artifact detection mode" for PON creation added "str_contraction" artifact filter (improves specificity, especially in exomes) fixed bug which would overwrite germline_risk filter errors updated "how to" documents and records fixed license text thinned down FP regression test from 700 sites to 100. we have better ways (DREAM, NN) to check accuracy of the method and 100 is good enough to catch regressions why oh why do the MD5-based unit tests produce different results on different machine architectures? I hate that :/ Thanks to GG, LDG and DR -- test should now produce the same results regardless of machine architecture disabled downsampling... hopefully in the final attempt to make this work cross architecture! enforced LOGLESS_CACHING... hopefully in the final final attempt to make this work cross architecture! 
refactored out VCF constants and added descriptions --- .../gatk/utils/variant/GATKVCFConstants.java | 21 ++++++++++-- .../utils/variant/GATKVCFHeaderLines.java | 33 +++++++++++++++---- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index 95cf3e593..8b093d4d5 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -51,10 +51,14 @@ public final class GATKVCFConstants { public static final String SPANNING_DELETIONS_KEY = "Dels"; public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants public static final String DOWNSAMPLED_KEY = "DS"; + public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNT"; //M2 + public static final String EVENT_DISTANCE_MAX_KEY = "MAX_ED"; //M2 + public static final String EVENT_DISTANCE_MIN_KEY = "MIN_ED"; //M2 public static final String FISHER_STRAND_KEY = "FS"; public static final String GC_CONTENT_KEY = "GC"; public static final String GQ_MEAN_KEY = "GQ_MEAN"; public static final String GQ_STDEV_KEY = "GQ_STDDEV"; + public static final String HAPLOTYPE_COUNT_KEY = "HCNT"; //M2 public static final String HAPLOTYPE_SCORE_KEY = "HaplotypeScore"; public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo"; public static final String HOMOPOLYMER_RUN_KEY = "HRun"; @@ -80,8 +84,10 @@ public final class GATKVCFConstants { public static final String ORIGINAL_CONTIG_KEY = "OriginalChr"; //LiftoverVariants public static final String ORIGINAL_START_KEY = "OriginalStart"; //LiftoverVariants public static final String N_BASE_COUNT_KEY = "PercentNBase"; + public static final String NORMAL_LOD_KEY = "NLOD"; //M2 public static final String RBP_INCONSISTENT_KEY = "PhasingInconsistent"; 
//ReadBackedPhasing public static final String GENOTYPE_PRIOR_KEY = "PG"; + public static final String PANEL_OF_NORMALS_COUNT_KEY = "PON"; //M2 public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; public static final String QUAL_BY_DEPTH_KEY = "QD"; public static final String BEAGLE_R2_KEY = "R2"; //BeagleOutputToVCF @@ -93,11 +99,13 @@ public final class GATKVCFConstants { public static final String STRAND_ODDS_RATIO_KEY = "SOR"; public static final String STR_PRESENT_KEY = "STR"; public static final String TRANSMISSION_DISEQUILIBRIUM_KEY = "TDT"; + public static final String TUMOR_LOD_KEY = "TLOD"; //M2 public static final String VARIANT_TYPE_KEY = "VariantType"; public static final String VQS_LOD_KEY = "VQSLOD"; //FORMAT keys public static final String ALLELE_BALANCE_KEY = "AB"; + public static final String ALLELE_FRACTION_KEY = "AF"; //M2 public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; public static final String RBP_HAPLOTYPE_KEY = "HP"; //ReadBackedPhasing public static final String AVG_INTERVAL_DP_BY_SAMPLE_KEY = "IDP"; //DiagnoseTargets @@ -110,6 +118,7 @@ public final class GATKVCFConstants { public static final String HAPLOTYPE_CALLER_PHASING_GT_KEY = "PGT"; public static final String HAPLOTYPE_CALLER_PHASING_ID_KEY = "PID"; public static final String PHRED_SCALED_POSTERIORS_KEY = "PP"; //FamilyLikelihoodsUtils / PosteriorLikelihoodsUtils + public static final String QUALITY_SCORE_SUM_KEY = "QSS"; //M2 public static final String REFERENCE_GENOTYPE_QUALITY = "RGQ"; public static final String STRAND_COUNT_BY_SAMPLE_KEY = "SAC"; public static final String STRAND_BIAS_BY_SAMPLE_KEY = "SB"; @@ -120,8 +129,16 @@ public final class GATKVCFConstants { /* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic, their names (or descriptions) depend on some threshold. 
Those filters are not included here */ - public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC"; - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String ALT_ALLELE_IN_NORMAL_FILTER_NAME = "alt_allele_in_normal"; //M2 + public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC"; + public static final String CLUSTERED_EVENTS_FILTER_NAME = "clustered_events"; //M2 + public static final String GERMLINE_RISK_FILTER_NAME = "germline_risk"; //M2 + public static final String HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME = "homologous_mapping_event"; //M2 + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + public static final String MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME = "multi_event_alt_allele_in_normal"; //M2 + public static final String PON_FILTER_NAME = "panel_of_normals"; //M2 + public static final String STR_CONTRACTION_FILTER_NAME = "str_contraction"; //M2 + public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2 // Symbolic alleles public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index 89b9510d2..6dc7a2122 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -25,16 +25,11 @@ package org.broadinstitute.gatk.utils.variant; -import htsjdk.variant.vcf.VCFFilterHeaderLine; -import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.*; import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*; -import java.util.HashMap; -import 
java.util.Map; +import java.util.*; /** * This class contains the VCFHeaderLine definitions for the annotation keys in GATKVCFConstants. @@ -66,6 +61,16 @@ public class GATKVCFHeaderLines { addFilterLine(new VCFFilterHeaderLine(LOW_QUAL_FILTER_NAME, "Low quality")); addFilterLine(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_NAME, "This site was set to monomorphic by Beagle")); + // M2-related filters + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Evidence seen in the normal sample")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.CLUSTERED_EVENTS_FILTER_NAME, "Clustered events observed in the tumor ")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME, "Evidence indicates this site is germline, not somatic")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME, "More than three events were observed in the tumor")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Multiple events observed in tumor and normal")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.PON_FILTER_NAME, "Seen in at least 2 samples in the panel of normals")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold")); + addFilterLine(new VCFFilterHeaderLine(GATKVCFConstants.STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short repeat region")); + addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype")); addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); addFormatLine(new VCFFormatHeaderLine(MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the 
same order as listed, for each individual sample")); @@ -89,6 +94,10 @@ public class GATKVCFHeaderLines { addFormatLine(new VCFFormatHeaderLine(JOINT_POSTERIOR_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint posterior probability of the genotype combination (after applying family priors)")); addFormatLine(new VCFFormatHeaderLine(ORIGINAL_GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Original Genotype input to Beagle")); + // M2-related format lines + addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.ALLELE_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Allele fraction of the event in the tumor")); + + addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed")); addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed")); addInfoLine(new VCFInfoHeaderLine(DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); @@ -147,5 +156,15 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(BEAGLE_AC_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site")); addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site")); addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site")); + + // M2-related info lines + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MAX_KEY, 1, 
VCFHeaderLineType.Integer, "Maximum distance between events in this active region")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.HAPLOTYPE_COUNT_KEY, 1, VCFHeaderLineType.String, "Number of haplotypes that support this variant")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.NORMAL_LOD_KEY, 1, VCFHeaderLineType.String, "Normal LOD score")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.PANEL_OF_NORMALS_COUNT_KEY, 1, VCFHeaderLineType.String, "Count from Panel of Normals")); + addInfoLine(new VCFInfoHeaderLine(GATKVCFConstants.TUMOR_LOD_KEY, 1, VCFHeaderLineType.String, "Tumor LOD score")); + } }