Merge pull request #982 from broadinstitute/kc_fp_analysis

added "artifact detection mode" for PON creation
This commit is contained in:
kcibul 2015-05-15 07:45:20 -04:00
commit 28a7ea43ec
2 changed files with 45 additions and 9 deletions

View File

@ -51,10 +51,14 @@ public final class GATKVCFConstants {
public static final String SPANNING_DELETIONS_KEY = "Dels";
public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants
public static final String DOWNSAMPLED_KEY = "DS";
public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNT"; //M2
public static final String EVENT_DISTANCE_MAX_KEY = "MAX_ED"; //M2
public static final String EVENT_DISTANCE_MIN_KEY = "MIN_ED"; //M2
public static final String FISHER_STRAND_KEY = "FS";
public static final String GC_CONTENT_KEY = "GC";
public static final String GQ_MEAN_KEY = "GQ_MEAN";
public static final String GQ_STDEV_KEY = "GQ_STDDEV";
public static final String HAPLOTYPE_COUNT_KEY = "HCNT"; //M2
public static final String HAPLOTYPE_SCORE_KEY = "HaplotypeScore";
public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo";
public static final String HOMOPOLYMER_RUN_KEY = "HRun";
@ -80,8 +84,10 @@ public final class GATKVCFConstants {
public static final String ORIGINAL_CONTIG_KEY = "OriginalChr"; //LiftoverVariants
public static final String ORIGINAL_START_KEY = "OriginalStart"; //LiftoverVariants
public static final String N_BASE_COUNT_KEY = "PercentNBase";
public static final String NORMAL_LOD_KEY = "NLOD"; //M2
public static final String RBP_INCONSISTENT_KEY = "PhasingInconsistent"; //ReadBackedPhasing
public static final String GENOTYPE_PRIOR_KEY = "PG";
public static final String PANEL_OF_NORMALS_COUNT_KEY = "PON"; //M2
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE";
public static final String QUAL_BY_DEPTH_KEY = "QD";
public static final String BEAGLE_R2_KEY = "R2"; //BeagleOutputToVCF
@ -93,11 +99,13 @@ public final class GATKVCFConstants {
public static final String STRAND_ODDS_RATIO_KEY = "SOR";
public static final String STR_PRESENT_KEY = "STR";
public static final String TRANSMISSION_DISEQUILIBRIUM_KEY = "TDT";
public static final String TUMOR_LOD_KEY = "TLOD"; //M2
public static final String VARIANT_TYPE_KEY = "VariantType";
public static final String VQS_LOD_KEY = "VQSLOD";
//FORMAT keys
public static final String ALLELE_BALANCE_KEY = "AB";
public static final String ALLELE_FRACTION_KEY = "AF"; //M2
public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
public static final String RBP_HAPLOTYPE_KEY = "HP"; //ReadBackedPhasing
public static final String AVG_INTERVAL_DP_BY_SAMPLE_KEY = "IDP"; //DiagnoseTargets
@ -110,6 +118,7 @@ public final class GATKVCFConstants {
public static final String HAPLOTYPE_CALLER_PHASING_GT_KEY = "PGT";
public static final String HAPLOTYPE_CALLER_PHASING_ID_KEY = "PID";
public static final String PHRED_SCALED_POSTERIORS_KEY = "PP"; //FamilyLikelihoodsUtils / PosteriorLikelihoodsUtils
public static final String QUALITY_SCORE_SUM_KEY = "QSS"; //M2
public static final String REFERENCE_GENOTYPE_QUALITY = "RGQ";
public static final String STRAND_COUNT_BY_SAMPLE_KEY = "SAC";
public static final String STRAND_BIAS_BY_SAMPLE_KEY = "SB";
@ -120,8 +129,16 @@ public final class GATKVCFConstants {
/* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic,
their names (or descriptions) depend on some threshold. Those filters are not included here
*/
// Fixed filter names, listed alphabetically by constant name; each constant is declared exactly once.
public static final String ALT_ALLELE_IN_NORMAL_FILTER_NAME = "alt_allele_in_normal"; //M2
public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC";
public static final String CLUSTERED_EVENTS_FILTER_NAME = "clustered_events"; //M2
public static final String GERMLINE_RISK_FILTER_NAME = "germline_risk"; //M2
public static final String HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME = "homologous_mapping_event"; //M2
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public static final String MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME = "multi_event_alt_allele_in_normal"; //M2
public static final String PON_FILTER_NAME = "panel_of_normals"; //M2
public static final String STR_CONTRACTION_FILTER_NAME = "str_contraction"; //M2
public static final String TUMOR_LOD_FILTER_NAME = "t_lod_fstar"; //M2
// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";

View File

@ -25,16 +25,11 @@
package org.broadinstitute.gatk.utils.variant;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.*;
import static org.broadinstitute.gatk.utils.variant.GATKVCFConstants.*;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
/**
* This class contains the VCFHeaderLine definitions for the annotation keys in GATKVCFConstants.
@ -66,6 +61,16 @@ public class GATKVCFHeaderLines {
addFilterLine(new VCFFilterHeaderLine(LOW_QUAL_FILTER_NAME, "Low quality"));
addFilterLine(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_NAME, "This site was set to monomorphic by Beagle"));
// M2-related filters. The constants are referenced unqualified via the static import of
// GATKVCFConstants.*, matching the neighbouring filter lines. Descriptions carry no
// leading/trailing whitespace, since the literal is passed as the header line's description.
addFilterLine(new VCFFilterHeaderLine(ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Evidence seen in the normal sample"));
addFilterLine(new VCFFilterHeaderLine(CLUSTERED_EVENTS_FILTER_NAME, "Clustered events observed in the tumor"));
addFilterLine(new VCFFilterHeaderLine(GERMLINE_RISK_FILTER_NAME, "Evidence indicates this site is germline, not somatic"));
addFilterLine(new VCFFilterHeaderLine(HOMOLOGOUS_MAPPING_EVENT_FILTER_NAME, "More than three events were observed in the tumor"));
addFilterLine(new VCFFilterHeaderLine(MULTI_EVENT_ALT_ALLELE_IN_NORMAL_FILTER_NAME, "Multiple events observed in tumor and normal"));
addFilterLine(new VCFFilterHeaderLine(PON_FILTER_NAME, "Seen in at least 2 samples in the panel of normals"));
addFilterLine(new VCFFilterHeaderLine(TUMOR_LOD_FILTER_NAME, "Tumor does not meet likelihood threshold"));
addFilterLine(new VCFFilterHeaderLine(STR_CONTRACTION_FILTER_NAME, "Site filtered due to contraction of short repeat region"));
addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype"));
addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample"));
addFormatLine(new VCFFormatHeaderLine(MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample"));
@ -89,6 +94,10 @@ public class GATKVCFHeaderLines {
addFormatLine(new VCFFormatHeaderLine(JOINT_POSTERIOR_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint posterior probability of the genotype combination (after applying family priors)"));
addFormatLine(new VCFFormatHeaderLine(ORIGINAL_GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Original Genotype input to Beagle"));
// M2-related format lines
addFormatLine(new VCFFormatHeaderLine(GATKVCFConstants.ALLELE_FRACTION_KEY, 1, VCFHeaderLineType.Float, "Allele fraction of the event in the tumor"));
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
@ -147,5 +156,15 @@ public class GATKVCFHeaderLines {
// Comparison-ROD INFO lines (Beagle).
// NOTE(review): the "Allele Frequency" line is typed Integer and the "Allele Number" line is
// typed Float, which looks swapped relative to the descriptions -- confirm against the key
// definitions and the code that writes these attributes before changing anything.
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AC_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AF_COMP_KEY, 1, VCFHeaderLineType.Integer, "Allele Frequency from Comparison ROD at this site"));
addInfoLine(new VCFInfoHeaderLine(BEAGLE_AN_COMP_KEY, 1, VCFHeaderLineType.Float, "Allele Number from Comparison ROD at this site"));

// M2-related INFO lines; constants resolve through the static import of GATKVCFConstants.*.
// NOTE(review): the event count, haplotype count, panel-of-normals count and both LOD scores
// are declared as String although their descriptions read as numeric values -- confirm the
// intended header types with the code that writes these attributes.
addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.String, "Number of events in this haplotype"));
addInfoLine(new VCFInfoHeaderLine(EVENT_DISTANCE_MAX_KEY, 1, VCFHeaderLineType.Integer, "Maximum distance between events in this active region"));
addInfoLine(new VCFInfoHeaderLine(EVENT_DISTANCE_MIN_KEY, 1, VCFHeaderLineType.Integer, "Minimum distance between events in this active region"));
addInfoLine(new VCFInfoHeaderLine(HAPLOTYPE_COUNT_KEY, 1, VCFHeaderLineType.String, "Number of haplotypes that support this variant"));
addInfoLine(new VCFInfoHeaderLine(NORMAL_LOD_KEY, 1, VCFHeaderLineType.String, "Normal LOD score"));
addInfoLine(new VCFInfoHeaderLine(PANEL_OF_NORMALS_COUNT_KEY, 1, VCFHeaderLineType.String, "Count from Panel of Normals"));
addInfoLine(new VCFInfoHeaderLine(TUMOR_LOD_KEY, 1, VCFHeaderLineType.String, "Tumor LOD score"));
}
}