From f04e51c6c2b74a79644e9473230410a8ba85fe92 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 15:38:56 -0400
Subject: [PATCH 1/8] Adding docs from Andrey since his repo was all screwed
up.
---
.../indels/SomaticIndelDetectorWalker.java | 143 ++++++++++++------
1 file changed, 94 insertions(+), 49 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index e5ad3106d..8bba8eac2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.*;
+
/**
+ * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
+ * but this latter functionality is now superseded by UnifiedGenotyper.
+ *
+ *
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
- * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
- * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
- * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
- * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
- * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
- * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
- * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
+ * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
+ * include additional statistics such as mismatches and base qualities around the calls, read strandness (how many
+ * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
+ * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
+ * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
+ * metrics for the post-processing tools to make the final decision). The calls are performed by default
+ * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
+ * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
+ * respectively. Indels are called from the tumor sample and annotated as germline
+ * if even weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
+ * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
+ * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
+ * in tumor sample, it will not be reported.
*
- * If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
- * please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.
+ * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
+ * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
+ * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
+ *
Input
+ *
+ * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
+ *
+ *
+ * Output
+ *
+ * Indel calls with associated metrics.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T SomaticIndelDetector \
+ * -o indels.vcf \
+ * -verbose indels.txt \
+ * -I:normal normal.bam \
+ * -I:tumor tumor.bam
+ *
*
*/
+
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
public class SomaticIndelDetectorWalker extends ReadWalker {
// @Output
// PrintStream out;
- @Output(doc="File to which variants should be written",required=true)
+ @Output(doc="File to write variants (indels) in VCF format",required=true)
protected VCFWriter vcf_writer = null;
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
@@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
@Hidden
@Argument(fullName = "genotype_intervals", shortName = "genotype",
- doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
+ doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
public String genotypeIntervalsFile = null;
@Hidden
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
- doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
- "if the list turns out to be unsorted, it will throw an exception. "+
- "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
- "to sort and keep it in memory (increases memory usage!).")
+ doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
+ "if the list turns out to be unsorted, it will throw an exception. "+
+ "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
+ "to sort and keep it in memory (increases memory usage!).")
protected boolean GENOTYPE_NOT_SORTED = false;
@Hidden
- @Argument(fullName="unpaired", shortName="unpaired",
- doc="Perform unpaired calls (no somatic status detection)", required=false)
+ @Argument(fullName="unpaired", shortName="unpaired",
+ doc="Perform unpaired calls (no somatic status detection)", required=false)
boolean call_unpaired = false;
- boolean call_somatic ;
+ boolean call_somatic ;
- @Argument(fullName="verboseOutput", shortName="verbose",
- doc="Verbose output file in text format", required=false)
- java.io.File verboseOutput = null;
+ @Argument(fullName="verboseOutput", shortName="verbose",
+ doc="Verbose output file in text format", required=false)
+ java.io.File verboseOutput = null;
@Argument(fullName="bedOutput", shortName="bed",
- doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
+ doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
java.io.File bedOutput = null;
- @Argument(fullName="minCoverage", shortName="minCoverage",
- doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
- int minCoverage = 6;
+ @Argument(fullName="minCoverage", shortName="minCoverage",
+ doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
+ "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
+ int minCoverage = 6;
- @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
- doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
- int minNormalCoverage = 4;
+ @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
+ doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
+ "or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
+ int minNormalCoverage = 4;
- @Argument(fullName="minFraction", shortName="minFraction",
- doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
- " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
- double minFraction = 0.3;
+ @Argument(fullName="minFraction", shortName="minFraction",
+ doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
+ " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
+ double minFraction = 0.3;
- @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
- doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
- double minConsensusFraction = 0.7;
+ @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
+ doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
+ "all indel observations at the site exceeds this threshold", required=false)
+ double minConsensusFraction = 0.7;
- @Argument(fullName="minIndelCount", shortName="minCnt",
- doc="Minimum count of reads supporting consensus indel required for making the call. "+
- " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
- "(minIndelCount not met) will not pass.", required=false)
- int minIndelCount = 0;
+ @Argument(fullName="minIndelCount", shortName="minCnt",
+ doc="Minimum count of reads supporting consensus indel required for making the call. "+
+ " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
+ "(minIndelCount not met) will not pass.", required=false)
+ int minIndelCount = 0;
- @Argument(fullName="refseq", shortName="refseq",
- doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
- String RefseqFileName = null;
+ @Argument(fullName="refseq", shortName="refseq",
+ doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
+ "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
+ String RefseqFileName = null;
- @Argument(fullName="blacklistedLanes", shortName="BL",
- doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
- "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
- PlatformUnitFilterHelper dummy;
- @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
+//@Argument(fullName="blacklistedLanes", shortName="BL",
+// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
+// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
+//PlatformUnitFilterHelper dummy;
+
+ @Hidden
+ @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
+ required=false) Boolean DEBUG = false;
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
- "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
+ "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
+ "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
+ "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
+ "if window is too small",required=false) int WINDOW_SIZE = 200;
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
+
+
private WindowContext tumor_context;
private WindowContext normal_context;
private int currentContigIndex = -1;
From fe474b77f85f325ed20d6cb6c50dc298d024d03e Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:05:39 -0400
Subject: [PATCH 2/8] Updating docs so printing looks nicer
---
.../gatk/walkers/variantutils/VariantValidationAssessor.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
index b98646270..ea8549474 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
@@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
- * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
+ * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes)
*
*
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
From 4ef6a4598c3704fd5aac5f5302a148ddfedd3958 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:10:34 -0400
Subject: [PATCH 3/8] Updating docs to include output
---
.../walkers/varianteval/VariantEvalWalker.java | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 266b97af0..28f4f2a56 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -56,6 +56,22 @@ import java.util.*;
* Output
*
* Evaluation tables detailing the results of the eval modules which were applied.
+ * For example:
+ *
+ * output.eval.gatkreport:
+ * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
+ * CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
+ * CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
+ * CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
+ * CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
+ * CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
+ * CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
+ * CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
+ * CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
+ * CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
+ * CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
+ * ...
+ *
*
*
* Examples
From 6d02a34bfba1537f294f5a077b24702e539b87a5 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:17:54 -0400
Subject: [PATCH 4/8] Updating docs to include output
---
.../variantutils/VariantValidationAssessor.java | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
index ea8549474..8eaf976d0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java
@@ -57,7 +57,16 @@ import java.util.*;
*
* Output
*
- * An annotated VCF.
+ * An annotated VCF. Additionally, a table like the following will be output:
+ *
+ * Total number of samples assayed: 185
+ * Total number of records processed: 152
+ * Number of Hardy-Weinberg violations: 34 (22%)
+ * Number of no-call violations: 12 (7%)
+ * Number of homozygous variant violations: 0 (0%)
+ * Number of records passing all filters: 106 (69%)
+ * Number of passing records that are polymorphic: 98 (92%)
+ *
*
*
* Examples
From fd1831b4a520e68b15b6b5b958aa2d04ade4e287 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:25:03 -0400
Subject: [PATCH 5/8] Updating docs to include more details
---
.../gatk/walkers/fasta/FastaAlternateReferenceWalker.java | 6 ++++--
.../sting/gatk/walkers/fasta/FastaReferenceWalker.java | 3 +++
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
index fd912334f..4e2c17bf6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
@@ -43,8 +43,10 @@ import java.util.List;
* Generates an alternative reference sequence over the specified interval.
*
*
- * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
- * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
+ * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
+ * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
+ * Note that if there are multiple variants at a site, it takes the first one seen.
+ * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
*
*
Input
*
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
index 5f3b37cc8..7ae5c5c75 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
@@ -42,6 +42,9 @@ import java.io.PrintStream;
*
*
* The output format can be partially controlled using the provided command-line arguments.
+ * Specify intervals with the usual -L argument to output only the reference bases within your intervals.
+ * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
+ * separate fasta sequence (named numerically in order).
*
*
Input
*
From 2f58fdb369a3cd4857281dd210427fac6352ca88 Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Thu, 15 Sep 2011 16:26:11 -0400
Subject: [PATCH 6/8] Adding expected output doc to CountCovariates
---
.../recalibration/CountCovariatesWalker.java | 36 +++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
index 98c8950e3..1bdb70bdd 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
@@ -76,6 +76,42 @@ import java.util.Map;
* Output
*
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
+ * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
+ *
+ * The first 20 lines of such a file are shown below.
+ * * The file begins with a series of comment lines describing:
+ * ** The number of counted loci
+ * ** The number of counted bases
+ * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
+ *
+ * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
+ *
+ * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
+ * depending on which covariates were specified at runtime. The last three items are the data, that is, the number of observations for this combination of covariates, the number of
+ * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ *
+ *
+ * # Counted Sites 19451059
+ * # Counted Bases 56582018
+ * # Skipped Sites 82666
+ * # Fraction Skipped 1 / 235 bp
+ * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+ * SRR006446,11,65,CA,9,1,10
+ * SRR006446,11,48,TA,10,0,40
+ * SRR006446,11,67,AA,27,0,40
+ * SRR006446,11,61,GA,11,1,10
+ * SRR006446,12,34,CA,47,1,17
+ * SRR006446,12,30,GA,52,1,17
+ * SRR006446,12,36,AA,352,1,25
+ * SRR006446,12,17,TA,182,11,12
+ * SRR006446,11,48,TG,2,0,40
+ * SRR006446,11,67,AG,1,0,40
+ * SRR006446,12,34,CG,9,0,40
+ * SRR006446,12,30,GG,43,0,40
+ * ERR001876,4,31,AG,1,0,40
+ * ERR001876,4,31,AT,2,2,1
+ * ERR001876,4,31,CA,1,0,40
+ *
*
*
* Examples
From 9dc6354130b23683c31a7b2c1ef8c2ed94da1946 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Thu, 15 Sep 2011 16:55:24 -0400
Subject: [PATCH 7/8] Oops didn't mean to touch this test before
---
.../gatk/walkers/varianteval/VariantEvalIntegrationTest.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index d8f7ad3b6..99622cbf6 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -42,7 +42,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-T VariantEval",
"-R " + b37KGReference,
"--dbsnp " + b37dbSNP132,
- "--eval " + variantEvalTestDataRoot + "CEU.trio.callsForVE.vcf",
+ "--eval " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf",
"-noEV",
"-EV TiTvVariantEvaluator",
"-ST Sample",
From d78e00e5b2cd5e8a1b1aa75209100b039e521442 Mon Sep 17 00:00:00 2001
From: David Roazen
Date: Thu, 15 Sep 2011 16:09:07 -0400
Subject: [PATCH 8/8] Renaming VariantAnnotator SnpEff keys
This is to head off potential confusion with the output from the SnpEff tool itself,
which also uses a key named EFF.
---
.../sting/gatk/walkers/annotator/SnpEff.java | 90 ++++++++++---------
.../stratifications/FunctionalClass.java | 4 +-
.../VariantAnnotatorIntegrationTest.java | 2 +-
.../VariantEvalIntegrationTest.java | 2 +-
4 files changed, 53 insertions(+), 45 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index bb3685fb5..4ead77506 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -68,23 +68,31 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
// Key names for the INFO field annotations we will add to each record, along
// with parsing-related information:
public enum InfoFieldKey {
- EFF (-1),
- EFF_IMPACT (0),
- EFF_CODON_CHANGE (1),
- EFF_AMINO_ACID_CHANGE (2),
- EFF_GENE_NAME (3),
- EFF_GENE_BIOTYPE (4),
- EFF_TRANSCRIPT_ID (6),
- EFF_EXON_ID (7);
+ EFFECT_KEY ("SNPEFF_EFFECT", -1),
+ IMPACT_KEY ("SNPEFF_IMPACT", 0),
+ CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
+ AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
+ GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
+ GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
+ TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
+ EXON_ID_KEY ("SNPEFF_EXON_ID", 7);
+
+ // Actual text of the key
+ private final String keyName;
// Index within the effect metadata subfields from the SnpEff EFF annotation
// where each key's associated value can be found during parsing.
private final int fieldIndex;
- InfoFieldKey ( int fieldIndex ) {
+ InfoFieldKey ( String keyName, int fieldIndex ) {
+ this.keyName = keyName;
this.fieldIndex = fieldIndex;
}
+ public String getKeyName() {
+ return keyName;
+ }
+
public int getFieldIndex() {
return fieldIndex;
}
@@ -292,27 +300,27 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
}
public List getKeyNames() {
- return Arrays.asList( InfoFieldKey.EFF.toString(),
- InfoFieldKey.EFF_IMPACT.toString(),
- InfoFieldKey.EFF_CODON_CHANGE.toString(),
- InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(),
- InfoFieldKey.EFF_GENE_NAME.toString(),
- InfoFieldKey.EFF_GENE_BIOTYPE.toString(),
- InfoFieldKey.EFF_TRANSCRIPT_ID.toString(),
- InfoFieldKey.EFF_EXON_ID.toString()
+ return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
+ InfoFieldKey.IMPACT_KEY.getKeyName(),
+ InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
+ InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
+ InfoFieldKey.GENE_NAME_KEY.getKeyName(),
+ InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
+ InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
+ InfoFieldKey.EXON_ID_KEY.getKeyName()
);
}
public List getDescriptions() {
return Arrays.asList(
- new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
- new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
+ new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
+ new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
+ new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
);
}
@@ -375,16 +383,16 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
}
try {
- impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]);
+ impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
}
catch ( IllegalArgumentException e ) {
- parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]));
+ parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
}
- codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()];
- aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()];
- geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()];
- geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()];
+ codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
+ aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
+ geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
+ geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
try {
@@ -398,8 +406,8 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
coding = EffectCoding.UNKNOWN;
}
- transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()];
- exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()];
+ transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
+ exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
}
private void parseError ( String message ) {
@@ -443,14 +451,14 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
public Map getAnnotations() {
Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length));
- addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString());
- addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString());
- addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange);
- addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange);
- addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName);
- addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype);
- addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID);
- addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID);
+ addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
+ addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
+ addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
+ addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
+ addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
+ addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
+ addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
+ addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
return annotations;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
index a32857ffc..88ffcaaeb 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
@@ -62,8 +62,8 @@ public class FunctionalClass extends VariantStratifier {
annotationId++;
} while (eval.hasAttribute(key));
- } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFF.name() ) ) {
- SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFF.name()).toString());
+ } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) {
+ SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString());
if ( snpEffType == SnpEff.EffectType.STOP_GAINED )
type = FunctionalType.nonsense;
else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING )
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index f902ce276..08baae7a7 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
"snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000",
1,
- Arrays.asList("a1c3ba9efc28ee0606339604095076ea")
+ Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9")
);
executeTest("Testing SnpEff annotations", spec);
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index 99622cbf6..b90e6d0ff 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -32,7 +32,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
1,
Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12")
);
- executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec);
+ executeTest("testFunctionClassWithSnpeff", spec);
}
@Test