diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java index 30b2f828d..8e241bb2c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * @version 0.1 */ public class PlatformFilter extends ReadFilter { - @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false) + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) protected String[] PLFilterNames; public boolean filterOut(SAMRecord rec) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index fdfac6bf7..4f072e88c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * -I input1.bam \ * -I input2.bam \ * --read_filter MappingQualityZero + * + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input.bam \ + * -n 2000 * * */ diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 14abbca5b..4ead77506 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -45,7 +45,7 @@ import java.util.*; * (http://snpeff.sourceforge.net/). * * For each variant, chooses one of the effects of highest biological impact from the SnpEff - * output file (which must be provided on the command line via --snpEffFile .vcf), + * output file (which must be provided on the command line via --snpEffFile filename.vcf), * and adds annotations on that effect. * * @author David Roazen @@ -68,23 +68,31 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio // Key names for the INFO field annotations we will add to each record, along // with parsing-related information: public enum InfoFieldKey { - EFF (-1), - EFF_IMPACT (0), - EFF_CODON_CHANGE (1), - EFF_AMINO_ACID_CHANGE (2), - EFF_GENE_NAME (3), - EFF_GENE_BIOTYPE (4), - EFF_TRANSCRIPT_ID (6), - EFF_EXON_ID (7); + EFFECT_KEY ("SNPEFF_EFFECT", -1), + IMPACT_KEY ("SNPEFF_IMPACT", 0), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), + EXON_ID_KEY ("SNPEFF_EXON_ID", 7); + + // Actual text of the key + private final String keyName; // Index within the effect metadata subfields from the SnpEff EFF annotation // where each key's associated value can be found during parsing. private final int fieldIndex; - InfoFieldKey ( int fieldIndex ) { + InfoFieldKey ( String keyName, int fieldIndex ) { + this.keyName = keyName; this.fieldIndex = fieldIndex; } + public String getKeyName() { + return keyName; + } + public int getFieldIndex() { return fieldIndex; } @@ -292,27 +300,27 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio } public List getKeyNames() { - return Arrays.asList( InfoFieldKey.EFF.toString(), - InfoFieldKey.EFF_IMPACT.toString(), - InfoFieldKey.EFF_CODON_CHANGE.toString(), - InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), - InfoFieldKey.EFF_GENE_NAME.toString(), - InfoFieldKey.EFF_GENE_BIOTYPE.toString(), - InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), - InfoFieldKey.EFF_EXON_ID.toString() + return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), + InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), + InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), + InfoFieldKey.GENE_NAME_KEY.getKeyName(), + InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), + InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), - new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") + new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), + new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -375,16 +383,16 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio } try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]); + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); } - codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()]; - aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()]; - geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()]; - geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()]; + codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; + aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; + geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; + geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { @@ -398,8 +406,8 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio coding = EffectCoding.UNKNOWN; } - transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()]; - exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()]; + transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()]; + exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()]; } private void parseError ( String message ) { @@ -443,14 +451,14 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio public Map getAnnotations() { Map annotations = new LinkedHashMap(Utils.optimumHashSize(InfoFieldKey.values().length)); - addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString()); - addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString()); - addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange); - addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange); - addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName); - addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype); - addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID); - addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID); + addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); + addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); + addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); + addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); + addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); + addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); + addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); return annotations; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 3a18fe610..86f97a36c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -69,14 +69,23 @@ import java.util.*; *

Output

*

* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + *

* - no suffix: per locus coverage + *

* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + *

* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + *

* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + *

* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + *

* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + *

* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + *

* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + *

* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases *

* diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index fd912334f..4e2c17bf6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -43,8 +43,10 @@ import java.util.List; * Generates an alternative reference sequence over the specified interval. * *

- * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). - * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'. + * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). + * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * Note that if there are multiple variants at a site, it takes the first one seen. + * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * *

Input

*

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java index 5f3b37cc8..7ae5c5c75 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java @@ -42,6 +42,9 @@ import java.io.PrintStream; * *

* The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). * *

Input

*

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index e5ad3106d..8bba8eac2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; import java.util.*; + /** + * Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well, + * but this latter functionality is now superceded by UnifiedGenotyper. + * + *

* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing - * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read - * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample, - * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified, - * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline - * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic - * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains - * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords. + * data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs + * include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many + * forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional + * statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will + * attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional + * metrics for the post-processing tools to make the final decision). The calls are performed by default + * from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I + * command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments, + * respectively. Indels are called from the tumor sample and annotated as germline + * if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic + * if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling + * is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold + * in tumor sample, it will not be reported. * - * If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you, - * please email asivache at broadinstitute dot org and he will gladly explain everything in more detail. + * To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input + * bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged + * on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups). * + *

Input

+ *

+ * Tumor and normal bam files (or single sample bam file(s) in --unpaired mode). + *

+ * + *

Output

+ *

+ * Indel calls with associated metrics. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T SomaticIndelDetector \
+ *   -o indels.vcf \
+ *   -verbose indels.txt
+ *   -I:normal normal.bam \
+ *   -I:tumor tumor.bam
+ * 
* */ + @ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class}) public class SomaticIndelDetectorWalker extends ReadWalker { // @Output // PrintStream out; - @Output(doc="File to which variants should be written",required=true) + @Output(doc="File to write variants (indels) in VCF format",required=true) protected VCFWriter vcf_writer = null; @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) @@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker { @Hidden @Argument(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) + doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) public String genotypeIntervalsFile = null; @Hidden @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. "+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") + doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ + "if the list turns out to be unsorted, it will throw an exception. "+ + "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ + "to sort and keep it in memory (increases memory usage!).") protected boolean GENOTYPE_NOT_SORTED = false; @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) + @Argument(fullName="unpaired", shortName="unpaired", + doc="Perform unpaired calls (no somatic status detection)", required=false) boolean call_unpaired = false; - boolean call_somatic ; + boolean call_somatic ; - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; + @Argument(fullName="verboseOutput", shortName="verbose", + doc="Verbose output file in text format", required=false) + java.io.File verboseOutput = null; @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) + doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) java.io.File bedOutput = null; - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false) - int minCoverage = 6; + @Argument(fullName="minCoverage", shortName="minCoverage", + doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+ + "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false) + int minCoverage = 6; - @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", - doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) - int minNormalCoverage = 4; + @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", + doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+ + "or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) + int minNormalCoverage = 4; - @Argument(fullName="minFraction", shortName="minFraction", - doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ - " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) - double minFraction = 0.3; + @Argument(fullName="minFraction", shortName="minFraction", + doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ + " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) + double minFraction = 0.3; - @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", - doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false) - double minConsensusFraction = 0.7; + @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", + doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+ + "all indel observations at the site exceeds this threshold", required=false) + double minConsensusFraction = 0.7; - @Argument(fullName="minIndelCount", shortName="minCnt", - doc="Minimum count of reads supporting consensus indel required for making the call. "+ - " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ - "(minIndelCount not met) will not pass.", required=false) - int minIndelCount = 0; + @Argument(fullName="minIndelCount", shortName="minCnt", + doc="Minimum count of reads supporting consensus indel required for making the call. "+ + " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ + "(minIndelCount not met) will not pass.", required=false) + int minIndelCount = 0; - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) - String RefseqFileName = null; + @Argument(fullName="refseq", shortName="refseq", + doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+ + "GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) + String RefseqFileName = null; - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ - "by this application, so they will not contribute indels to consider and will not be counted.", required=false) - PlatformUnitFilterHelper dummy; - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false; +//@Argument(fullName="blacklistedLanes", shortName="BL", +// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ +// "by this application, so they will not contribute indels to consider and will not be counted.", required=false) +//PlatformUnitFilterHelper dummy; + + @Hidden + @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on", + required=false) Boolean DEBUG = false; @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200; + "May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+ + "window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+ + "than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+ + "if window is too small",required=false) int WINDOW_SIZE = 200; @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; + + private WindowContext tumor_context; private WindowContext normal_context; private int currentContigIndex = -1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 98c8950e3..1bdb70bdd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -76,6 +76,42 @@ import java.util.Map; *

Output

*

* A recalibration table file in CSV format that is used by the TableRecalibration walker. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * + * The first 20 lines of such a file is shown below. + * * The file begins with a series of comment lines describing: + * ** The number of counted loci + * ** The number of counted bases + * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. + * + *

+ * # Counted Sites    19451059
+ * # Counted Bases    56582018
+ * # Skipped Sites    82666
+ * # Fraction Skipped 1 / 235 bp
+ * ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
+ * SRR006446,11,65,CA,9,1,10
+ * SRR006446,11,48,TA,10,0,40
+ * SRR006446,11,67,AA,27,0,40
+ * SRR006446,11,61,GA,11,1,10
+ * SRR006446,12,34,CA,47,1,17
+ * SRR006446,12,30,GA,52,1,17
+ * SRR006446,12,36,AA,352,1,25
+ * SRR006446,12,17,TA,182,11,12
+ * SRR006446,11,48,TG,2,0,40
+ * SRR006446,11,67,AG,1,0,40
+ * SRR006446,12,34,CG,9,0,40
+ * SRR006446,12,30,GG,43,0,40
+ * ERR001876,4,31,AG,1,0,40
+ * ERR001876,4,31,AT,2,2,1
+ * ERR001876,4,31,CA,1,0,40
+ * 
*

* *

Examples

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 01e8cd321..48cba6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -61,7 +61,7 @@ import java.util.List; * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC * * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: - * + *
  * Valid                     // amplicon is valid
  * SITE_IS_FILTERED=1        // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
  * VARIANT_TOO_NEAR_PROBE=1  // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
@@ -72,10 +72,10 @@ import java.util.List;
  * END_TOO_CLOSE,            // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
  * NO_VARIANTS_FOUND,        // no variants found within the amplicon region
  * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
- * 

+ *

* *

Examples

- *

+ * 
  *    java
  *      -jar GenomeAnalysisTK.jar
  *      -T ValidationAmplicons
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 266b97af0..28f4f2a56 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -56,6 +56,22 @@ import java.util.*;
  * 

Output

*

* Evaluation tables detailing the results of the eval modules which were applied. + * For example: + *

+ * output.eval.gatkreport:
+ * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
+ * CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
+ * CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
+ * CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
+ * CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
+ * CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
+ * CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
+ * CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
+ * CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
+ * ...
+ * 
*

* *

Examples

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 193a65591..88ffcaaeb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; @@ -11,12 +12,19 @@ import java.util.List; * Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation. */ public class FunctionalClass extends VariantStratifier { + + public enum FunctionalType { + silent, + missense, + nonsense + } + + @Override public void initialize() { states.add("all"); - states.add("silent"); - states.add("missense"); - states.add("nonsense"); + for ( FunctionalType type : FunctionalType.values() ) + states.add(type.name()); } @@ -26,10 +34,12 @@ public class FunctionalClass extends VariantStratifier { relevantStates.add("all"); if (eval != null && eval.isVariant()) { - String type = null; + FunctionalType type = null; if (eval.hasAttribute("refseq.functionalClass")) { - type = eval.getAttributeAsString("refseq.functionalClass"); + try { + type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass")); + } catch ( Exception e ) {} // don't error out if the type isn't supported } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); - - if ( newtype != null && !newtype.equalsIgnoreCase("null") && - ( type == null || - ( type.equals("silent") && !newtype.equals("silent") ) || - ( type.equals("missense") && newtype.equals("nonsense") ) ) - ) { - type = newtype; + String newtypeStr = eval.getAttributeAsString(key); + if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) { + try { + FunctionalType newType = FunctionalType.valueOf(newtypeStr); + if ( type == null || + ( type == FunctionalType.silent && newType != FunctionalType.silent ) || + ( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) { + type = newType; + } + } catch ( Exception e ) {} // don't error out if the type isn't supported } annotationId++; } while (eval.hasAttribute(key)); + + } else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) { + SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString()); + if ( snpEffType == SnpEff.EffectType.STOP_GAINED ) + type = FunctionalType.nonsense; + else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING ) + type = FunctionalType.missense; + else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING ) + type = FunctionalType.silent; } - if (type != null) { - if (type.equals("silent")) { relevantStates.add("silent"); } - else if (type.equals("missense")) { relevantStates.add("missense"); } - else if (type.equals("nonsense")) { relevantStates.add("nonsense"); } + if ( type != null ) { + relevantStates.add(type.name()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index b98646270..8eaf976d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.util.*; /** - * Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes) + * Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes) * *

* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes). @@ -57,7 +57,16 @@ import java.util.*; * *

Output

*

- * An annotated VCF. + * An annotated VCF. Additionally, a table like the following will be output: + *

+ *     Total number of samples assayed:                  185
+ *     Total number of records processed:                152
+ *     Number of Hardy-Weinberg violations:              34 (22%)
+ *     Number of no-call violations:                     12 (7%)
+ *     Number of homozygous variant violations:          0 (0%)
+ *     Number of records passing all filters:            106 (69%)
+ *     Number of passing records that are polymorphic:   98 (92%)
+ * 
*

* *

Examples

diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index f902ce276..08baae7a7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000", 1, - Arrays.asList("a1c3ba9efc28ee0606339604095076ea") + Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9") ); executeTest("Testing SnpEff annotations", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index e992684bc..b90e6d0ff 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.Arrays; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval"; + private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf"; @@ -14,6 +14,27 @@ public class VariantEvalIntegrationTest extends WalkerTest { private static String cmdRoot = "-T VariantEval" + " -R " + b36KGReference; + @Test + public void testFunctionClassWithSnpeff() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "--dbsnp " + b37dbSNP132, + "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-noEV", + "-EV TiTvVariantEvaluator", + "-noST", + "-ST FunctionalClass", + "-BTI eval", + "-o %s" + ), + 1, + Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12") + ); + executeTest("testFunctionClassWithSnpeff", spec); + } + @Test public void testStratifySamplesAndExcludeMonomorphicSites() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl index 21cb8bb6b..ba4198292 100755 --- a/public/perl/liftOverVCF.pl +++ b/public/perl/liftOverVCF.pl @@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; if ($recordOriginalLocation) { $cmd .= " -recordOriginalLocation"; } @@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out"; system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); # clean up