diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 90036407f..7fe16c9df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -51,14 +51,48 @@ import java.io.PrintStream; import java.util.*; /** - * A parallelizable walker designed to quickly aggregate relevant coverage statistics across samples in the input - * file. Assesses the mean and median granular coverages of each sample, and generates part of a cumulative - * distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command - * line arguments. + * Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library * + *
+ * DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and + * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by + * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, + * and/or percentage of bases covered to or beyond a threshold. + * Additionally, reads and bases can be filtered by mapping or base quality score. + * + *
+ * One or more bam files (with proper headers) to be analyzed for coverage statistics + * (Optional) A REFSEQ Rod to aggregate coverage to the gene level + *
+ * + *+ * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: + * - no suffix: per locus coverage + * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases + * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases + * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval + * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples + * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene + * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples + * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases + * - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases + *
+ * + *+ * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T VariantEval \ + * -o file_name_base \ + * -I input_bams.list + * [-geneList refSeq.sorted.txt] \ + * [-pt readgroup] \ + * [-ct 4 -ct 6 -ct 10] \ + * [-L my_capture_genes.interval_list] + ** - * @Author chartl - * @Date Feb 22, 2010 */ // todo -- cache the map from sample names to means in the print functions, rather than regenerating each time // todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 83a8ce7d7..70f3c6a1a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -44,7 +44,9 @@ import java.util.Set; public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { + /** The default model with the best performance in all cases */ EXACT, + /** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. */ GRID_SEARCH } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 594c1dd28..60dfe4fe7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -53,7 +53,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { } public enum GENOTYPING_MODE { + /** the default; the Unified Genotyper will choose the most likely alternate allele */ DISCOVERY, + /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ GENOTYPE_GIVEN_ALLELES } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1a76bfd07..e7f89bf08 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -36,31 +36,54 @@ import java.io.File; public class UnifiedArgumentCollection { - // control the various models to be used @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + /** + * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: + * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 + */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + /** + * Specifies how to determine the alternate allele to use for genotyping + */ @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with + * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this + * is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might + * be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to + * over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4). + */ @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + /** + * the minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less + * than the calling threshold are emitted but marked as filtered. + */ @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + /** + * This argument is not enabled by default because it increases the runtime by an appreciable amount. + */ @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) public boolean COMPUTE_SLOD = false; @@ -80,7 +103,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false) public int COVERAGE_AT_WHICH_TO_ABORT = -1; - // control the various parameters to be used @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 17; @@ -91,11 +113,17 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - // indel-related arguments + /** + * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. + * Decreasing this value will increase sensitivity but at the cost of larger calling time and a larger number of false positives. + */ @Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false) public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5; + /** + * This argument informs the prior probability of having an indel at a site. + */ @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; @@ -126,22 +154,23 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; + @Hidden @Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false) public boolean dovit = false; + @Hidden @Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false) public boolean GSA_PRODUCTION_ONLY = false; + @Hidden - @Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false) public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL; @Hidden - @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) + @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; - @Deprecated @Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false) private Boolean ALL_BASES_DEPRECATED = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index d31bb6fb9..8d2101d8f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -45,11 +45,71 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; import java.util.*; - /** - * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and - * multi-sample data. The user can choose from several different incorporated calculation models. + * A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data. + * + *
+ * The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype + * likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples, + * emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the + * genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes + * homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on + * both single sample data and multi-sample data. + * + *
+ * The read data from which to make variant calls. + *
+ * + *+ * A raw, unfiltered, highly specific callset in VCF format. + *
+ * + *+ * java -jar GenomeAnalysisTK.jar \ + * -R resources/Homo_sapiens_assembly18.fasta \ + * -T UnifiedGenotyper \ + * -I sample1.bam [-I sample2.bam ...] \ + * --dbsnp dbSNP.vcf \ + * -o snps.raw.vcf \ + * -stand_call_conf [50.0] \ + * -stand_emit_conf 10.0 \ + * -dcov [50] \ + * [-L targets.interval_list] + *+ * + *
+ * The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file + * with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle. Several + * arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed + * argument descriptions below. + *
+ * + *+ * java -jar /path/to/GenomeAnalysisTK.jar \ + * -l INFO \ + * -R resources/Homo_sapiens_assembly18.fasta \ + * -T UnifiedGenotyper \ + * -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \ + * -o my.vcf \ + * --output_mode EMIT_ALL_SITES + *+ * + *
+ * The input read data whose base quality scores need to be assessed. + *
* A database of known polymorphic sites to skip over. *
* @@ -134,6 +136,10 @@ public class CountCovariatesWalker extends LocusWalker+ * The input read data whose base quality scores need to be recalibrated. + *
* The recalibration table file in CSV format that was generated by the CountCovariates walker. *
* diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 61149e5d9..cd2891874 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -30,21 +30,77 @@ import java.util.LinkedList; import java.util.List; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/13/11 - * Time: 2:12 PM - * To change this template use File | Settings | File Templates. + * Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation + * + *+ * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe + * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within + * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies + * reasons why the site may fail validation (nearby variation, for example). + *
+ * + *+ * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an + * interval list defining the size of the amplicons around the sites to be validated + *
+ * + *+ * Output is a FASTA-formatted file with some modifications at probe sites. For instance: + *
+ * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414 + * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC + * >20:792122 Valid 20_792122 + * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT + * >20:994145 Valid 20_994145 + * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC + * >20:1074230 SITE_IS_FILTERED=1, 20_1074230 + * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC + * >20:1084330 DELETION=1, 20_1084330 + * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC + *+ * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: + * + * Valid // amplicon is valid + * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant") + * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak + * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon + * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate + * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak + * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer + * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer + * NO_VARIANTS_FOUND, // no variants found within the amplicon region + * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself) + * + * + *