Adding docs for DepthOfCoverage and ValidationAmplicons
This commit is contained in:
parent
10d8033bcf
commit
a8935c99fc
|
|
@ -51,14 +51,48 @@ import java.io.PrintStream;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A parallelizable walker designed to quickly aggregate relevant coverage statistics across samples in the input
|
||||
* file. Assesses the mean and median granular coverages of each sample, and generates part of a cumulative
|
||||
* distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command
|
||||
* line arguments.
|
||||
* Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
|
||||
*
|
||||
* <p>
|
||||
* DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and
|
||||
* aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
|
||||
* sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
|
||||
* and/or percentage of bases covered to or beyond a threshold.
|
||||
* Additionally, reads and bases can be filtered by mapping or base quality score.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more bam files (with proper headers) to be analyzed for coverage statistics
|
||||
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
|
||||
* - no suffix: per locus coverage
|
||||
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
|
||||
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
|
||||
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
|
||||
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
|
||||
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
|
||||
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
|
||||
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
|
||||
* - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T DepthOfCoverage \
|
||||
* -o file_name_base \
|
||||
* -I input_bams.list \
|
||||
* [-geneList refSeq.sorted.txt] \
|
||||
* [-pt readgroup] \
|
||||
* [-ct 4 -ct 6 -ct 10] \
|
||||
* [-L my_capture_genes.interval_list]
|
||||
* </pre>
|
||||
*
|
||||
* @author chartl
|
||||
* @since Feb 22, 2010
|
||||
*/
|
||||
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
|
||||
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)
|
||||
|
|
|
|||
|
|
@ -30,21 +30,77 @@ import java.util.LinkedList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: 6/13/11
|
||||
* Time: 2:12 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* Creates FASTA sequences for use in Sequenom or PCR utilities for site amplification and subsequent validation
|
||||
*
|
||||
* <p>
|
||||
* ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe
|
||||
* sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within
|
||||
* the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies
|
||||
* reasons why the site may fail validation (nearby variation, for example).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an
|
||||
* interval list defining the size of the amplicons around the sites to be validated
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Output is a FASTA-formatted file with some modifications at probe sites. For instance:
|
||||
* <pre>
|
||||
* >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
|
||||
* CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
|
||||
* >20:792122 Valid 20_792122
|
||||
* TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
|
||||
* >20:994145 Valid 20_994145
|
||||
* TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
|
||||
* >20:1074230 SITE_IS_FILTERED=1, 20_1074230
|
||||
* ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
|
||||
* >20:1084330 DELETION=1, 20_1084330
|
||||
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
|
||||
*</pre>
|
||||
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
|
||||
*
|
||||
* Valid // amplicon is valid
|
||||
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
|
||||
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
|
||||
* MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon
|
||||
* DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
|
||||
* DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak
|
||||
* START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
|
||||
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or following, or a deletion that spans the site itself)
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ValidationAmplicons
|
||||
* -R /humgen/1kg/reference/human_g1k_v37.fasta
|
||||
* -BTI ProbeIntervals
|
||||
* -ProbeIntervals:table interval_table.table
|
||||
* -ValidateAlleles:vcf sites_to_validate.vcf
|
||||
* -MaskAlleles:vcf mask_sites.vcf
|
||||
* --virtualPrimerSize 30
|
||||
* -o probes.fasta
|
||||
* </pre>
|
||||
*
|
||||
* @author chartl
|
||||
* @since July 2011
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE})
|
||||
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||
@Input(fullName = "ProbeIntervals", doc="Chris document me", required=true)
|
||||
@Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+
|
||||
"intervals surrounding the probe sites amplicons should be designed for", required=true)
|
||||
RodBinding<TableFeature> probeIntervals;
|
||||
|
||||
@Input(fullName = "ValidateAlleles", doc="Chris document me", required=true)
|
||||
@Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true)
|
||||
RodBinding<VariantContext> validateAlleles;
|
||||
|
||||
@Input(fullName = "MaskAlleles", doc="Chris document me", required=true)
|
||||
@Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true)
|
||||
RodBinding<VariantContext> maskAlleles;
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue