Adding docs for DepthOfCoverage and ValidationAmplicons
This commit is contained in:
parent
10d8033bcf
commit
a8935c99fc
|
|
@ -51,14 +51,48 @@ import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A parallelizable walker designed to quickly aggregate relevant coverage statistics across samples in the input
|
* Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
|
||||||
* file. Assesses the mean and median granular coverages of each sample, and generates part of a cumulative
|
|
||||||
* distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command
|
|
||||||
* line arguments.
|
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and
|
||||||
|
* aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
|
||||||
|
* sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
|
||||||
|
* and/or percentage of bases covered to or beyond a threshold.
|
||||||
|
* Additionally, reads and bases can be filtered by mapping or base quality score.
|
||||||
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* One or more bam files (with proper headers) to be analyzed for coverage statistics
|
||||||
|
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
|
||||||
|
* - no suffix: per locus coverage
|
||||||
|
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
|
||||||
|
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
|
||||||
|
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
|
||||||
|
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
|
||||||
|
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
|
||||||
|
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
|
||||||
|
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
|
||||||
|
* - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||||
|
* -R ref.fasta \
|
||||||
|
* -T DepthOfCoverage \
|
||||||
|
* -o file_name_base \
|
||||||
|
* -I input_bams.list \
|
||||||
|
* [-geneList refSeq.sorted.txt] \
|
||||||
|
* [-pt readgroup] \
|
||||||
|
* [-ct 4 -ct 6 -ct 10] \
|
||||||
|
* [-L my_capture_genes.interval_list]
|
||||||
|
* </pre>
|
||||||
*
|
*
|
||||||
* @Author chartl
|
|
||||||
* @Date Feb 22, 2010
|
|
||||||
*/
|
*/
|
||||||
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
|
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
|
||||||
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)
|
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)
|
||||||
|
|
|
||||||
|
|
@ -30,21 +30,77 @@ import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Creates FASTA sequences for use in Sequenom or PCR utilities for site amplification and subsequent validation
|
||||||
* User: chartl
|
*
|
||||||
* Date: 6/13/11
|
* <p>
|
||||||
* Time: 2:12 PM
|
* ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe
|
||||||
* To change this template use File | Settings | File Templates.
|
* sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within
|
||||||
|
* the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies
|
||||||
|
* reasons why the site may fail validation (nearby variation, for example).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an
|
||||||
|
* interval list defining the size of the amplicons around the sites to be validated
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* Output is a FASTA-formatted file with some modifications at probe sites. For instance:
|
||||||
|
* <pre>
|
||||||
|
* >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
|
||||||
|
* CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
|
||||||
|
* >20:792122 Valid 20_792122
|
||||||
|
* TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
|
||||||
|
* >20:994145 Valid 20_994145
|
||||||
|
* TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
|
||||||
|
* >20:1074230 SITE_IS_FILTERED=1, 20_1074230
|
||||||
|
* ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
|
||||||
|
* >20:1084330 DELETION=1, 20_1084330
|
||||||
|
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
|
||||||
|
*</pre>
|
||||||
|
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
|
||||||
|
*
|
||||||
|
* Valid // amplicon is valid
|
||||||
|
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
|
||||||
|
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
|
||||||
|
* MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon
|
||||||
|
* DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
|
||||||
|
* DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak
|
||||||
|
* START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give Sequenom a good chance to find a suitable primer
|
||||||
|
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give Sequenom a good chance to find a suitable primer
|
||||||
|
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
|
||||||
|
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or following, or a deletion that spans the site itself)
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* java
|
||||||
|
* -jar GenomeAnalysisTK.jar
|
||||||
|
* -T ValidationAmplicons
|
||||||
|
* -R /humgen/1kg/reference/human_g1k_v37.fasta
|
||||||
|
* -BTI ProbeIntervals
|
||||||
|
* -ProbeIntervals:table interval_table.table
|
||||||
|
* -ValidateAlleles:vcf sites_to_validate.vcf
|
||||||
|
* -MaskAlleles:vcf mask_sites.vcf
|
||||||
|
* --virtualPrimerSize 30
|
||||||
|
* -o probes.fasta
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author chartl
|
||||||
|
* @since July 2011
|
||||||
*/
|
*/
|
||||||
@Requires(value={DataSource.REFERENCE})
|
@Requires(value={DataSource.REFERENCE})
|
||||||
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
|
||||||
@Input(fullName = "ProbeIntervals", doc="Chris document me", required=true)
|
@Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+
|
||||||
|
"intervals surrounding the probe sites amplicons should be designed for", required=true)
|
||||||
RodBinding<TableFeature> probeIntervals;
|
RodBinding<TableFeature> probeIntervals;
|
||||||
|
|
||||||
@Input(fullName = "ValidateAlleles", doc="Chris document me", required=true)
|
@Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true)
|
||||||
RodBinding<VariantContext> validateAlleles;
|
RodBinding<VariantContext> validateAlleles;
|
||||||
|
|
||||||
@Input(fullName = "MaskAlleles", doc="Chris document me", required=true)
|
@Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true)
|
||||||
RodBinding<VariantContext> maskAlleles;
|
RodBinding<VariantContext> maskAlleles;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue