Adding docs from Andrey since his repo was all screwed up.
This commit is contained in:
parent
ce73dc4071
commit
f04e51c6c2
|
|
@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
|
||||||
|
* but this latter functionality is now superceded by UnifiedGenotyper.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
|
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
|
||||||
* data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
|
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
|
||||||
* and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
|
* include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
|
||||||
* or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
|
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
|
||||||
* the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
|
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
|
||||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
|
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
|
||||||
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
|
* metrics for the post-processing tools to make the final decision). The calls are performed by default
|
||||||
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
|
* from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
|
||||||
|
* command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
|
||||||
|
* respectively. Indels are called from the tumor sample and annotated as germline
|
||||||
|
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
|
||||||
|
* if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
|
||||||
|
* is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
|
||||||
|
* in tumor sample, it will not be reported.
|
||||||
*
|
*
|
||||||
* <b>If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
|
* To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
|
||||||
* please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.</b>
|
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
|
||||||
|
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
|
||||||
*
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* Indel calls with associated metrics.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||||
|
* -R ref.fasta \
|
||||||
|
* -T SomaticIndelDetector \
|
||||||
|
* -o indels.vcf \
|
||||||
|
* -verbose indels.txt
|
||||||
|
* -I:normal normal.bam \
|
||||||
|
* -I:tumor tumor.bam
|
||||||
|
* </pre>
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
|
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
|
||||||
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||||
// @Output
|
// @Output
|
||||||
// PrintStream out;
|
// PrintStream out;
|
||||||
@Output(doc="File to which variants should be written",required=true)
|
@Output(doc="File to write variants (indels) in VCF format",required=true)
|
||||||
protected VCFWriter vcf_writer = null;
|
protected VCFWriter vcf_writer = null;
|
||||||
|
|
||||||
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
|
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
|
||||||
|
|
@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "genotype_intervals", shortName = "genotype",
|
@Argument(fullName = "genotype_intervals", shortName = "genotype",
|
||||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
|
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
|
||||||
public String genotypeIntervalsFile = null;
|
public String genotypeIntervalsFile = null;
|
||||||
|
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
|
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
|
||||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||||
"to sort and keep it in memory (increases memory usage!).")
|
"to sort and keep it in memory (increases memory usage!).")
|
||||||
protected boolean GENOTYPE_NOT_SORTED = false;
|
protected boolean GENOTYPE_NOT_SORTED = false;
|
||||||
|
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName="unpaired", shortName="unpaired",
|
@Argument(fullName="unpaired", shortName="unpaired",
|
||||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||||
boolean call_unpaired = false;
|
boolean call_unpaired = false;
|
||||||
boolean call_somatic ;
|
boolean call_somatic ;
|
||||||
|
|
||||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||||
doc="Verbose output file in text format", required=false)
|
doc="Verbose output file in text format", required=false)
|
||||||
java.io.File verboseOutput = null;
|
java.io.File verboseOutput = null;
|
||||||
|
|
||||||
@Argument(fullName="bedOutput", shortName="bed",
|
@Argument(fullName="bedOutput", shortName="bed",
|
||||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||||
java.io.File bedOutput = null;
|
java.io.File bedOutput = null;
|
||||||
|
|
||||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||||
doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
|
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
|
||||||
int minCoverage = 6;
|
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
|
||||||
|
int minCoverage = 6;
|
||||||
|
|
||||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||||
doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
|
||||||
int minNormalCoverage = 4;
|
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||||
|
int minNormalCoverage = 4;
|
||||||
|
|
||||||
@Argument(fullName="minFraction", shortName="minFraction",
|
@Argument(fullName="minFraction", shortName="minFraction",
|
||||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||||
double minFraction = 0.3;
|
double minFraction = 0.3;
|
||||||
|
|
||||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
|
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
|
||||||
double minConsensusFraction = 0.7;
|
"all indel observations at the site exceeds this threshold", required=false)
|
||||||
|
double minConsensusFraction = 0.7;
|
||||||
|
|
||||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||||
"(minIndelCount not met) will not pass.", required=false)
|
"(minIndelCount not met) will not pass.", required=false)
|
||||||
int minIndelCount = 0;
|
int minIndelCount = 0;
|
||||||
|
|
||||||
@Argument(fullName="refseq", shortName="refseq",
|
@Argument(fullName="refseq", shortName="refseq",
|
||||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
|
||||||
String RefseqFileName = null;
|
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||||
|
String RefseqFileName = null;
|
||||||
|
|
||||||
@Argument(fullName="blacklistedLanes", shortName="BL",
|
//@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||||
doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||||
"by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||||
PlatformUnitFilterHelper dummy;
|
//PlatformUnitFilterHelper dummy;
|
||||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
|
||||||
|
required=false) Boolean DEBUG = false;
|
||||||
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
|
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
|
||||||
"May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
|
"May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
|
||||||
|
"window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
|
||||||
|
"than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
|
||||||
|
"if window is too small",required=false) int WINDOW_SIZE = 200;
|
||||||
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
|
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
|
||||||
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
|
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private WindowContext tumor_context;
|
private WindowContext tumor_context;
|
||||||
private WindowContext normal_context;
|
private WindowContext normal_context;
|
||||||
private int currentContigIndex = -1;
|
private int currentContigIndex = -1;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue