Merge branch 'master' of ssh://copper.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
b2e8e11128
|
|
@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class PlatformFilter extends ReadFilter {
|
||||
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this strign", required=false)
|
||||
@Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false)
|
||||
protected String[] PLFilterNames;
|
||||
|
||||
public boolean filterOut(SAMRecord rec) {
|
||||
|
|
|
|||
|
|
@ -68,6 +68,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
|||
* -I input1.bam \
|
||||
* -I input2.bam \
|
||||
* --read_filter MappingQualityZero
|
||||
*
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T PrintReads \
|
||||
* -o output.bam \
|
||||
* -I input.bam \
|
||||
* -n 2000
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ import java.util.*;
|
|||
* (http://snpeff.sourceforge.net/).
|
||||
*
|
||||
* For each variant, chooses one of the effects of highest biological impact from the SnpEff
|
||||
* output file (which must be provided on the command line via --snpEffFile <filename>.vcf),
|
||||
* output file (which must be provided on the command line via --snpEffFile filename.vcf),
|
||||
* and adds annotations on that effect.
|
||||
*
|
||||
* @author David Roazen
|
||||
|
|
@ -68,23 +68,31 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
|
|||
// Key names for the INFO field annotations we will add to each record, along
|
||||
// with parsing-related information:
|
||||
public enum InfoFieldKey {
|
||||
EFF (-1),
|
||||
EFF_IMPACT (0),
|
||||
EFF_CODON_CHANGE (1),
|
||||
EFF_AMINO_ACID_CHANGE (2),
|
||||
EFF_GENE_NAME (3),
|
||||
EFF_GENE_BIOTYPE (4),
|
||||
EFF_TRANSCRIPT_ID (6),
|
||||
EFF_EXON_ID (7);
|
||||
EFFECT_KEY ("SNPEFF_EFFECT", -1),
|
||||
IMPACT_KEY ("SNPEFF_IMPACT", 0),
|
||||
CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1),
|
||||
AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2),
|
||||
GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3),
|
||||
GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4),
|
||||
TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6),
|
||||
EXON_ID_KEY ("SNPEFF_EXON_ID", 7);
|
||||
|
||||
// Actual text of the key
|
||||
private final String keyName;
|
||||
|
||||
// Index within the effect metadata subfields from the SnpEff EFF annotation
|
||||
// where each key's associated value can be found during parsing.
|
||||
private final int fieldIndex;
|
||||
|
||||
InfoFieldKey ( int fieldIndex ) {
|
||||
InfoFieldKey ( String keyName, int fieldIndex ) {
|
||||
this.keyName = keyName;
|
||||
this.fieldIndex = fieldIndex;
|
||||
}
|
||||
|
||||
public String getKeyName() {
|
||||
return keyName;
|
||||
}
|
||||
|
||||
public int getFieldIndex() {
|
||||
return fieldIndex;
|
||||
}
|
||||
|
|
@ -292,27 +300,27 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
|
|||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList( InfoFieldKey.EFF.toString(),
|
||||
InfoFieldKey.EFF_IMPACT.toString(),
|
||||
InfoFieldKey.EFF_CODON_CHANGE.toString(),
|
||||
InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(),
|
||||
InfoFieldKey.EFF_GENE_NAME.toString(),
|
||||
InfoFieldKey.EFF_GENE_BIOTYPE.toString(),
|
||||
InfoFieldKey.EFF_TRANSCRIPT_ID.toString(),
|
||||
InfoFieldKey.EFF_EXON_ID.toString()
|
||||
return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(),
|
||||
InfoFieldKey.IMPACT_KEY.getKeyName(),
|
||||
InfoFieldKey.CODON_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_NAME_KEY.getKeyName(),
|
||||
InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(),
|
||||
InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(),
|
||||
InfoFieldKey.EXON_ID_KEY.getKeyName()
|
||||
);
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF.toString(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_IMPACT.toString(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_CODON_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_NAME.toString(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_GENE_BIOTYPE.toString(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFF_EXON_ID.toString(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
|
||||
new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant")
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -375,16 +383,16 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
|
|||
}
|
||||
|
||||
try {
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]);
|
||||
impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]);
|
||||
}
|
||||
catch ( IllegalArgumentException e ) {
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.EFF_IMPACT.getFieldIndex()]));
|
||||
parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]));
|
||||
}
|
||||
|
||||
codonChange = effectMetadata[InfoFieldKey.EFF_CODON_CHANGE.getFieldIndex()];
|
||||
aminoAcidChange = effectMetadata[InfoFieldKey.EFF_AMINO_ACID_CHANGE.getFieldIndex()];
|
||||
geneName = effectMetadata[InfoFieldKey.EFF_GENE_NAME.getFieldIndex()];
|
||||
geneBiotype = effectMetadata[InfoFieldKey.EFF_GENE_BIOTYPE.getFieldIndex()];
|
||||
codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()];
|
||||
aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()];
|
||||
geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()];
|
||||
geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()];
|
||||
|
||||
if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) {
|
||||
try {
|
||||
|
|
@ -398,8 +406,8 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
|
|||
coding = EffectCoding.UNKNOWN;
|
||||
}
|
||||
|
||||
transcriptID = effectMetadata[InfoFieldKey.EFF_TRANSCRIPT_ID.getFieldIndex()];
|
||||
exonID = effectMetadata[InfoFieldKey.EFF_EXON_ID.getFieldIndex()];
|
||||
transcriptID = effectMetadata[InfoFieldKey.TRANSCRIPT_ID_KEY.getFieldIndex()];
|
||||
exonID = effectMetadata[InfoFieldKey.EXON_ID_KEY.getFieldIndex()];
|
||||
}
|
||||
|
||||
private void parseError ( String message ) {
|
||||
|
|
@ -443,14 +451,14 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
|
|||
public Map<String, Object> getAnnotations() {
|
||||
Map<String, Object> annotations = new LinkedHashMap<String, Object>(Utils.optimumHashSize(InfoFieldKey.values().length));
|
||||
|
||||
addAnnotation(annotations, InfoFieldKey.EFF.toString(), effect.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_IMPACT.toString(), impact.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_CODON_CHANGE.toString(), codonChange);
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_AMINO_ACID_CHANGE.toString(), aminoAcidChange);
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_GENE_NAME.toString(), geneName);
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_GENE_BIOTYPE.toString(), geneBiotype);
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_TRANSCRIPT_ID.toString(), transcriptID);
|
||||
addAnnotation(annotations, InfoFieldKey.EFF_EXON_ID.toString(), exonID);
|
||||
addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString());
|
||||
addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange);
|
||||
addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName);
|
||||
addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype);
|
||||
addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID);
|
||||
addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID);
|
||||
|
||||
return annotations;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,14 +69,23 @@ import java.util.*;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
|
||||
* </p><p>
|
||||
* - no suffix: per locus coverage
|
||||
* </p><p>
|
||||
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
|
||||
* </p><p>
|
||||
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
|
||||
* </p><p>
|
||||
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
|
||||
* </p><p>
|
||||
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
|
||||
* </p><p>
|
||||
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
|
||||
* </p><p>
|
||||
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
|
||||
* </p><p>
|
||||
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
|
||||
* </p><p>
|
||||
* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
|
||||
* </p>
|
||||
*
|
||||
|
|
|
|||
|
|
@ -43,8 +43,10 @@ import java.util.List;
|
|||
* Generates an alternative reference sequence over the specified interval.
|
||||
*
|
||||
* <p>
|
||||
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
|
||||
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'.
|
||||
* Note that if there are multiple variants at a site, it takes the first one seen.
|
||||
* Reference bases for each interval will be output as a separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
|
|
|
|||
|
|
@ -42,6 +42,9 @@ import java.io.PrintStream;
|
|||
*
|
||||
* <p>
|
||||
* The output format can be partially controlled using the provided command-line arguments.
|
||||
* Specify intervals with the usual -L argument to output only the reference bases within your intervals.
|
||||
* Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a
|
||||
* separate fasta sequence (named numerically in order).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
|
|
|
|||
|
|
@ -68,26 +68,59 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
|||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Tool for calling indels in Tumor-Normal paired sample mode; this tool supports single-sample mode as well,
|
||||
* but this latter functionality is now superceded by UnifiedGenotyper.
|
||||
*
|
||||
* <p>
|
||||
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
|
||||
* data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read
|
||||
* and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample,
|
||||
* or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified,
|
||||
* the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline
|
||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic
|
||||
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
|
||||
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
|
||||
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
|
||||
* include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
|
||||
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
|
||||
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
|
||||
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
|
||||
* metrics for the post-processing tools to make the final decision). The calls are performed by default
|
||||
* from a matched tumor-normal pair of samples. In this case, two (sets of) input bam files must be specified using tagged -I
|
||||
* command line arguments: normal and tumor bam(s) must be passed with -I:normal and -I:tumor arguments,
|
||||
* respectively. Indels are called from the tumor sample and annotated as germline
|
||||
* if even a weak evidence for the same indel, not necessarily a confident call, exists in the normal sample, or as somatic
|
||||
* if normal sample has coverage at the site but no indication for an indel. Note that strictly speaking the calling
|
||||
* is not even attempted in normal sample: if there is an indel in normal that is not detected/does not pass a threshold
|
||||
* in tumor sample, it will not be reported.
|
||||
*
|
||||
* <b>If any of the general usage of this tool or any of the command-line arguments for this tool are not clear to you,
|
||||
* please email asivache at broadinstitute dot org and he will gladly explain everything in more detail.</b>
|
||||
* To make indel calls and associated metrics for a single sample, this tool can be run with --unpaired flag (input
|
||||
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
|
||||
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Indel calls with associated metrics.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T SomaticIndelDetector \
|
||||
* -o indels.vcf \
|
||||
* -verbose indels.txt
|
||||
* -I:normal normal.bam \
|
||||
* -I:tumor tumor.bam
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
|
||||
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
||||
// @Output
|
||||
// PrintStream out;
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
@Output(doc="File to write variants (indels) in VCF format",required=true)
|
||||
protected VCFWriter vcf_writer = null;
|
||||
|
||||
@Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true)
|
||||
|
|
@ -102,68 +135,80 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
|
|||
|
||||
@Hidden
|
||||
@Argument(fullName = "genotype_intervals", shortName = "genotype",
|
||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
|
||||
doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false)
|
||||
public String genotypeIntervalsFile = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
|
||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||
"to sort and keep it in memory (increases memory usage!).")
|
||||
doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
|
||||
"if the list turns out to be unsorted, it will throw an exception. "+
|
||||
"Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
|
||||
"to sort and keep it in memory (increases memory usage!).")
|
||||
protected boolean GENOTYPE_NOT_SORTED = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="unpaired", shortName="unpaired",
|
||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||
@Argument(fullName="unpaired", shortName="unpaired",
|
||||
doc="Perform unpaired calls (no somatic status detection)", required=false)
|
||||
boolean call_unpaired = false;
|
||||
boolean call_somatic ;
|
||||
boolean call_somatic ;
|
||||
|
||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||
doc="Verbose output file in text format", required=false)
|
||||
java.io.File verboseOutput = null;
|
||||
@Argument(fullName="verboseOutput", shortName="verbose",
|
||||
doc="Verbose output file in text format", required=false)
|
||||
java.io.File verboseOutput = null;
|
||||
|
||||
@Argument(fullName="bedOutput", shortName="bed",
|
||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
|
||||
java.io.File bedOutput = null;
|
||||
|
||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||
doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false)
|
||||
int minCoverage = 6;
|
||||
@Argument(fullName="minCoverage", shortName="minCoverage",
|
||||
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
|
||||
"with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
|
||||
int minCoverage = 6;
|
||||
|
||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||
doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||
int minNormalCoverage = 4;
|
||||
@Argument(fullName="minNormalCoverage", shortName="minNormalCoverage",
|
||||
doc="used only in default (somatic) mode; normal sample must have at least minNormalCoverage "+
|
||||
"or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false)
|
||||
int minNormalCoverage = 4;
|
||||
|
||||
@Argument(fullName="minFraction", shortName="minFraction",
|
||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||
double minFraction = 0.3;
|
||||
@Argument(fullName="minFraction", shortName="minFraction",
|
||||
doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+
|
||||
" (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false)
|
||||
double minFraction = 0.3;
|
||||
|
||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false)
|
||||
double minConsensusFraction = 0.7;
|
||||
@Argument(fullName="minConsensusFraction", shortName="minConsensusFraction",
|
||||
doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt "+
|
||||
"all indel observations at the site exceeds this threshold", required=false)
|
||||
double minConsensusFraction = 0.7;
|
||||
|
||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||
"(minIndelCount not met) will not pass.", required=false)
|
||||
int minIndelCount = 0;
|
||||
@Argument(fullName="minIndelCount", shortName="minCnt",
|
||||
doc="Minimum count of reads supporting consensus indel required for making the call. "+
|
||||
" This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+
|
||||
"(minIndelCount not met) will not pass.", required=false)
|
||||
int minIndelCount = 0;
|
||||
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||
String RefseqFileName = null;
|
||||
@Argument(fullName="refseq", shortName="refseq",
|
||||
doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with "+
|
||||
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
|
||||
String RefseqFileName = null;
|
||||
|
||||
@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||
doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||
"by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||
PlatformUnitFilterHelper dummy;
|
||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false;
|
||||
//@Argument(fullName="blacklistedLanes", shortName="BL",
|
||||
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
|
||||
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
|
||||
//PlatformUnitFilterHelper dummy;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",
|
||||
required=false) Boolean DEBUG = false;
|
||||
@Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+
|
||||
"May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200;
|
||||
"May need to be increased to accomodate longer reads or longer deletions. A read can be fit into the "+
|
||||
"window if its length on the reference (i.e. read length + length of deletion gap(s) if any) is smaller "+
|
||||
"than the window size. Reads that do not fit will be ignored, so long deletions can not be called "+
|
||||
"if window is too small",required=false) int WINDOW_SIZE = 200;
|
||||
@Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+
|
||||
" the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000;
|
||||
|
||||
|
||||
|
||||
private WindowContext tumor_context;
|
||||
private WindowContext normal_context;
|
||||
private int currentContigIndex = -1;
|
||||
|
|
|
|||
|
|
@ -76,6 +76,42 @@ import java.util.Map;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
*
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* * The file begins with a series of comment lines describing:
|
||||
* ** The number of counted loci
|
||||
* ** The number of counted bases
|
||||
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*
|
||||
* <pre>
|
||||
* # Counted Sites 19451059
|
||||
* # Counted Bases 56582018
|
||||
* # Skipped Sites 82666
|
||||
* # Fraction Skipped 1 / 235 bp
|
||||
* ReadGroup,QualityScore,Cycle,Dinuc,nObservations,nMismatches,Qempirical
|
||||
* SRR006446,11,65,CA,9,1,10
|
||||
* SRR006446,11,48,TA,10,0,40
|
||||
* SRR006446,11,67,AA,27,0,40
|
||||
* SRR006446,11,61,GA,11,1,10
|
||||
* SRR006446,12,34,CA,47,1,17
|
||||
* SRR006446,12,30,GA,52,1,17
|
||||
* SRR006446,12,36,AA,352,1,25
|
||||
* SRR006446,12,17,TA,182,11,12
|
||||
* SRR006446,11,48,TG,2,0,40
|
||||
* SRR006446,11,67,AG,1,0,40
|
||||
* SRR006446,12,34,CG,9,0,40
|
||||
* SRR006446,12,30,GG,43,0,40
|
||||
* ERR001876,4,31,AG,1,0,40
|
||||
* ERR001876,4,31,AT,2,2,1
|
||||
* ERR001876,4,31,CA,1,0,40
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ import java.util.List;
|
|||
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
|
||||
*</pre>
|
||||
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
|
||||
*
|
||||
*<pre>
|
||||
* Valid // amplicon is valid
|
||||
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
|
||||
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
|
||||
|
|
@ -72,10 +72,10 @@ import java.util.List;
|
|||
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
|
||||
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
|
||||
* INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself)
|
||||
* </p>
|
||||
* </pre></p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre></pre>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ValidationAmplicons
|
||||
|
|
|
|||
|
|
@ -56,6 +56,22 @@ import java.util.*;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Evaluation tables detailing the results of the eval modules which were applied.
|
||||
* For example:
|
||||
* <pre>
|
||||
* output.eval.gatkreport:
|
||||
* ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
|
||||
* CountVariants CompRod CpG EvalRod JexlExpression Novelty nProcessedLoci nCalledLoci nRefLoci nVariantLoci variantRate ...
|
||||
* CountVariants dbsnp CpG eval none all 65900028 135770 0 135770 0.00206024 ...
|
||||
* CountVariants dbsnp CpG eval none known 65900028 47068 0 47068 0.00071423 ...
|
||||
* CountVariants dbsnp CpG eval none novel 65900028 88702 0 88702 0.00134601 ...
|
||||
* CountVariants dbsnp all eval none all 65900028 330818 0 330818 0.00502000 ...
|
||||
* CountVariants dbsnp all eval none known 65900028 120685 0 120685 0.00183133 ...
|
||||
* CountVariants dbsnp all eval none novel 65900028 210133 0 210133 0.00318866 ...
|
||||
* CountVariants dbsnp non_CpG eval none all 65900028 195048 0 195048 0.00295976 ...
|
||||
* CountVariants dbsnp non_CpG eval none known 65900028 73617 0 73617 0.00111710 ...
|
||||
* CountVariants dbsnp non_CpG eval none novel 65900028 121431 0 121431 0.00184265 ...
|
||||
* ...
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -11,12 +12,19 @@ import java.util.List;
|
|||
* Stratifies by nonsense, missense, silent, and all annotations in the input ROD, from the INFO field annotation.
|
||||
*/
|
||||
public class FunctionalClass extends VariantStratifier {
|
||||
|
||||
public enum FunctionalType {
|
||||
silent,
|
||||
missense,
|
||||
nonsense
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
states.add("all");
|
||||
states.add("silent");
|
||||
states.add("missense");
|
||||
states.add("nonsense");
|
||||
for ( FunctionalType type : FunctionalType.values() )
|
||||
states.add(type.name());
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -26,10 +34,12 @@ public class FunctionalClass extends VariantStratifier {
|
|||
relevantStates.add("all");
|
||||
|
||||
if (eval != null && eval.isVariant()) {
|
||||
String type = null;
|
||||
FunctionalType type = null;
|
||||
|
||||
if (eval.hasAttribute("refseq.functionalClass")) {
|
||||
type = eval.getAttributeAsString("refseq.functionalClass");
|
||||
try {
|
||||
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass"));
|
||||
} catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
|
||||
int annotationId = 1;
|
||||
String key;
|
||||
|
|
@ -37,24 +47,33 @@ public class FunctionalClass extends VariantStratifier {
|
|||
do {
|
||||
key = String.format("refseq.functionalClass_%d", annotationId);
|
||||
|
||||
String newtype = eval.getAttributeAsString(key);
|
||||
|
||||
if ( newtype != null && !newtype.equalsIgnoreCase("null") &&
|
||||
( type == null ||
|
||||
( type.equals("silent") && !newtype.equals("silent") ) ||
|
||||
( type.equals("missense") && newtype.equals("nonsense") ) )
|
||||
) {
|
||||
type = newtype;
|
||||
String newtypeStr = eval.getAttributeAsString(key);
|
||||
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
|
||||
try {
|
||||
FunctionalType newType = FunctionalType.valueOf(newtypeStr);
|
||||
if ( type == null ||
|
||||
( type == FunctionalType.silent && newType != FunctionalType.silent ) ||
|
||||
( type == FunctionalType.missense && newType == FunctionalType.nonsense ) ) {
|
||||
type = newType;
|
||||
}
|
||||
} catch ( Exception e ) {} // don't error out if the type isn't supported
|
||||
}
|
||||
|
||||
annotationId++;
|
||||
} while (eval.hasAttribute(key));
|
||||
|
||||
} else if ( eval.hasAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName() ) ) {
|
||||
SnpEff.EffectType snpEffType = SnpEff.EffectType.valueOf(eval.getAttribute(SnpEff.InfoFieldKey.EFFECT_KEY.getKeyName()).toString());
|
||||
if ( snpEffType == SnpEff.EffectType.STOP_GAINED )
|
||||
type = FunctionalType.nonsense;
|
||||
else if ( snpEffType == SnpEff.EffectType.NON_SYNONYMOUS_CODING )
|
||||
type = FunctionalType.missense;
|
||||
else if ( snpEffType == SnpEff.EffectType.SYNONYMOUS_CODING )
|
||||
type = FunctionalType.silent;
|
||||
}
|
||||
|
||||
if (type != null) {
|
||||
if (type.equals("silent")) { relevantStates.add("silent"); }
|
||||
else if (type.equals("missense")) { relevantStates.add("missense"); }
|
||||
else if (type.equals("nonsense")) { relevantStates.add("nonsense"); }
|
||||
if ( type != null ) {
|
||||
relevantStates.add(type.name());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
|
||||
* Annotates a validation (from Sequenom for example) VCF with QC metrics (HW-equilibrium, % failed probes)
|
||||
*
|
||||
* <p>
|
||||
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
|
||||
|
|
@ -57,7 +57,16 @@ import java.util.*;
|
|||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* An annotated VCF.
|
||||
* An annotated VCF. Additionally, a table like the following will be output:
|
||||
* <pre>
|
||||
* Total number of samples assayed: 185
|
||||
* Total number of records processed: 152
|
||||
* Number of Hardy-Weinberg violations: 34 (22%)
|
||||
* Number of no-call violations: 12 (7%)
|
||||
* Number of homozygous variant violations: 0 (0%)
|
||||
* Number of records passing all filters: 106 (69%)
|
||||
* Number of passing records that are polymorphic: 98 (92%)
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
|
|
|
|||
|
|
@ -134,7 +134,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
|
||||
"snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000",
|
||||
1,
|
||||
Arrays.asList("a1c3ba9efc28ee0606339604095076ea")
|
||||
Arrays.asList("486fc6a5ca1819f5ab180d5d72b1ebc9")
|
||||
);
|
||||
executeTest("Testing SnpEff annotations", spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import org.testng.annotations.Test;
|
|||
import java.util.Arrays;
|
||||
|
||||
public class VariantEvalIntegrationTest extends WalkerTest {
|
||||
private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval";
|
||||
private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval";
|
||||
private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf";
|
||||
private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf";
|
||||
private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf";
|
||||
|
|
@ -14,6 +14,27 @@ public class VariantEvalIntegrationTest extends WalkerTest {
|
|||
private static String cmdRoot = "-T VariantEval" +
|
||||
" -R " + b36KGReference;
|
||||
|
||||
@Test
|
||||
public void testFunctionClassWithSnpeff() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
buildCommandLine(
|
||||
"-T VariantEval",
|
||||
"-R " + b37KGReference,
|
||||
"--dbsnp " + b37dbSNP132,
|
||||
"--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf",
|
||||
"-noEV",
|
||||
"-EV TiTvVariantEvaluator",
|
||||
"-noST",
|
||||
"-ST FunctionalClass",
|
||||
"-BTI eval",
|
||||
"-o %s"
|
||||
),
|
||||
1,
|
||||
Arrays.asList("f5f811ceb973d7fd6c1b2b734f1b2b12")
|
||||
);
|
||||
executeTest("testFunctionClassWithSnpeff", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStratifySamplesAndExcludeMonomorphicSites() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
|
|||
|
||||
# lift over the file
|
||||
print "Lifting over the vcf...";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
|
||||
if ($recordOriginalLocation) {
|
||||
$cmd .= " -recordOriginalLocation";
|
||||
}
|
||||
|
|
@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar
|
|||
|
||||
# Filter the VCF for bad records
|
||||
print "\nFixing/removing bad records...\n";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out";
|
||||
system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# clean up
|
||||
|
|
|
|||
Loading…
Reference in New Issue