Merge branch 'master' of ssh://chartl@tin.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Christopher Hartl 2011-08-17 14:06:25 -04:00
commit eea8087c73
24 changed files with 814 additions and 186 deletions

View File

@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
loadArgumentsIntoObject(walker);
argumentSources.add(walker);
Collection<RMDTriplet> newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
Collection<RMDTriplet> rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
// todo: remove me when the old style system is removed
if ( getArgumentCollection().RODBindings.size() > 0 ) {
logger.warn("################################################################################");
logger.warn("################################################################################");
logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2.");
logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
logger.warn("Please use arguments defined by each specific walker instead.");
for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
}
logger.warn("################################################################################");
logger.warn("################################################################################");
System.exit(1);
}
Collection<RMDTriplet> oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser);
oldStyle.addAll(newStyle);
engine.setReferenceMetaDataFiles(oldStyle);
engine.setReferenceMetaDataFiles(rodBindings);
for (ReadFilter filter: filters) {
loadArgumentsIntoObject(filter);

View File

@ -39,8 +39,7 @@ import org.simpleframework.xml.*;
public class DbsnpArgumentCollection {
/**
* A dbSNP VCF file. Variants in this track will be treated as "known" variants
* in tools using this track.
* A dbSNP VCF file.
*/
@Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false)
public RodBinding<VariantContext> dbsnp;

View File

@ -101,6 +101,8 @@ public class GATKArgumentCollection {
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
@Deprecated
@Hidden
@ElementList(required = false)
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :<name>,<type> <file>", required = false)
public ArrayList<String> RODBindings = new ArrayList<String>();
@ -340,14 +342,6 @@ public class GATKArgumentCollection {
return false;
}
}
if (other.RODBindings.size() != RODBindings.size()) {
return false;
}
for (int x = 0; x < RODBindings.size(); x++) {
if (!RODBindings.get(x).equals(other.RODBindings.get(x))) {
return false;
}
}
if (!other.samFiles.equals(this.samFiles)) {
return false;
}

View File

@ -40,26 +40,65 @@ import java.util.TreeSet;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
/**
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
* in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
* in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter
* command line argument.
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
*
* <p>
* PrintReads can dynamically merge the contents of multiple input BAM files, resulting
* in merged output sorted in coordinate order. Can also optionally filter reads based on the
* --read_filter command line argument.
*
* <h2>Input</h2>
* <p>
* One or more bam files.
* </p>
*
* <h2>Output</h2>
* <p>
* A single processed bam file.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T PrintReads \
* -o output.bam \
* -I input1.bam \
* -I input2.bam \
* --read_filter MappingQualityZero
* </pre>
*
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
/** an optional argument to dump the reads out to a BAM file */
@Output(doc="Write output to this BAM filename instead of STDOUT")
SAMFileWriter out;
@Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
String readGroup = null;
/**
* For example, --platform ILLUMINA or --platform 454.
*/
@Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
String platform = null; // E.g. ILLUMINA, 454
String platform = null;
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
int nReadsToPrint = -1;
/**
* Only reads from samples listed in the provided file(s) will be included in the output.
*/
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
public Set<File> sampleFile = new TreeSet<File>();
/**
* Only reads from the sample(s) will be included in the output.
*/
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set<String> sampleNames = new TreeSet<String>();

View File

@ -49,7 +49,34 @@ import java.util.*;
/**
* Annotates variant calls with context information. Users can specify which of the available annotations to use.
* Annotates variant calls with context information.
*
* <p>
* VariantAnnotator is a GATK tool for annotating variant calls based on their context.
* The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself.
*
* <h2>Input</h2>
* <p>
* A variant set to annotate and optionally one or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* An annotated VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantAnnotator \
* -I input.bam \
* -o output.vcf \
* -A DepthOfCoverage
* --variant input.vcf \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@Requires(value={})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
/**
* A dbSNP VCF file from which to annotate.
*
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
*/
@ArgumentCollection
@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
protected String sampleName = null;
/**
* See the -list argument to view available annotations.
*/
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>();
/**
* See the -list argument to view available groups.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected List<String> annotationGroupsToUse = new ArrayList<String>();
/**
* This option enables you to add annotations from one VCF to another.
*
* For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource',
* you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position.
* If multiple records in the rod overlap the given position, one is chosen arbitrarily.
*/
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
protected List<String> expressionsToUse = new ArrayList<String>();
@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
private VariantAnnotatorEngine engine;
private Collection<VariantContext> indelBufferContext;
@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
List<String> rodName = Arrays.asList(variantCollection.variants.getName());
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
// add the non-VCF sample from the command-line, if applicable
if ( sampleName != null ) {
nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
samples.add(sampleName.toUpperCase());
}
// if there are no valid samples, warn the user
if ( samples.size() == 0 ) {
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");

View File

@ -38,12 +38,32 @@ import java.util.List;
/**
* Walks along reference and calculates the GC content for each interval.
*
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* GC content calculations per interval.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T GCContentByInterval \
* -o output.txt \
* -I input.bam \
* -L input.intervals
* </pre>
*
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE})
@By(DataSource.REFERENCE)
public class GCContentByIntervalWalker extends LocusWalker<Long, Long> {
@Output
protected PrintStream out;

View File

@ -40,18 +40,48 @@ import java.util.List;
/**
* Generates an alternative reference sequence over the specified interval. Given variant ROD tracks,
* it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally,
* allows for a "snpmask" ROD to set overlapping bases to 'N'.
* Generates an alternative reference sequence over the specified interval.
*
* <p>
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
*
* <h2>Input</h2>
* <p>
* The reference, requested intervals, and any number of variant rod files.
* </p>
*
* <h2>Output</h2>
* <p>
* A fasta file representing the requested intervals.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T FastaAlternateReferenceMaker \
* -o output.fasta \
* -L input.intervals \
* --variant input.vcf \
* [--snpmask mask.vcf]
* </pre>
*
*/
@WalkerName("FastaAlternateReferenceMaker")
@Reference(window=@Window(start=-1,stop=50))
@Requires(value={DataSource.REFERENCE})
public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
/**
* Variants from these input files are used by this tool to construct an alternate reference.
*/
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
public List<RodBinding<VariantContext>> variants = Collections.emptyList();
/**
* Snps from this file are used as a mask when constructing the alternate reference.
*/
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
public RodBinding<VariantContext> snpmask;

View File

@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
/**
* Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional
* features to control the output format.
* Renders a new reference in FASTA format consisting of only those loci provided in the input data set.
*
* <p>
* The output format can be partially controlled using the provided command-line arguments.
*
* <h2>Input</h2>
* <p>
* The reference and requested intervals.
* </p>
*
* <h2>Output</h2>
* <p>
* A fasta file representing the requested intervals.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T FastaReference \
* -o output.fasta \
* -L input.intervals
* </pre>
*
*/
@WalkerName("FastaReferenceMaker")
public class FastaReferenceWalker extends RefWalker<Pair<GenomeLoc, String>, GenomeLoc> {
@Output PrintStream out;
@Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60;
@Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false;
@Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false)
public int fastaLineWidth=60;
/**
* Please note that when using this argument adjacent intervals will automatically be merged.
*/
@Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false)
public boolean fastaRawSeqs=false;
protected FastaSequence fasta;

View File

@ -45,6 +45,34 @@ import java.util.*;
/**
* Filters variant calls using a number of user-selectable, parameterizable criteria.
*
* <p>
* VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria.
* Records are hard-filtered by changing the value in the FILTER field to something other than PASS.
*
* <h2>Input</h2>
* <p>
* A variant set to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A filtered VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantFiltration \
* -o output.vcf \
* --variant input.vcf \
* --filterExpression "AB < 0.2 || MQ0 > 50" \
* --filterName "Nov09filters" \
* --mask mask.vcf \
* --maskName InDel
* </pre>
*
*/
@Reference(window=@Window(start=-50,stop=50))
public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* Any variant which overlaps entries from the provided mask rod will be filtered.
*/
@Input(fullName="mask", doc="Input ROD mask", required=false)
public RodBinding<Feature> mask;
@Output(doc="File to which variants should be written", required=true)
protected VCFWriter writer = null;
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false)
/**
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
*/
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
protected ArrayList<String> FILTER_EXPS = new ArrayList<String>();
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
/**
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
*/
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
protected ArrayList<String> FILTER_NAMES = new ArrayList<String>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
* VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
* One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
* methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
*/
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
protected ArrayList<String> GENOTYPE_FILTER_EXPS = new ArrayList<String>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
*/
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected ArrayList<String> GENOTYPE_FILTER_NAMES = new ArrayList<String>();
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false)
/**
* Works together with the --clusterWindowSize argument.
*/
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
protected Integer clusterSize = 3;
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false)
/**
* Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
*/
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
protected Integer clusterWindow = 0;
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false)
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
protected Integer MASK_EXTEND = 0;
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String MASK_NAME = "Mask";
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false)
/**
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
* Use this argument to have it evaluate as failing filters instead for these cases.
*/
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean FAIL_MISSING_VALUES = false;
// JEXL expressions for the filters

View File

@ -69,7 +69,7 @@ import java.util.*;
* <p>
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
* or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,

View File

@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
/**
* Left aligns indels in reads.
* Left-aligns indels from reads in a bam file.
*
* <p>
* LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be
* placed at multiple positions and still represent the same haplotype. While a standard convention is to place an
* indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
*
* <h2>Input</h2>
* <p>
* A bam file to left-align.
* </p>
*
* <h2>Output</h2>
* <p>
* A left-aligned bam.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx3g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T LeftAlignIndels \
* -I input.bam \
* -o output.vcf
* </pre>
*
*/
public class LeftAlignIndels extends ReadWalker<Integer, Integer> {
@Output(required=false, doc="Output bam")
protected StingSAMFileWriter writer = null;
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
"If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false)
/**
* If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool
* may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available
* space (by setting java.io.tempdir on the command-line).
*/
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false)
protected int MAX_RECORDS_IN_RAM = 500000;
public void initialize() {

View File

@ -23,7 +23,10 @@
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -49,16 +52,46 @@ import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
/**
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
*
* <p>
* Performs physical phasing of SNP calls, based on sequencing reads.
* </p>
*
* <h2>Input</h2>
* <p>
* VCF file of SNP calls, BAM file of sequence reads.
* </p>
*
* <h2>Output</h2>
* <p>
* Phased VCF file.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ReadBackedPhasing
* -R reference.fasta
* -I reads.bam
* --variant:vcf SNPs.vcf
* -BTI variant
* -BTIMR INTERSECTION
* -o phased_SNPs.vcf
* --phaseQualityThresh 20.0
* </pre>
*
* @author Menachem Fromer
* @since July 2010
*/
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
@Requires(value = {DataSource.READS, DataSource.REFERENCE})
@By(DataSource.READS)
@ReadFilters({MappingQualityZeroReadFilter.class})
// Filter out all reads with zero mapping quality
@ReadFilters({MappingQualityZeroReadFilter.class})
public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, PhasingStats> {
private static final boolean DEBUG = false;
@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
@Output(doc = "File to which variants should be written", required = true)
protected VCFWriter writer = null;
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads; [default:20000]", required = false)
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false)
protected Integer cacheWindow = 20000;
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm; [default:10]", required = false)
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false)
protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing; [default:10.0]", required = false)
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false)
protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
@Hidden
@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
protected String variantStatsFilePrefix = null;
private PhasingQualityStatsWriter statsWriter = null;
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing [default: 17]", required = false)
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false)
public int MIN_BASE_QUALITY_SCORE = 17;
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing [default: 20]", required = false)
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false)
public int MIN_MAPPING_QUALITY_SCORE = 20;
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
@ -111,10 +144,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent";
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records [default:false]", required = false)
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records", required = false)
protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false;
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false)
protected int maxGenomicDistanceForMNP = 1;
@Hidden

View File

@ -11,7 +11,31 @@ import java.io.PrintStream;
/**
* Walks over the input data set, calculating the total number of covered loci for diagnostic purposes.
*
* <p>
* Simplest example of a locus walker.
*
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of loci traversed.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountLoci \
* -o output.txt \
* -I input.bam \
* [-L input.intervals]
* </pre>
*
*/
public class CountLociWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
@Output(doc="Write count to this file instead of STDOUT")

View File

@ -39,6 +39,26 @@ import java.util.List;
* query name order. Breaks counts down by total pairs and number
* of paired reads.
*
*
* <h2>Input</h2>
* <p>
* One or more bam files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of pairs seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountPairs \
* -o output.txt \
* -I input.bam
* </pre>
*
* @author mhanna
*/
public class CountPairsWalker extends ReadPairWalker<Integer,Long> {

View File

@ -25,7 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Collections;
import java.util.List;
/**
* Prints out counts of the number of reference ordered data objects are
* each locus for debugging RefWalkers.
* Prints out counts of the number of reference ordered data objects encountered.
*
*
* <h2>Input</h2>
* <p>
* One or more rod files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of rods seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountRODsByRef \
* -o output.txt \
* --rod input.vcf
* </pre>
*
*/
public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
public class CountRODsByRefWalker extends RefWalker<CountRODsWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
/**
* One or more input rod files
*/
@Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
@Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
CountRodWalker crw = new CountRodWalker();
CountRODsWalker crw = new CountRODsWalker();
public void initialize() {
crw.verbose = verbose;
crw.showSkipped = showSkipped;
}
public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return crw.map(tracker, ref, context);
}
@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<Ex
return crw.reduceInit();
}
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRodWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRODsWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
return crw.reduce(point, sum);
}
}

View File

@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.*;
/**
* Prints out counts of the number of reference ordered data objects are
* each locus for debugging RodWalkers.
* Prints out counts of the number of reference ordered data objects encountered.
*
*
* <h2>Input</h2>
* <p>
* One or more rod files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of rods seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountRODs \
* -o output.txt \
* --rod input.vcf
* </pre>
*
*/
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
public class CountRODsWalker extends RodWalker<CountRODsWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
@Output
public PrintStream out;
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
/**
* One or more input rod files
*/
@Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
@Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
@Override

View File

@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
/**
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
*
* <p>
* Can also count the number of reads matching a given criterion using read filters (see the
* --read-filter command line argument). Simplest example of a read-backed analysis.
*
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of reads seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountReads \
* -o output.txt \
* -I input.bam \
* [-L input.intervals]
* </pre>
*
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountReadsWalker extends ReadWalker<Integer, Integer> {

View File

@ -36,20 +36,61 @@ import java.util.*;
/**
* General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
*
* <p>
* Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of
* raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls
* to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics
* by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid
* degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
* evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
* and stratification modules.
*
* <h2>Input</h2>
* <p>
* One or more variant sets to evaluate plus any number of comparison sets.
* </p>
*
* <h2>Output</h2>
* <p>
* Evaluation tables.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantEval \
* -o output.eval.gatkreport \
* --eval:set1 set1.vcf \
* --eval:set2 set2.vcf \
* [--comp comp.vcf]
* </pre>
*
*/
@Reference(window=@Window(start=-50, stop=50))
public class VariantEvalWalker extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
// Output arguments
@Output
protected PrintStream out;
/**
* The variant file(s) to evaluate.
*/
@Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
public List<RodBinding<VariantContext>> evals;
/**
* The variant file(s) to compare against.
*/
@Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();
private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();
/**
* dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
* Other sets can be specified with the -knownName (--known_names) argument.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@ -67,6 +108,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
protected Set<String> SAMPLE_EXPRESSIONS;
/**
* List of rod tracks to be used for specifying "known" variants other than dbSNP.
*/
@Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected HashSet<String> KNOWN_NAMES = new HashSet<String>();
List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();
@ -81,7 +125,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
protected Set<VariantContext.Type> typesToUse = null;
// Evaluator arguments
/**
* See the -list argument to view available modules.
*/
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@ -95,7 +141,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
protected double MIN_PHASE_QUALITY = 10.0;
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
/**
* This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false)
protected String FAMILY_STRUCTURE;
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)

View File

@ -46,6 +46,31 @@ import java.util.*;
/**
* Left-aligns indels from a variants file.
*
* <p>
* LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be
* placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to
* place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
*
* <h2>Input</h2>
* <p>
* A variant set to left-align.
* </p>
*
* <h2>Output</h2>
* <p>
* A left-aligned VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T LeftAlignVariants \
* --variant input.vcf \
* -o output.vcf
* </pre>
*
*/
@Reference(window=@Window(start=-200,stop=200))
public class LeftAlignVariants extends RodWalker<Integer, Integer> {

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.commandline.*;
@ -34,7 +33,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -48,7 +46,32 @@ import java.util.Set;
/**
* Validates a variants file.
* Strictly validates a variants file.
*
* <p>
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
* Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs
* when a dbSNP file is provided, and that all alternate alleles are present in at least one sample.
*
* <h2>Input</h2>
* <p>
* A variant set to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A filtered VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidateVariants \
* --variant input.vcf \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@Reference(window=@Window(start=0,stop=100))
public class ValidateVariants extends RodWalker<Integer, Integer> {
@ -67,10 +90,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
protected ValidationType type = ValidationType.ALL;
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false)
/**
* By default, even filtered records are validated.
*/
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false)
protected Boolean DO_NOT_VALIDATE_FILTERED = false;
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false)
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false)
protected Boolean WARN_ON_ERROR = false;
private long numErrors = 0;

View File

@ -25,10 +25,8 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -43,21 +41,57 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
* Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
*
* <p>
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
* The tool produces a VCF that is annotated with information pertaining to plate quality control and by
* default is soft-filtered by high no-call rate or low Hardy-Weinberg probability.
* If you have .ped files, please first convert them to VCF format
* (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf).
*
* <h2>Input</h2>
* <p>
* A validation VCF to annotate.
* </p>
*
* <h2>Output</h2>
* <p>
* An annotated VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantValidationAssessor \
* --variant input.vcf \
* -o output.vcf
* </pre>
*
*/
@Reference(window=@Window(start=0,stop=40))
public class VariantValidationAssessor extends RodWalker<VariantContext,Integer> {
@Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true)
public RodBinding<VariantContext> variants;
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfwriter = null;
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false)
protected double maxHardy = 20.0;
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false)
/**
* To disable, set to a value greater than 1.
*/
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false)
protected double maxNoCall = 0.05;
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
/**
* To disable, set to a value greater than 1.
*/
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false)
protected double maxHomNonref = 1.1;
//@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
@ -93,7 +127,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
if ( tracker == null )
return null;
VariantContext vc = tracker.getFirstValue(variants, ref.getLocus());
VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus());
// ignore places where we don't have a variant
if ( vc == null )
return null;
@ -101,7 +135,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
if ( sampleNames == null )
sampleNames = new TreeSet<String>(vc.getSampleNames());
return addVariantInformationToCall(ref, vc);
return addVariantInformationToCall(vc);
}
public Integer reduce(VariantContext call, Integer numVariants) {
@ -113,7 +147,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
}
public void onTraversalDone(Integer finalReduce) {
final List<String> inputNames = Arrays.asList(variants.getName());
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
@ -159,7 +193,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
}
private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) {
private VariantContext addVariantInformationToCall(VariantContext vContext) {
// check possible filters
double hwPvalue = hardyWeinbergCalculation(vContext);

View File

@ -40,95 +40,105 @@ import java.io.PrintStream;
import java.util.*;
/**
* Emits specific fields as dictated by the user from one or more VCF files.
* Emits specific fields from a VCF file to a tab-deliminated table
*
* <p>
* This walker accepts a single VCF file and writes out user-selected fields from the
* VCF as a header-containing, tab-deliminated file. The user specifies one or more
* fields to print with the -F NAME, each of which appears as a single column in
* the output file, with a header named NAME, and the value of this field in the VCF
* one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
* in the INFO field (AC=10). Note that this tool does not support capturing any
* GENOTYPE field values. If a VCF record is missing a value, then the tool by
* default throws an error, but the special value NA can be emitted instead with
* appropriate tool arguments.
*
* </p>
*
* <h2>Input</h2>
* <p>
* <ul>
* <li>A VCF file</li>
* <li>A list of -F fields to write</li>
* </ul>
* </p>
*
* <h2>Output</h2>
* <p>
* A table deliminated file containing the values of the requested fields in the VCF file
* </p>
*
* <h2>Examples</h2>
* <pre>
* -T $WalkerName \
* -V file.vcf \
* -F CHROM -F POS -F ID -F QUAL -F AC \
* -o results.table
*
* would produce a file that looks like:
*
* CHROM POS ID QUAL AC
* 1 10 . 50 1
* 1 20 rs10 99 10
* et cetera...
* </pre>
*
* @author Mark DePristo
* @since 2010
*/
public class VariantsToTable extends RodWalker<Integer, Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Output(doc="File to which results should be written",required=true)
protected PrintStream out;
@Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true)
public ArrayList<String> fieldsToTake = new ArrayList<String>();
/**
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument
* accepts any number of inputs. So -F CHROM -F POS is allowed.
*/
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
public List<String> fieldsToTake = new ArrayList<String>();
@Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records")
/**
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
*/
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
public boolean showFiltered = false;
@Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false)
/**
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
*/
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1;
int nRecords = 0;
/**
* By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then
* VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
* multi-allelic INFO field values can be lists of values.
*/
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
* is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
* found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
*/
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
public boolean ALLOW_MISSING_DATA = false;
public void initialize() {
// print out the header
out.println(Utils.join("\t", fieldsToTake));
}
public static abstract class Getter { public abstract String get(VariantContext vc); }
public static Map<String, Getter> getters = new HashMap<String, Getter>();
static {
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
return x+vc.getReference().getDisplayString();
}
});
getters.put("ALT", new Getter() {
public String get(VariantContext vc) {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}
for ( int i = 0; i < n; i++ ) {
if ( i != 0 ) x.append(",");
x.append(vc.getAlternateAllele(i).getDisplayString());
}
return x.toString();
}
});
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
if ( vc.isSNP() && vc.isBiallelic() )
return VariantContextUtils.isTransition(vc) ? "1" : "0";
else
return "-1";
}});
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
});
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
}});
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
@ -155,6 +165,15 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return s.endsWith("*");
}
/**
* Utility function that returns the list of values for each field in fields from vc.
*
* @param vc the VariantContext whose field values we can to capture
* @param fields a non-null list of fields to capture from VC
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
* provides a value of NA
* @return
*/
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
List<String> vals = new ArrayList<String>();
@ -213,13 +232,75 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return vals;
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer counter, Integer sum) {
return counter + sum;
}
//
// default reduce -- doesn't do anything at all
//
public Integer reduceInit() { return 0; }
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
public void onTraversalDone(Integer sum) {}
// ----------------------------------------------------------------------------------------------------
//
// static system for getting values from VC by name.
//
// ----------------------------------------------------------------------------------------------------
public static abstract class Getter { public abstract String get(VariantContext vc); }
public static Map<String, Getter> getters = new HashMap<String, Getter>();
static {
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
return x+vc.getReference().getDisplayString();
}
});
getters.put("ALT", new Getter() {
public String get(VariantContext vc) {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}
for ( int i = 0; i < n; i++ ) {
if ( i != 0 ) x.append(",");
x.append(vc.getAlternateAllele(i).getDisplayString());
}
return x.toString();
}
});
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
if ( vc.isSNP() && vc.isBiallelic() )
return VariantContextUtils.isTransition(vc) ? "1" : "0";
else
return "-1";
}});
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
});
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
}});
}
}

View File

@ -53,6 +53,30 @@ import java.util.*;
/**
* Converts variants from other file formats to VCF format.
*
* <p>
* Note that there must be a Tribble feature/codec for the file format as well as an adaptor.
*
* <h2>Input</h2>
* <p>
* A variant file to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A VCF file.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantsToVCF \
* -o output.vcf \
* --variant:RawHapMap input.hapmap \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@Reference(window=@Window(start=-40,stop=40))
public class VariantsToVCF extends RodWalker<Integer, Integer> {
@ -61,15 +85,24 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
protected VCFWriter baseWriter = null;
private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move
/**
* Variants from this input file are used by this tool as input.
*/
@Input(fullName="variant", shortName = "V", doc="Input variant file", required=true)
public RodBinding<Feature> variants;
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
/**
* This argument is used for data (like GELI) with genotypes but no sample names encoded within.
*/
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false)
protected String sampleName = null;
/**
* This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF).
*/
@Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false)
protected boolean fixReferenceBase = false;

View File

@ -88,14 +88,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
collect.intervals.add("intervals".toLowerCase());
collect.excludeIntervals = new ArrayList<String>();
collect.numberOfThreads = 1;
// make some rod bindings up
ArrayList<String> fakeBindings = new ArrayList<String>();
fakeBindings.add("Bind1");
fakeBindings.add("Bind2");
fakeBindings.add("Bind3");
collect.RODBindings = fakeBindings;
}