Merge branch 'master' of ssh://chartl@tin.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
eea8087c73
|
|
@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
|
|||
loadArgumentsIntoObject(walker);
|
||||
argumentSources.add(walker);
|
||||
|
||||
Collection<RMDTriplet> newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
|
||||
Collection<RMDTriplet> rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
|
||||
|
||||
// todo: remove me when the old style system is removed
|
||||
if ( getArgumentCollection().RODBindings.size() > 0 ) {
|
||||
logger.warn("################################################################################");
|
||||
logger.warn("################################################################################");
|
||||
logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2.");
|
||||
logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
|
||||
logger.warn("Please use arguments defined by each specific walker instead.");
|
||||
for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
|
||||
logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
|
||||
}
|
||||
logger.warn("################################################################################");
|
||||
logger.warn("################################################################################");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
Collection<RMDTriplet> oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser);
|
||||
oldStyle.addAll(newStyle);
|
||||
engine.setReferenceMetaDataFiles(oldStyle);
|
||||
engine.setReferenceMetaDataFiles(rodBindings);
|
||||
|
||||
for (ReadFilter filter: filters) {
|
||||
loadArgumentsIntoObject(filter);
|
||||
|
|
|
|||
|
|
@ -39,8 +39,7 @@ import org.simpleframework.xml.*;
|
|||
public class DbsnpArgumentCollection {
|
||||
|
||||
/**
|
||||
* A dbSNP VCF file. Variants in this track will be treated as "known" variants
|
||||
* in tools using this track.
|
||||
* A dbSNP VCF file.
|
||||
*/
|
||||
@Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false)
|
||||
public RodBinding<VariantContext> dbsnp;
|
||||
|
|
|
|||
|
|
@ -101,6 +101,8 @@ public class GATKArgumentCollection {
|
|||
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
|
||||
public File referenceFile = null;
|
||||
|
||||
@Deprecated
|
||||
@Hidden
|
||||
@ElementList(required = false)
|
||||
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :<name>,<type> <file>", required = false)
|
||||
public ArrayList<String> RODBindings = new ArrayList<String>();
|
||||
|
|
@ -340,14 +342,6 @@ public class GATKArgumentCollection {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
if (other.RODBindings.size() != RODBindings.size()) {
|
||||
return false;
|
||||
}
|
||||
for (int x = 0; x < RODBindings.size(); x++) {
|
||||
if (!RODBindings.get(x).equals(other.RODBindings.get(x))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!other.samFiles.equals(this.samFiles)) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,26 +40,65 @@ import java.util.TreeSet;
|
|||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
|
||||
/**
|
||||
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
|
||||
* in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
|
||||
* in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter
|
||||
* command line argument.
|
||||
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
|
||||
*
|
||||
* <p>
|
||||
* PrintReads can dynamically merge the contents of multiple input BAM files, resulting
|
||||
* in merged output sorted in coordinate order. Can also optionally filter reads based on the
|
||||
* --read_filter command line argument.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more bam files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A single processed bam file.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T PrintReads \
|
||||
* -o output.bam \
|
||||
* -I input1.bam \
|
||||
* -I input2.bam \
|
||||
* --read_filter MappingQualityZero
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
/** an optional argument to dump the reads out to a BAM file */
|
||||
|
||||
@Output(doc="Write output to this BAM filename instead of STDOUT")
|
||||
SAMFileWriter out;
|
||||
|
||||
@Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
|
||||
String readGroup = null;
|
||||
|
||||
/**
|
||||
* For example, --platform ILLUMINA or --platform 454.
|
||||
*/
|
||||
@Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
|
||||
String platform = null; // E.g. ILLUMINA, 454
|
||||
String platform = null;
|
||||
|
||||
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
|
||||
int nReadsToPrint = -1;
|
||||
|
||||
/**
|
||||
* Only reads from samples listed in the provided file(s) will be included in the output.
|
||||
*/
|
||||
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
|
||||
public Set<File> sampleFile = new TreeSet<File>();
|
||||
|
||||
/**
|
||||
* Only reads from the sample(s) will be included in the output.
|
||||
*/
|
||||
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
|
||||
public Set<String> sampleNames = new TreeSet<String>();
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,34 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Annotates variant calls with context information. Users can specify which of the available annotations to use.
|
||||
* Annotates variant calls with context information.
|
||||
*
|
||||
* <p>
|
||||
* VariantAnnotator is a GATK tool for annotating variant calls based on their context.
|
||||
* The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A variant set to annotate and optionally one or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* An annotated VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantAnnotator \
|
||||
* -I input.bam \
|
||||
* -o output.vcf \
|
||||
* -A DepthOfCoverage \
|
||||
* --variant input.vcf \
|
||||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Requires(value={})
|
||||
@Allows(value={DataSource.READS, DataSource.REFERENCE})
|
||||
|
|
@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
|
||||
|
||||
/**
|
||||
* A dbSNP VCF file from which to annotate.
|
||||
*
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
|
|
@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfWriter = null;
|
||||
|
||||
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
|
||||
protected String sampleName = null;
|
||||
|
||||
/**
|
||||
* See the -list argument to view available annotations.
|
||||
*/
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* See the -list argument to view available groups.
|
||||
*/
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationGroupsToUse = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* This option enables you to add annotations from one VCF to another.
|
||||
*
|
||||
* For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource',
|
||||
* you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position.
|
||||
* If multiple records in the rod overlap the given position, one is chosen arbitrarily.
|
||||
*/
|
||||
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
|
||||
protected List<String> expressionsToUse = new ArrayList<String>();
|
||||
|
||||
|
|
@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
|
||||
protected boolean indelsOnly = false;
|
||||
|
||||
private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
|
||||
|
||||
private VariantAnnotatorEngine engine;
|
||||
|
||||
private Collection<VariantContext> indelBufferContext;
|
||||
|
|
@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
List<String> rodName = Arrays.asList(variantCollection.variants.getName());
|
||||
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
|
||||
|
||||
// add the non-VCF sample from the command-line, if applicable
|
||||
if ( sampleName != null ) {
|
||||
nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
|
||||
samples.add(sampleName.toUpperCase());
|
||||
}
|
||||
|
||||
// if there are no valid samples, warn the user
|
||||
if ( samples.size() == 0 ) {
|
||||
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
|
||||
|
|
|
|||
|
|
@ -38,12 +38,32 @@ import java.util.List;
|
|||
|
||||
/**
|
||||
* Walks along reference and calculates the GC content for each interval.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* GC content calculations per interval.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T GCContentByInterval \
|
||||
* -o output.txt \
|
||||
* -I input.bam \
|
||||
* -L input.intervals
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Allows(value = {DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.REFERENCE})
|
||||
|
||||
@By(DataSource.REFERENCE)
|
||||
|
||||
public class GCContentByIntervalWalker extends LocusWalker<Long, Long> {
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
|
|
|||
|
|
@ -40,18 +40,48 @@ import java.util.List;
|
|||
|
||||
|
||||
/**
|
||||
* Generates an alternative reference sequence over the specified interval. Given variant ROD tracks,
|
||||
* it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally,
|
||||
* allows for a "snpmask" ROD to set overlapping bases to 'N'.
|
||||
* Generates an alternative reference sequence over the specified interval.
|
||||
*
|
||||
* <p>
|
||||
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
|
||||
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The reference, requested intervals, and any number of variant rod files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A fasta file representing the requested intervals.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T FastaAlternateReferenceMaker \
|
||||
* -o output.fasta \
|
||||
* -L input.intervals \
|
||||
* --variant input.vcf \
|
||||
* [--snpmask mask.vcf]
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@WalkerName("FastaAlternateReferenceMaker")
|
||||
@Reference(window=@Window(start=-1,stop=50))
|
||||
@Requires(value={DataSource.REFERENCE})
|
||||
public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
|
||||
|
||||
/**
|
||||
* Variants from these input files are used by this tool to construct an alternate reference.
|
||||
*/
|
||||
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
|
||||
public List<RodBinding<VariantContext>> variants = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Snps from this file are used as a mask when constructing the alternate reference.
|
||||
*/
|
||||
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
|
||||
public RodBinding<VariantContext> snpmask;
|
||||
|
||||
|
|
|
|||
|
|
@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair;
|
|||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional
|
||||
* features to control the output format.
|
||||
* Renders a new reference in FASTA format consisting of only those loci provided in the input data set.
|
||||
*
|
||||
* <p>
|
||||
* The output format can be partially controlled using the provided command-line arguments.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The reference and requested intervals.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A fasta file representing the requested intervals.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T FastaReference \
|
||||
* -o output.fasta \
|
||||
* -L input.intervals
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@WalkerName("FastaReferenceMaker")
|
||||
public class FastaReferenceWalker extends RefWalker<Pair<GenomeLoc, String>, GenomeLoc> {
|
||||
|
||||
@Output PrintStream out;
|
||||
@Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60;
|
||||
@Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false;
|
||||
|
||||
@Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false)
|
||||
public int fastaLineWidth=60;
|
||||
|
||||
/**
|
||||
* Please note that when using this argument adjacent intervals will automatically be merged.
|
||||
*/
|
||||
@Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false)
|
||||
public boolean fastaRawSeqs=false;
|
||||
|
||||
protected FastaSequence fasta;
|
||||
|
||||
|
|
|
|||
|
|
@ -45,6 +45,34 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Filters variant calls using a number of user-selectable, parameterizable criteria.
|
||||
*
|
||||
* <p>
|
||||
* VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria.
|
||||
* Records are hard-filtered by changing the value in the FILTER field to something other than PASS.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A variant set to filter.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A filtered VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantFiltration \
|
||||
* -o output.vcf \
|
||||
* --variant input.vcf \
|
||||
* --filterExpression "AB < 0.2 || MQ0 > 50" \
|
||||
* --filterName "Nov09filters" \
|
||||
* --mask mask.vcf \
|
||||
* --maskName InDel
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
||||
|
|
@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
|
|||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
/**
|
||||
* Any variant which overlaps entries from the provided mask rod will be filtered.
|
||||
*/
|
||||
@Input(fullName="mask", doc="Input ROD mask", required=false)
|
||||
public RodBinding<Feature> mask;
|
||||
|
||||
@Output(doc="File to which variants should be written", required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false)
|
||||
/**
|
||||
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
|
||||
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
|
||||
*/
|
||||
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
|
||||
protected ArrayList<String> FILTER_EXPS = new ArrayList<String>();
|
||||
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
|
||||
|
||||
/**
|
||||
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
|
||||
*/
|
||||
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
|
||||
protected ArrayList<String> FILTER_NAMES = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
|
||||
* VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
|
||||
* One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
|
||||
* methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
|
||||
*/
|
||||
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
|
||||
protected ArrayList<String> GENOTYPE_FILTER_EXPS = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
|
||||
*/
|
||||
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
|
||||
protected ArrayList<String> GENOTYPE_FILTER_NAMES = new ArrayList<String>();
|
||||
|
||||
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false)
|
||||
/**
|
||||
* Works together with the --clusterWindowSize argument.
|
||||
*/
|
||||
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
|
||||
protected Integer clusterSize = 3;
|
||||
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false)
|
||||
|
||||
/**
|
||||
* Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
|
||||
*/
|
||||
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
|
||||
protected Integer clusterWindow = 0;
|
||||
|
||||
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false)
|
||||
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
|
||||
protected Integer MASK_EXTEND = 0;
|
||||
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
|
||||
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
|
||||
protected String MASK_NAME = "Mask";
|
||||
|
||||
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false)
|
||||
/**
|
||||
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
|
||||
* Use this argument to have it evaluate as failing filters instead for these cases.
|
||||
*/
|
||||
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
|
||||
protected Boolean FAIL_MISSING_VALUES = false;
|
||||
|
||||
// JEXL expressions for the filters
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ import java.util.*;
|
|||
* <p>
|
||||
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
||||
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
|
||||
* or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
|
||||
* it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are
|
||||
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel,
|
||||
|
|
|
|||
|
|
@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
||||
|
||||
/**
|
||||
* Left aligns indels in reads.
|
||||
* Left-aligns indels from reads in a bam file.
|
||||
*
|
||||
* <p>
|
||||
* LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be
|
||||
* placed at multiple positions and still represent the same haplotype. While a standard convention is to place an
|
||||
* indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A bam file to left-align.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A left-aligned bam.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx3g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T LeftAlignIndels \
|
||||
* -I input.bam \
|
||||
* -o output.bam
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public class LeftAlignIndels extends ReadWalker<Integer, Integer> {
|
||||
|
||||
@Output(required=false, doc="Output bam")
|
||||
protected StingSAMFileWriter writer = null;
|
||||
|
||||
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
|
||||
"If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false)
|
||||
/**
|
||||
* If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool
|
||||
* may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available
|
||||
* space (by setting java.io.tmpdir on the command-line).
|
||||
*/
|
||||
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false)
|
||||
protected int MAX_RECORDS_IN_RAM = 500000;
|
||||
|
||||
public void initialize() {
|
||||
|
|
|
|||
|
|
@ -23,7 +23,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -49,16 +52,46 @@ import java.util.*;
|
|||
|
||||
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
|
||||
|
||||
|
||||
/**
|
||||
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
|
||||
*
|
||||
* <p>
|
||||
* Performs physical phasing of SNP calls, based on sequencing reads.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* VCF file of SNP calls, BAM file of sequence reads.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Phased VCF file.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java \
|
||||
* -jar GenomeAnalysisTK.jar \
|
||||
* -T ReadBackedPhasing \
|
||||
* -R reference.fasta \
|
||||
* -I reads.bam \
|
||||
* --variant:vcf SNPs.vcf \
|
||||
* -BTI variant \
|
||||
* -BTIMR INTERSECTION \
|
||||
* -o phased_SNPs.vcf \
|
||||
* --phaseQualityThresh 20.0
|
||||
* </pre>
|
||||
*
|
||||
* @author Menachem Fromer
|
||||
* @since July 2010
|
||||
*/
|
||||
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.READS, DataSource.REFERENCE})
|
||||
@By(DataSource.READS)
|
||||
|
||||
@ReadFilters({MappingQualityZeroReadFilter.class})
|
||||
// Filter out all reads with zero mapping quality
|
||||
@ReadFilters({MappingQualityZeroReadFilter.class})
|
||||
|
||||
public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, PhasingStats> {
|
||||
private static final boolean DEBUG = false;
|
||||
|
|
@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
@Output(doc = "File to which variants should be written", required = true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads; [default:20000]", required = false)
|
||||
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false)
|
||||
protected Integer cacheWindow = 20000;
|
||||
|
||||
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm; [default:10]", required = false)
|
||||
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false)
|
||||
protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes
|
||||
|
||||
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing; [default:10.0]", required = false)
|
||||
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false)
|
||||
protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
|
||||
|
||||
@Hidden
|
||||
|
|
@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
protected String variantStatsFilePrefix = null;
|
||||
private PhasingQualityStatsWriter statsWriter = null;
|
||||
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing [default: 17]", required = false)
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false)
|
||||
public int MIN_BASE_QUALITY_SCORE = 17;
|
||||
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing [default: 20]", required = false)
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false)
|
||||
public int MIN_MAPPING_QUALITY_SCORE = 20;
|
||||
|
||||
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
|
||||
|
|
@ -111,10 +144,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent";
|
||||
|
||||
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records [default:false]", required = false)
|
||||
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records", required = false)
|
||||
protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false;
|
||||
|
||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
|
||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false)
|
||||
protected int maxGenomicDistanceForMNP = 1;
|
||||
|
||||
@Hidden
|
||||
|
|
|
|||
|
|
@ -11,7 +11,31 @@ import java.io.PrintStream;
|
|||
|
||||
/**
|
||||
* Walks over the input data set, calculating the total number of covered loci for diagnostic purposes.
|
||||
*
|
||||
* <p>
|
||||
* Simplest example of a locus walker.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of loci traversed.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountLoci \
|
||||
* -o output.txt \
|
||||
* -I input.bam \
|
||||
* [-L input.intervals]
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public class CountLociWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
|
||||
@Output(doc="Write count to this file instead of STDOUT")
|
||||
|
|
|
|||
|
|
@ -39,6 +39,26 @@ import java.util.List;
|
|||
* query name order. Breaks counts down by total pairs and number
|
||||
* of paired reads.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more bam files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of pairs seen.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountPairs \
|
||||
* -o output.txt \
|
||||
* -I input.bam
|
||||
* </pre>
|
||||
*
|
||||
* @author mhanna
|
||||
*/
|
||||
public class CountPairsWalker extends ReadPairWalker<Integer,Long> {
|
||||
|
|
|
|||
|
|
@ -25,7 +25,10 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
|||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Prints out counts of the number of reference ordered data objects are
|
||||
* each locus for debugging RefWalkers.
|
||||
* Prints out counts of the number of reference ordered data objects encountered.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more rod files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of rods seen.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountRODsByRef \
|
||||
* -o output.txt \
|
||||
* --rod input.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
|
||||
public class CountRODsByRefWalker extends RefWalker<CountRODsWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
|
||||
|
||||
/**
|
||||
* One or more input rod files
|
||||
*/
|
||||
@Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
|
||||
public List<RodBinding<Feature>> rods = Collections.emptyList();
|
||||
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
|
||||
public boolean verbose = false;
|
||||
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
|
||||
public boolean showSkipped = false;
|
||||
|
||||
CountRodWalker crw = new CountRodWalker();
|
||||
CountRODsWalker crw = new CountRODsWalker();
|
||||
|
||||
public void initialize() {
|
||||
crw.verbose = verbose;
|
||||
crw.showSkipped = showSkipped;
|
||||
}
|
||||
|
||||
public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
return crw.map(tracker, ref, context);
|
||||
}
|
||||
|
||||
|
|
@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<Ex
|
|||
return crw.reduceInit();
|
||||
}
|
||||
|
||||
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRodWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
|
||||
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRODsWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
|
||||
return crw.reduce(point, sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc;
|
|||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
|||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Prints out counts of the number of reference ordered data objects are
|
||||
* each locus for debugging RodWalkers.
|
||||
* Prints out counts of the number of reference ordered data objects encountered.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more rod files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of rods seen.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountRODs \
|
||||
* -o output.txt \
|
||||
* --rod input.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
|
||||
public class CountRODsWalker extends RodWalker<CountRODsWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
|
||||
/**
|
||||
* One or more input rod files
|
||||
*/
|
||||
@Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
|
||||
public List<RodBinding<Feature>> rods = Collections.emptyList();
|
||||
|
||||
@Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
|
||||
public boolean verbose = false;
|
||||
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
|
||||
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
|
||||
public boolean showSkipped = false;
|
||||
|
||||
@Override
|
||||
|
|
@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
|
|||
|
||||
/**
|
||||
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
|
||||
*
|
||||
* <p>
|
||||
* Can also count the number of reads matching a given criterion using read filters (see the
|
||||
* --read-filter command line argument). Simplest example of a read-backed analysis.
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of reads seen.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountReads \
|
||||
* -o output.txt \
|
||||
* -I input.bam \
|
||||
* [-L input.intervals]
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CountReadsWalker extends ReadWalker<Integer, Integer> {
|
||||
|
|
|
|||
|
|
@ -36,20 +36,61 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
|
||||
*
|
||||
* <p>
|
||||
* Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of
|
||||
* raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls
|
||||
* to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics
|
||||
* by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid
|
||||
* degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
|
||||
* evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
|
||||
* and stratification modules.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more variant sets to evaluate plus any number of comparison sets.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Evaluation tables.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantEval \
|
||||
* -o output.eval.gatkreport \
|
||||
* --eval:set1 set1.vcf \
|
||||
* --eval:set2 set2.vcf \
|
||||
* [--comp comp.vcf]
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=-50, stop=50))
|
||||
public class VariantEvalWalker extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
|
||||
// Output arguments
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
/**
|
||||
* The variant file(s) to evaluate.
|
||||
*/
|
||||
@Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
|
||||
public List<RodBinding<VariantContext>> evals;
|
||||
|
||||
/**
|
||||
* The variant file(s) to compare against.
|
||||
*/
|
||||
@Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
|
||||
public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();
|
||||
private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();
|
||||
|
||||
/**
|
||||
* dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
|
||||
* Other sets can be specified with the -knownName (--known_names) argument.
|
||||
*/
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
|
||||
|
|
@ -67,6 +108,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
@Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
|
||||
protected Set<String> SAMPLE_EXPRESSIONS;
|
||||
|
||||
/**
|
||||
* List of rod tracks to be used for specifying "known" variants other than dbSNP.
|
||||
*/
|
||||
@Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
|
||||
protected HashSet<String> KNOWN_NAMES = new HashSet<String>();
|
||||
List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();
|
||||
|
|
@ -81,7 +125,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
|
||||
protected Set<VariantContext.Type> typesToUse = null;
|
||||
|
||||
// Evaluator arguments
|
||||
/**
|
||||
* See the -list argument to view available modules.
|
||||
*/
|
||||
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
|
||||
protected String[] MODULES_TO_USE = {};
|
||||
|
||||
|
|
@ -95,7 +141,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
|
||||
protected double MIN_PHASE_QUALITY = 10.0;
|
||||
|
||||
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
/**
|
||||
* This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined.
|
||||
*/
|
||||
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false)
|
||||
protected String FAMILY_STRUCTURE;
|
||||
|
||||
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
|
|
|
|||
|
|
@ -46,6 +46,31 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Left-aligns indels from a variants file.
|
||||
*
|
||||
* <p>
|
||||
* LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be
|
||||
* placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to
|
||||
* place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A variant set to left-align.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A left-aligned VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T LeftAlignVariants \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
public class LeftAlignVariants extends RodWalker<Integer, Integer> {
|
||||
|
|
|
|||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
|
|
@ -34,7 +33,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
|
@ -48,7 +46,32 @@ import java.util.Set;
|
|||
|
||||
|
||||
/**
|
||||
* Validates a variants file.
|
||||
* Strictly validates a variants file.
|
||||
*
|
||||
* <p>
|
||||
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
|
||||
* Checks include the correctness of the reference base(s), accuracy of AC &amp; AN values, tests against rsIDs
|
||||
* when a dbSNP file is provided, and that all alternate alleles are present in at least one sample.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A variant set to validate.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A filtered VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ValidateVariants \
|
||||
* --variant input.vcf \
|
||||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=0,stop=100))
|
||||
public class ValidateVariants extends RodWalker<Integer, Integer> {
|
||||
|
|
@ -67,10 +90,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
|
||||
protected ValidationType type = ValidationType.ALL;
|
||||
|
||||
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false)
|
||||
/**
|
||||
* By default, even filtered records are validated.
|
||||
*/
|
||||
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false)
|
||||
protected Boolean DO_NOT_VALIDATE_FILTERED = false;
|
||||
|
||||
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false)
|
||||
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false)
|
||||
protected Boolean WARN_ON_ERROR = false;
|
||||
|
||||
private long numErrors = 0;
|
||||
|
|
|
|||
|
|
@ -25,10 +25,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -43,21 +41,57 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
|
||||
* Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
|
||||
*
|
||||
* <p>
|
||||
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
|
||||
* The tool produces a VCF that is annotated with information pertaining to plate quality control and by
|
||||
* default is soft-filtered by high no-call rate or low Hardy-Weinberg probability.
|
||||
* If you have .ped files, please first convert them to VCF format
|
||||
* (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A validation VCF to annotate.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* An annotated VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantValidationAssessor \
|
||||
* --variant input.vcf \
|
||||
* -o output.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=0,stop=40))
|
||||
public class VariantValidationAssessor extends RodWalker<VariantContext,Integer> {
|
||||
@Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true)
|
||||
public RodBinding<VariantContext> variants;
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfwriter = null;
|
||||
|
||||
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
|
||||
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false)
|
||||
protected double maxHardy = 20.0;
|
||||
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false)
|
||||
|
||||
/**
|
||||
* To disable, set to a value greater than 1.
|
||||
*/
|
||||
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false)
|
||||
protected double maxNoCall = 0.05;
|
||||
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
|
||||
|
||||
/**
|
||||
* To disable, set to a value greater than 1.
|
||||
*/
|
||||
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false)
|
||||
protected double maxHomNonref = 1.1;
|
||||
|
||||
//@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
|
||||
|
|
@ -93,7 +127,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
|
|||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variants, ref.getLocus());
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus());
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
|
@ -101,7 +135,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
|
|||
if ( sampleNames == null )
|
||||
sampleNames = new TreeSet<String>(vc.getSampleNames());
|
||||
|
||||
return addVariantInformationToCall(ref, vc);
|
||||
return addVariantInformationToCall(vc);
|
||||
}
|
||||
|
||||
public Integer reduce(VariantContext call, Integer numVariants) {
|
||||
|
|
@ -113,7 +147,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer finalReduce) {
|
||||
final List<String> inputNames = Arrays.asList(variants.getName());
|
||||
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
|
||||
|
||||
// setup the header fields
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
|
|
@ -159,7 +193,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
|
|||
}
|
||||
|
||||
|
||||
private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) {
|
||||
private VariantContext addVariantInformationToCall(VariantContext vContext) {
|
||||
|
||||
// check possible filters
|
||||
double hwPvalue = hardyWeinbergCalculation(vContext);
|
||||
|
|
|
|||
|
|
@ -40,95 +40,105 @@ import java.io.PrintStream;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Emits specific fields as dictated by the user from one or more VCF files.
|
||||
* Emits specific fields from a VCF file to a tab-delimited table
|
||||
*
|
||||
* <p>
|
||||
* This walker accepts a single VCF file and writes out user-selected fields from the
|
||||
* VCF as a header-containing, tab-delimited file. The user specifies one or more
|
||||
* fields to print with the -F NAME, each of which appears as a single column in
|
||||
* the output file, with a header named NAME, and the value of this field in the VCF
|
||||
* one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
|
||||
* in the INFO field (AC=10). Note that this tool does not support capturing any
|
||||
* GENOTYPE field values. If a VCF record is missing a value, then the tool by
|
||||
* default throws an error, but the special value NA can be emitted instead with
|
||||
* appropriate tool arguments.
|
||||
*
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>A VCF file</li>
|
||||
* <li>A list of -F fields to write</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A tab-delimited file containing the values of the requested fields in the VCF file
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* -T $WalkerName \
|
||||
* -V file.vcf \
|
||||
* -F CHROM -F POS -F ID -F QUAL -F AC \
|
||||
* -o results.table
|
||||
*
|
||||
* would produce a file that looks like:
|
||||
*
|
||||
* CHROM POS ID QUAL AC
|
||||
* 1 10 . 50 1
|
||||
* 1 20 rs10 99 10
|
||||
* et cetera...
|
||||
* </pre>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 2010
|
||||
*/
|
||||
public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true)
|
||||
public ArrayList<String> fieldsToTake = new ArrayList<String>();
|
||||
/**
|
||||
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
|
||||
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument
|
||||
* accepts any number of inputs. So -F CHROM -F POS is allowed.
|
||||
*/
|
||||
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
|
||||
public List<String> fieldsToTake = new ArrayList<String>();
|
||||
|
||||
@Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records")
|
||||
/**
|
||||
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
||||
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
|
||||
*/
|
||||
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
|
||||
public boolean showFiltered = false;
|
||||
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false)
|
||||
/**
|
||||
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
|
||||
*/
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
|
||||
public int MAX_RECORDS = -1;
|
||||
int nRecords = 0;
|
||||
|
||||
/**
|
||||
* By default, only biallelic (REF=A, ALT=B) sites are included in the output. If this flag is provided, then
|
||||
* VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
|
||||
* can make your resulting file unreadable and malformed according to tools like R, as the representation of
|
||||
* multi-allelic INFO field values can be lists of values.
|
||||
*/
|
||||
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
|
||||
public boolean keepMultiAllelic = false;
|
||||
|
||||
/**
|
||||
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
|
||||
* is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
|
||||
* found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
|
||||
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
|
||||
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
|
||||
*/
|
||||
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
|
||||
public boolean ALLOW_MISSING_DATA = false;
|
||||
|
||||
public void initialize() {
|
||||
// print out the header
|
||||
out.println(Utils.join("\t", fieldsToTake));
|
||||
}
|
||||
|
||||
public static abstract class Getter { public abstract String get(VariantContext vc); }
|
||||
public static Map<String, Getter> getters = new HashMap<String, Getter>();
|
||||
|
||||
static {
|
||||
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
||||
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
|
||||
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
||||
getters.put("REF", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
String x = "";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x=x+new String(new byte[]{refByte});
|
||||
}
|
||||
return x+vc.getReference().getDisplayString();
|
||||
}
|
||||
});
|
||||
getters.put("ALT", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
StringBuilder x = new StringBuilder();
|
||||
int n = vc.getAlternateAlleles().size();
|
||||
if ( n == 0 ) return ".";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x.append(new String(new byte[]{refByte}));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( i != 0 ) x.append(",");
|
||||
x.append(vc.getAlternateAllele(i).getDisplayString());
|
||||
}
|
||||
return x.toString();
|
||||
}
|
||||
});
|
||||
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
|
||||
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.isSNP() && vc.isBiallelic() )
|
||||
return VariantContextUtils.isTransition(vc) ? "1" : "0";
|
||||
else
|
||||
return "-1";
|
||||
}});
|
||||
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
|
||||
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
|
||||
});
|
||||
|
||||
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
|
||||
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
|
||||
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
|
||||
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
|
||||
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
|
||||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
|
||||
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
|
||||
}});
|
||||
}
|
||||
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
|
@ -155,6 +165,15 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return s.endsWith("*");
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility function that returns the list of values for each field in fields from vc.
|
||||
*
|
||||
* @param vc the VariantContext whose field values we want to capture
|
||||
* @param fields a non-null list of fields to capture from VC
|
||||
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
|
||||
* provides a value of NA
|
||||
* @return the list of values extracted from vc for the requested fields
|
||||
*/
|
||||
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
List<String> vals = new ArrayList<String>();
|
||||
|
||||
|
|
@ -213,13 +232,75 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return vals;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
||||
//
|
||||
// default reduce -- doesn't do anything at all
|
||||
//
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
|
||||
public void onTraversalDone(Integer sum) {}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// static system for getting values from VC by name.
|
||||
//
|
||||
// ----------------------------------------------------------------------------------------------------
|
||||
|
||||
public static abstract class Getter { public abstract String get(VariantContext vc); }
|
||||
public static Map<String, Getter> getters = new HashMap<String, Getter>();
|
||||
|
||||
static {
|
||||
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
||||
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
|
||||
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
||||
getters.put("REF", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
String x = "";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x=x+new String(new byte[]{refByte});
|
||||
}
|
||||
return x+vc.getReference().getDisplayString();
|
||||
}
|
||||
});
|
||||
getters.put("ALT", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
StringBuilder x = new StringBuilder();
|
||||
int n = vc.getAlternateAlleles().size();
|
||||
if ( n == 0 ) return ".";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x.append(new String(new byte[]{refByte}));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( i != 0 ) x.append(",");
|
||||
x.append(vc.getAlternateAllele(i).getDisplayString());
|
||||
}
|
||||
return x.toString();
|
||||
}
|
||||
});
|
||||
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
|
||||
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.isSNP() && vc.isBiallelic() )
|
||||
return VariantContextUtils.isTransition(vc) ? "1" : "0";
|
||||
else
|
||||
return "-1";
|
||||
}});
|
||||
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
|
||||
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
|
||||
});
|
||||
|
||||
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
|
||||
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
|
||||
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
|
||||
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
|
||||
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
|
||||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
|
||||
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
|
||||
}});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,6 +53,30 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Converts variants from other file formats to VCF format.
|
||||
*
|
||||
* <p>
|
||||
* Note that there must be a Tribble feature/codec for the file format as well as an adaptor.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A variant file to convert.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A VCF file.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T VariantsToVCF \
|
||||
* -o output.vcf \
|
||||
* --variant:RawHapMap input.hapmap \
|
||||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=-40,stop=40))
|
||||
public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||
|
|
@ -61,15 +85,24 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
|||
protected VCFWriter baseWriter = null;
|
||||
private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move
|
||||
|
||||
/**
|
||||
* Variants from this input file are used by this tool as input.
|
||||
*/
|
||||
@Input(fullName="variant", shortName = "V", doc="Input variant file", required=true)
|
||||
public RodBinding<Feature> variants;
|
||||
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
|
||||
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
|
||||
/**
|
||||
* This argument is used for data (like GELI) with genotypes but no sample names encoded within.
|
||||
*/
|
||||
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false)
|
||||
protected String sampleName = null;
|
||||
|
||||
/**
|
||||
* This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF).
|
||||
*/
|
||||
@Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false)
|
||||
protected boolean fixReferenceBase = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -88,14 +88,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
|
|||
collect.intervals.add("intervals".toLowerCase());
|
||||
collect.excludeIntervals = new ArrayList<String>();
|
||||
collect.numberOfThreads = 1;
|
||||
|
||||
// make some rod bindings up
|
||||
ArrayList<String> fakeBindings = new ArrayList<String>();
|
||||
fakeBindings.add("Bind1");
|
||||
fakeBindings.add("Bind2");
|
||||
fakeBindings.add("Bind3");
|
||||
|
||||
collect.RODBindings = fakeBindings;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue