From 79dcfca25fdedd3aea29e36a9e8a2736c601d717 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 11:56:51 -0400 Subject: [PATCH 2/7] Fixed bad character in documentation --- .../sting/gatk/walkers/indels/IndelRealigner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 029b6deaf..d766ae8bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -69,7 +69,7 @@ import java.util.*; *

* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion - * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, From 78deb3f19595816d08fbde35db5f62189727048e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 11:57:00 -0400 Subject: [PATCH 3/7] Fixed bad character in documentation --- .../walkers/variantutils/VariantsToTable.java | 226 ++++++++++++------ 1 file changed, 153 insertions(+), 73 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index af3593ce4..51515b2d3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -40,29 +40,97 @@ import java.io.PrintStream; import java.util.*; /** - * Emits specific fields as dictated by the user from one or more VCF files. + * Emits specific fields from a VCF file to a table-deliminated format + * + *

+ * This walker accepts a single VCF file and writes out user-selected fields from the + * VCF as a header-containing, tab-deliminated file. The user specifies one or more + * fields to print with the -F NAME, each of which appears as a single column in + * the output file, with a header named NAME, and the value of this field in the VCF + * one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding + * in the INFO field (AC=10). Note that this tool does not support capturing any + * GENOTYPE field values. If a VCF record is missing a value, then the tool by + * default throws an error, but the special value NA can be emitted instead with + * appropriate tool arguments. + * + *

+ * + *

Input

+ *

+ *

+ *

+ * + *

Output

+ *

+ * A table deliminated file containing the values of the requested fields in the VCF file + *

+ * + *

Examples

+ *
+ *     -T $WalkerName \
+ *     -V file.vcf \
+ *     -F CHROM -F POS -F ID -F QUAL -F AC \
+ *     -o results.table
+ *
+ *     would produce a file that looks like:
+ *
+ *     CHROM    POS ID      QUAL    AC
+ *     1        10  .       50      1
+ *     1        20  rs10    99      10
+ *     et cetera...
+ * 
+ * + * @author Mark DePristo + * @since 2010 */ public class VariantsToTable extends RodWalker { - @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true) - public ArrayList fieldsToTake = new ArrayList(); + /** + * -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10). + * Note that this tool does not support capturing any GENOTYPE field values. Note this argument + * accepts any number of inputs. So -F CHROM -F POS is allowed. + */ + @Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true) + public List fieldsToTake = new ArrayList(); - @Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records") + /** + * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered). + * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value. + */ + @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false) public boolean showFiltered = false; - @Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false) + /** + * If provided, then this tool will exit with success after this number of records have been emitted to the file. + */ + @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) public int MAX_RECORDS = -1; int nRecords = 0; + /** + * By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then + * VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this + * can make your resulting file unreadable and malformated according to tools like R, as the representation of + * multi-allelic INFO field values can be lists of values. + */ @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) public boolean keepMultiAllelic = false; + /** + * By default, this tool throws a UserException when it encounters a field without a value in some record. This + * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being + * found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such + * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument + * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. + */ @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; @@ -70,65 +138,6 @@ public class VariantsToTable extends RodWalker { out.println(Utils.join("\t", fieldsToTake)); } - public static abstract class Getter { public abstract String get(VariantContext vc); } - public static Map getters = new HashMap(); - - static { - // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT - getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); - getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); - getters.put("REF", new Getter() { - public String get(VariantContext vc) { - String x = ""; - if ( vc.hasReferenceBaseForIndel() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x=x+new String(new byte[]{refByte}); - } - return x+vc.getReference().getDisplayString(); - } - }); - getters.put("ALT", new Getter() { - public String get(VariantContext vc) { - StringBuilder x = new StringBuilder(); - int n = vc.getAlternateAlleles().size(); - if ( n == 0 ) return "."; - if ( vc.hasReferenceBaseForIndel() ) { - Byte refByte = vc.getReferenceBaseForIndel(); - x.append(new String(new byte[]{refByte})); - } - - for ( int i = 0; i < n; i++ ) { - if ( i != 0 ) x.append(","); - x.append(vc.getAlternateAllele(i).getDisplayString()); - } - return x.toString(); - } - }); - getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); - getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { - if ( vc.isSNP() && vc.isBiallelic() ) - return VariantContextUtils.isTransition(vc) ? "1" : "0"; - else - return "-1"; - }}); - getters.put("FILTER", new Getter() { public String get(VariantContext vc) { - return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } - }); - - getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); - getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); - getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); - getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); - getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); - getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); - getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); - getters.put("GQ", new Getter() { public String get(VariantContext vc) { - if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); - }}); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) // RodWalkers can make funky map calls return 0; @@ -155,6 +164,15 @@ public class VariantsToTable extends RodWalker { return s.endsWith("*"); } + /** + * Utility function that returns the list of values for each field in fields from vc. + * + * @param vc the VariantContext whose field values we can to capture + * @param fields a non-null list of fields to capture from VC + * @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise + * provides a value of NA + * @return + */ public static List extractFields(VariantContext vc, List fields, boolean allowMissingData) { List vals = new ArrayList(); @@ -213,13 +231,75 @@ public class VariantsToTable extends RodWalker { return vals; } - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - + // + // default reduce -- doesn't do anything at all + // + public Integer reduceInit() { return 0; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } public void onTraversalDone(Integer sum) {} + + // ---------------------------------------------------------------------------------------------------- + // + // static system for getting values from VC by name. + // + // ---------------------------------------------------------------------------------------------------- + + public static abstract class Getter { public abstract String get(VariantContext vc); } + public static Map getters = new HashMap(); + + static { + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); + getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); + getters.put("REF", new Getter() { + public String get(VariantContext vc) { + String x = ""; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x=x+new String(new byte[]{refByte}); + } + return x+vc.getReference().getDisplayString(); + } + }); + getters.put("ALT", new Getter() { + public String get(VariantContext vc) { + StringBuilder x = new StringBuilder(); + int n = vc.getAlternateAlleles().size(); + if ( n == 0 ) return "."; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x.append(new String(new byte[]{refByte})); + } + + for ( int i = 0; i < n; i++ ) { + if ( i != 0 ) x.append(","); + x.append(vc.getAlternateAllele(i).getDisplayString()); + } + return x.toString(); + } + }); + getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); + getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { + if ( vc.isSNP() && vc.isBiallelic() ) + return VariantContextUtils.isTransition(vc) ? "1" : "0"; + else + return "-1"; + }}); + getters.put("FILTER", new Getter() { public String get(VariantContext vc) { + return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); } + }); + + getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } }); + getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } }); + getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } }); + getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } }); + getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } }); + getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); + getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); + getters.put("GQ", new Getter() { public String get(VariantContext vc) { + if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); + return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError()); + }}); + } + } From 9d1d5bd27a5747a8dfd430ec042dee76e7fce835 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 11:57:31 -0400 Subject: [PATCH 4/7] Revert "Fixed bad character in documentation" This reverts commit a1f50c82d3cb25e5e83d36e9054d74cdee957d87. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d766ae8bd..029b6deaf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -69,7 +69,7 @@ import java.util.*; *

* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion - * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, From 5f794d16a7df2aab7e761adbe9d75e3dbdfc1dae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 12:00:32 -0400 Subject: [PATCH 5/7] Fixed bad character in documentation --- .../sting/gatk/walkers/indels/IndelRealigner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 029b6deaf..d766ae8bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -69,7 +69,7 @@ import java.util.*; *

* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion - * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, From c6fb215faf42d04ba86341b0351613d7ef717ad3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 12:02:41 -0400 Subject: [PATCH 6/7] GATKDocs for VariantsToTable -- Made a previously required argument optional, as this was a long-standing bug --- .../sting/gatk/walkers/variantutils/VariantsToTable.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 51515b2d3..a37cfa6ba 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -135,6 +135,7 @@ public class VariantsToTable extends RodWalker { public boolean ALLOW_MISSING_DATA = false; public void initialize() { + // print out the header out.println(Utils.join("\t", fieldsToTake)); } From 3da71a9bb6ed5384641ea98e36fd3affca87ac8a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Aug 2011 12:04:45 -0400 Subject: [PATCH 7/7] Clean up summary --- .../sting/gatk/walkers/variantutils/VariantsToTable.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index a37cfa6ba..b3fd540bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -40,7 +40,7 @@ import java.io.PrintStream; import java.util.*; /** - * Emits specific fields from a VCF file to a table-deliminated format + * Emits specific fields from a VCF file to a tab-deliminated table * *

* This walker accepts a single VCF file and writes out user-selected fields from the