From 5afcc8e05f3e24938bef05dbc995285f69602d5d Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Mon, 7 Apr 2014 12:22:19 -0400 Subject: [PATCH] Change in the command line interface of ValidateVariants. Following reviewers comments the command line interface has been simplified. All extra strict validations are performed by default (as before) and the user has to indicate which one he/she does not want to use with --validationTypeToExclude. Before he/she was able to indicate the only ones to apply with --validationType but that has been scrapped out. Stories: - https://www.pivotaltracker.com/story/show/68725164 Changes: - Removed validateType argument. - Improved documentation. - Added some warnning log message on suspicious argument combinations. Tests: - ValidateVariantsIntegrationTest#* --- .../ValidateVariantsIntegrationTest.java | 20 ++- .../variantutils/ValidateVariants.java | 136 ++++++++++-------- 2 files changed, 93 insertions(+), 63 deletions(-) diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java index 0e12ae937..13ba2f58a 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -63,10 +63,21 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { } public static String baseTestString(String file, String type, String region, String reference) { - final String typeArgString = type.startsWith("-") ? " --validationTypeToExclude " + type.substring(1) : " --validationType " + type; + final String typeArgString = type.startsWith("-") ? " --validationTypeToExclude " + type.substring(1) : excludeValidationTypesButString(type); return "-T ValidateVariants -R " + reference + " -L " + region + " --variant:vcf " + privateTestDir + file + typeArgString; } + private static String excludeValidationTypesButString(String type) { + if (type == "ALL") + return ""; + final ValidateVariants.ValidationType vtype = ValidateVariants.ValidationType.valueOf(type); + final StringBuilder sbuilder = new StringBuilder(); + for (final ValidateVariants.ValidationType t : ValidateVariants.ValidationType.CONCRETE_TYPES) + if (t != vtype) + sbuilder.append(" --validationTypeToExclude " + t.toString()); + return sbuilder.toString(); + } + @Test public void testGoodFile() { WalkerTestSpec spec = new WalkerTestSpec( @@ -124,12 +135,11 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test public void testBadID() { - WalkerTestSpec spec = new WalkerTestSpec( + final WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129, 0, UserException.FailsStrictValidation.class ); - executeTest("test bad RS ID", spec); } @@ -158,7 +168,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test public void testNoValidation() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString("validationExampleBad.vcf", "NONE"), + baseTestString("validationExampleBad.vcf", "-ALL"), 0, Arrays.asList(emptyMd5) ); @@ -187,7 +197,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test(description = "Checks '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164") public void testUnusedAlleleError() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString("validationUnusedAllelesBugFix.vcf","ALL","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class); + baseTestString("validationUnusedAllelesBugFix.vcf","ALLELES","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class); executeTest("test unused allele bug fix", spec); } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index 59e9c8723..a9a4cfd53 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -53,43 +53,37 @@ import java.util.*; * *

* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. - * In addition to standard adherence to the VCF specification, this tool performs extra checks to make ensure - * the information contained within the file is correct. Checks include: - * p> - *

+ * In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure + * the information contained within the file is correct. These include: + *

+ *

+ *
REF
the correctness of the reference base(s).
+ *
CHR_COUNTS
accuracy of AC & AN values.
+ *
IDS
tests against rsIDs when a dbSNP file is provided. Notice that for this one to work, you need + * to provide a reference to the dbsnp variant containing file using the --dbsnp as show in examples below.
+ *
ALLELES
and that all alternate alleles are present in at least one sample.
+ *
* *

* *

- * By default it will apply all the strict validations unless you indicate which one you want to perform - * using --validationType as indicated above. - * You can request this explicitly using --validationType ALL but this is not necessary. - *

- * + * By default it will apply all the strict validations unless you indicate which one you want you want to exclude + * using -Xtype|--validationTypeToExclude <code>, where code is one of the listed above. You + * can exclude as many types as you want *

- * If you are looking simply to test the adherence to the VCF specification, use --validationType NONE. - *

- * - *

- * If you want to perform all strict validations but a few you can list the ones to exclude adding - * as many '--validationTypeToExclude TYPE_ID' to the command line as needed. - * This is only possible when the validation type is the default 'ALL'. - * An attempt to combine --validationTypeToExclude with - * any other validation type will result in an error. + * Yo can exclude all strict validations with the special code ALL. In this case the tool will only + * test the adherence to the VCF specification. *

* *

Input

*

- * A variant set to validate. + * A variant set to validate using -V or --variant as shown below. *

* *

Examples

+ * + *

To perform VCF format and all strict validations:

+ * *
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
@@ -98,6 +92,27 @@ import java.util.*;
  *   --dbsnp dbsnp.vcf
  * 
* + *

To perform only VCF format tests:

+ * + *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidateVariants \
+ *   --validationTypeToExclude ALL \
+ *   --variant input.vcf
+ * 
+ * + *

To perform all validations except the strict ALLELE validation:

+ * + *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidateVariants \
+ *   --validationTypeToExclude ALLELES
+ *   --variant input.vcf \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) @@ -110,8 +125,9 @@ public class ValidateVariants extends RodWalker { protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public enum ValidationType { + /** - * Perform all extra-strict tests listed bellow. + * Makes reference to all extra-strict tests listed below. */ ALL, @@ -135,18 +151,23 @@ public class ValidateVariants extends RodWalker { * Check that the AN and AC annotations are consistent with the number of calls, alleles and then number these * are called across samples. */ - CHR_COUNTS, + CHR_COUNTS; /** - * Do not perform any extra-strict VCF content validation. Notice however that mis-formatted VCF would be reported - * as an error (e.g. garbled VCF file, mismatch between format fields and genotype fields number, wrong data types, - * or reference to non-existing alternative alleles. + * Unmodifiable set of concrete validation types. + * + *

These are all types except {@link #ALL}.

*/ - NONE - } + public final static Set CONCRETE_TYPES; - @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) - protected ValidationType type = ValidationType.ALL; + static { + final Set cts = new LinkedHashSet<>(values().length - 1); + for (final ValidationType v : values()) + if (v != ALL) + cts.add(v); + CONCRETE_TYPES = Collections.unmodifiableSet(cts); + } + } @Argument(fullName = "validationTypeToExclude", shortName = "Xtype", doc = "which validation type to exclude from a full strict validation", required = false) protected List excludeTypes = new ArrayList<>(); @@ -164,8 +185,14 @@ public class ValidateVariants extends RodWalker { private File file = null; + /** + * Contains final set of validation to apply. + */ + private Collection validationTypes; + public void initialize() { file = new File(variantCollection.variants.getSource()); + validationTypes = calculateValidationTypesToApply(excludeTypes); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -215,7 +242,7 @@ public class ValidateVariants extends RodWalker { } try { - for (final ValidationType t : calculateValidationTypesToApply(type,excludeTypes)) + for (final ValidationType t : validationTypes) applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t); } catch (TribbleException e) { if ( WARN_ON_ERROR ) { @@ -229,34 +256,27 @@ public class ValidateVariants extends RodWalker { /** * Given the validation type and exclusion type, calculate the final set of type to validate. - * @param type the validation type. - * @param excludeType types to exclude. + * @param excludeTypes types to exclude. * * @throws UserException.BadArgumentValue if the user combines any validation type except 'ALL' and some exclude types. * * @return never {@code null} but perhaps an empty set. */ - private Collection calculateValidationTypesToApply(final ValidationType type, final List excludeType) { - final Set excludeTypeSet = new LinkedHashSet<>(excludeType); - if (type == ValidationType.ALL) { - if (excludeTypeSet.size() == 0) - return Collections.singleton(ValidationType.ALL); - else { - if (excludeTypeSet.remove(ValidationType.NONE)) - logger.warn("found NONE in the --validationTypeToExclude list; does not have any effect"); - if (excludeTypeSet.contains(ValidationType.ALL)) - logger.warn("found ALL in the --validationTypeToExclude list. Perhaps you want to simply use --validationType NONE instead?"); - - final Set result = new LinkedHashSet<>(Arrays.asList(ValidationType.values())); - result.remove(ValidationType.ALL); - result.remove(ValidationType.NONE); - result.removeAll(excludeTypeSet); - return result; - } - } else if (excludeTypeSet.size() != 0) - throw new UserException.BadArgumentValue("--validationTypeToExclude","you can use --validationTypeToExclude ONLY in combination with the default --validationType ALL"); - else - return Collections.singleton(type); + private Collection calculateValidationTypesToApply(final List excludeTypes) { + if (excludeTypes.size() == 0) + return Collections.singleton(ValidationType.ALL); + final Set excludeTypeSet = new LinkedHashSet<>(excludeTypes); + if (excludeTypes.size() != excludeTypeSet.size()) + logger.warn("found repeat redundant validation types listed using the --validationTypeToExclude argument"); + if (excludeTypeSet.contains(ValidationType.ALL)) { + if (excludeTypeSet.size() > 1) + logger.warn("found ALL in the --validationTypeToExclude list together with other concrete type exclusions that are redundant"); + return Collections.emptyList(); + } else { + final Set result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES); + result.removeAll(excludeTypeSet); + return result; + } } private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set rsIDs, ValidationType t) {