diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java index ebdd3a17a..13ba2f58a 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -55,9 +55,27 @@ import java.util.Arrays; public class ValidateVariantsIntegrationTest extends WalkerTest { protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e"; + protected static final String defaultRegion = "1:10001292-10001303"; - public static String baseTestString(String file, String type) { - return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + privateTestDir + file + " --validationType " + type; + + public static String baseTestString(final String file, String type) { + return baseTestString(file,type,defaultRegion,b36KGReference); + } + + public static String baseTestString(String file, String type, String region, String reference) { + final String typeArgString = type.startsWith("-") ? 
" --validationTypeToExclude " + type.substring(1) : excludeValidationTypesButString(type); + return "-T ValidateVariants -R " + reference + " -L " + region + " --variant:vcf " + privateTestDir + file + typeArgString; + } + + private static String excludeValidationTypesButString(String type) { + if (type == "ALL") + return ""; + final ValidateVariants.ValidationType vtype = ValidateVariants.ValidationType.valueOf(type); + final StringBuilder sbuilder = new StringBuilder(); + for (final ValidateVariants.ValidationType t : ValidateVariants.ValidationType.CONCRETE_TYPES) + if (t != vtype) + sbuilder.append(" --validationTypeToExclude " + t.toString()); + return sbuilder.toString(); } @Test @@ -117,12 +135,11 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test public void testBadID() { - WalkerTestSpec spec = new WalkerTestSpec( + final WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129, 0, UserException.FailsStrictValidation.class ); - executeTest("test bad RS ID", spec); } @@ -151,7 +168,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { @Test public void testNoValidation() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString("validationExampleBad.vcf", "NONE"), + baseTestString("validationExampleBad.vcf", "-ALL"), 0, Arrays.asList(emptyMd5) ); @@ -169,4 +186,18 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { executeTest("test validating complex events", spec); } + + @Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164") + public void testUnusedAlleleFix() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationUnusedAllelesBugFix.vcf","-ALLELES","1:1-739000",b37KGReference),0,Arrays.asList(emptyMd5)); + executeTest("test unused allele bug fix", spec); + } + + @Test(description = "Checks '''bug''' reported in story 
https://www.pivotaltracker.com/story/show/68725164") + public void testUnusedAlleleError() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationUnusedAllelesBugFix.vcf","ALLELES","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class); + executeTest("test unused allele bug fix", spec); + } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index d11cf5aee..a9a4cfd53 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -26,26 +26,26 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broad.tribble.TribbleException; -import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; 
+import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFConstants; import java.io.File; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /** @@ -53,19 +53,37 @@ import java.util.Set; * *

* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. - * In addition to standard adherence to the VCF specification, this tool performs extra checks to make ensure - * the information contained within the file is correct. Checks include the correctness of the reference base(s), - * accuracy of AC & AN values, tests against rsIDs when a dbSNP file is provided, and that all alternate alleles - * are present in at least one sample. + * In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure + * the information contained within the file is correct. These include: + *

+ *

+ *
REF
the correctness of the reference base(s).
+ *
CHR_COUNTS
accuracy of AC & AN values.
+ *
IDS
tests against rsIDs when a dbSNP file is provided. Note that for this check to work, you need + * to provide the dbSNP variant file using the --dbsnp argument, as shown in the examples below.
+ *
ALLELES
checks that all alternate alleles are present in at least one sample.
+ *
* - * If you are looking simply to test the adherence to the VCF specification, use --validationType NONE. + *

+ * + *

+ * By default it applies all the strict validations unless you indicate which ones you want to exclude + * using -Xtype|--validationTypeToExclude <code>, where code is one of the types listed above. You + * can exclude as many types as you want. + *

+ * You can exclude all strict validations with the special code ALL. In this case the tool will only + * test the adherence to the VCF specification. + *

* *

Input

*

- * A variant set to validate. + * A variant set to validate using -V or --variant as shown below. *

* *

Examples

+ * + *

To perform VCF format and all strict validations:

+ * *
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
@@ -74,6 +92,27 @@ import java.util.Set;
  *   --dbsnp dbsnp.vcf
  * 
* + *

To perform only VCF format tests:

+ * + *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidateVariants \
+ *   --validationTypeToExclude ALL \
+ *   --variant input.vcf
+ * 
+ * + *

To perform all validations except the strict ALLELES validation:

+ * + *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidateVariants \
+ *   --validationTypeToExclude ALLELES \
+ *   --variant input.vcf \
+ *   --dbsnp dbsnp.vcf
+ * 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) @@ -86,11 +125,52 @@ public class ValidateVariants extends RodWalker { protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public enum ValidationType { - ALL, REF, IDS, ALLELES, CHR_COUNTS, NONE + + /** + * Makes reference to all extra-strict tests listed below. + */ + ALL, + + /** + * Check whether the reported reference base in the VCF is the same as the corresponding base in the + * actual reference. + */ + REF, + + /** + * Checks whether the variant IDs exists, only relevant if the user indicates a DBSNP vcf file (see {@link #dbsnp}). + */ + IDS, + + /** + * Check whether all alternative alleles participate in a genotype call of at least on sample. + */ + ALLELES, + + /** + * Check that the AN and AC annotations are consistent with the number of calls, alleles and then number these + * are called across samples. + */ + CHR_COUNTS; + + /** + * Unmodifiable set of concrete validation types. + * + *

These are all types except {@link #ALL}.

+ */ + public final static Set CONCRETE_TYPES; + + static { + final Set cts = new LinkedHashSet<>(values().length - 1); + for (final ValidationType v : values()) + if (v != ALL) + cts.add(v); + CONCRETE_TYPES = Collections.unmodifiableSet(cts); + } } - @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) - protected ValidationType type = ValidationType.ALL; + @Argument(fullName = "validationTypeToExclude", shortName = "Xtype", doc = "which validation type to exclude from a full strict validation", required = false) + protected List excludeTypes = new ArrayList<>(); /** * By default, even filtered records are validated. @@ -105,8 +185,14 @@ public class ValidateVariants extends RodWalker { private File file = null; + /** + * Contains final set of validation to apply. + */ + private Collection validationTypes; + public void initialize() { file = new File(variantCollection.variants.getSource()); + validationTypes = calculateValidationTypesToApply(excludeTypes); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -156,23 +242,8 @@ public class ValidateVariants extends RodWalker { } try { - switch( type ) { - case ALL: - vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs); - break; - case REF: - vc.validateReferenceBases(reportedRefAllele, observedRefAllele); - break; - case IDS: - vc.validateRSIDs(rsIDs); - break; - case ALLELES: - vc.validateAlternateAlleles(); - break; - case CHR_COUNTS: - vc.validateChromosomeCounts(); - break; - } + for (final ValidationType t : validationTypes) + applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t); } catch (TribbleException e) { if ( WARN_ON_ERROR ) { numErrors++; @@ -182,4 +253,49 @@ public class ValidateVariants extends RodWalker { } } } + + /** + * Given the validation type and exclusion type, calculate the final set of type to validate. + * @param excludeTypes types to exclude. 
+ * + * @throws UserException.BadArgumentValue if the user combines any validation type except 'ALL' and some exclude types. + * + * @return never {@code null} but perhaps an empty set. + */ + private Collection calculateValidationTypesToApply(final List excludeTypes) { + if (excludeTypes.size() == 0) + return Collections.singleton(ValidationType.ALL); + final Set excludeTypeSet = new LinkedHashSet<>(excludeTypes); + if (excludeTypes.size() != excludeTypeSet.size()) + logger.warn("found repeat redundant validation types listed using the --validationTypeToExclude argument"); + if (excludeTypeSet.contains(ValidationType.ALL)) { + if (excludeTypeSet.size() > 1) + logger.warn("found ALL in the --validationTypeToExclude list together with other concrete type exclusions that are redundant"); + return Collections.emptyList(); + } else { + final Set result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES); + result.removeAll(excludeTypeSet); + return result; + } + } + + private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set rsIDs, ValidationType t) { + switch( t ) { + case ALL: + vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs); + break; + case REF: + vc.validateReferenceBases(reportedRefAllele, observedRefAllele); + break; + case IDS: + vc.validateRSIDs(rsIDs); + break; + case ALLELES: + vc.validateAlternateAlleles(); + break; + case CHR_COUNTS: + vc.validateChromosomeCounts(); + break; + } + } } \ No newline at end of file