Merge pull request #590 from broadinstitute/vrr_validate_variants_unused_alleles_fix

Addresses issue with strict validation on GVCF files.
This commit is contained in:
Eric Banks 2014-04-07 22:10:49 -04:00
commit ad336375dc
2 changed files with 186 additions and 39 deletions

View File

@ -55,9 +55,27 @@ import java.util.Arrays;
public class ValidateVariantsIntegrationTest extends WalkerTest { public class ValidateVariantsIntegrationTest extends WalkerTest {
protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e"; protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e";
protected static final String defaultRegion = "1:10001292-10001303";
public static String baseTestString(String file, String type) {
return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + privateTestDir + file + " --validationType " + type; public static String baseTestString(final String file, String type) {
return baseTestString(file,type,defaultRegion,b36KGReference);
}
public static String baseTestString(String file, String type, String region, String reference) {
final String typeArgString = type.startsWith("-") ? " --validationTypeToExclude " + type.substring(1) : excludeValidationTypesButString(type);
return "-T ValidateVariants -R " + reference + " -L " + region + " --variant:vcf " + privateTestDir + file + typeArgString;
}
private static String excludeValidationTypesButString(String type) {
if (type == "ALL")
return "";
final ValidateVariants.ValidationType vtype = ValidateVariants.ValidationType.valueOf(type);
final StringBuilder sbuilder = new StringBuilder();
for (final ValidateVariants.ValidationType t : ValidateVariants.ValidationType.CONCRETE_TYPES)
if (t != vtype)
sbuilder.append(" --validationTypeToExclude " + t.toString());
return sbuilder.toString();
} }
@Test @Test
@ -117,12 +135,11 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
@Test @Test
public void testBadID() { public void testBadID() {
WalkerTestSpec spec = new WalkerTestSpec( final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129, baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129,
0, 0,
UserException.FailsStrictValidation.class UserException.FailsStrictValidation.class
); );
executeTest("test bad RS ID", spec); executeTest("test bad RS ID", spec);
} }
@ -151,7 +168,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
@Test @Test
public void testNoValidation() { public void testNoValidation() {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "NONE"), baseTestString("validationExampleBad.vcf", "-ALL"),
0, 0,
Arrays.asList(emptyMd5) Arrays.asList(emptyMd5)
); );
@ -169,4 +186,18 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
executeTest("test validating complex events", spec); executeTest("test validating complex events", spec);
} }
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
public void testUnusedAlleleFix() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationUnusedAllelesBugFix.vcf","-ALLELES","1:1-739000",b37KGReference),0,Arrays.asList(emptyMd5));
executeTest("test unused allele bug fix", spec);
}
@Test(description = "Checks '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
public void testUnusedAlleleError() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationUnusedAllelesBugFix.vcf","ALLELES","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class);
executeTest("test unused allele bug fix", spec);
}
} }

View File

@ -26,26 +26,26 @@
package org.broadinstitute.sting.gatk.walkers.variantutils; package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.TribbleException; import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFConstants;
import java.io.File; import java.io.File;
import java.util.Arrays; import java.util.*;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
/** /**
@ -53,19 +53,37 @@ import java.util.Set;
* *
* <p> * <p>
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
* In addition to standard adherence to the VCF specification, this tool performs extra checks to make ensure * In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure
* the information contained within the file is correct. Checks include the correctness of the reference base(s), * the information contained within the file is correct. These include:
* accuracy of AC & AN values, tests against rsIDs when a dbSNP file is provided, and that all alternate alleles * </p><p>
* are present in at least one sample. * <dl>
* <dt>REF</dt><dd>the correctness of the reference base(s).</dd>
* <dt>CHR_COUNTS</dt><dd>accuracy of AC & AN values.</dd>
* <dt>IDS</dt><dd>tests against rsIDs when a dbSNP file is provided. Notice that for this one to work, you need
* to provide a reference to the dbsnp variant containing file using the <code>--dbsnp</code> as show in examples below.</dd>
* <dt>ALLELES</dt><dd>and that all alternate alleles are present in at least one sample.</dd>
* </dl>
* *
* If you are looking simply to test the adherence to the VCF specification, use --validationType NONE. * </p>
*
* <p>
* By default it will apply all the strict validations unless you indicate which one you want you want to exclude
* using <code>-Xtype|--validationTypeToExclude &lt;<i>code</i>&gt;</code>, where <i>code</i> is one of the listed above. You
* can exclude as many types as you want
* <p>
* Yo can exclude all strict validations with the special code <code><b>ALL</b></code>. In this case the tool will only
* test the adherence to the VCF specification.
* </p>
* *
* <h3>Input</h3> * <h3>Input</h3>
* <p> * <p>
* A variant set to validate. * A variant set to validate using <code>-V</code> or <code>--variant</code> as shown below.
* </p> * </p>
* *
* <h3>Examples</h3> * <h3>Examples</h3>
*
* <p>To perform VCF format and all strict validations: </p>
*
* <pre> * <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \ * java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \ * -R ref.fasta \
@ -74,6 +92,27 @@ import java.util.Set;
* --dbsnp dbsnp.vcf * --dbsnp dbsnp.vcf
* </pre> * </pre>
* *
* <p>To perform only VCF format tests:</p>
*
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidateVariants \
* <b>--validationTypeToExclude ALL</b> \
* --variant input.vcf
* </pre>
*
* <p>To perform all validations except the strict <i>ALLELE</i> validation:</p>
*
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidateVariants \
* <b>--validationTypeToExclude ALLELES</b>
* --variant input.vcf \
* --dbsnp dbsnp.vcf
* </pre>
*
*/ */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100)) @Reference(window=@Window(start=0,stop=100))
@ -86,11 +125,52 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public enum ValidationType { public enum ValidationType {
ALL, REF, IDS, ALLELES, CHR_COUNTS, NONE
/**
* Makes reference to all extra-strict tests listed below.
*/
ALL,
/**
* Check whether the reported reference base in the VCF is the same as the corresponding base in the
* actual reference.
*/
REF,
/**
* Checks whether the variant IDs exists, only relevant if the user indicates a DBSNP vcf file (see {@link #dbsnp}).
*/
IDS,
/**
* Check whether all alternative alleles participate in a genotype call of at least on sample.
*/
ALLELES,
/**
* Check that the AN and AC annotations are consistent with the number of calls, alleles and then number these
* are called across samples.
*/
CHR_COUNTS;
/**
* Unmodifiable set of concrete validation types.
*
* <p>These are all types except {@link #ALL}.</p>
*/
public final static Set<ValidationType> CONCRETE_TYPES;
static {
final Set<ValidationType> cts = new LinkedHashSet<>(values().length - 1);
for (final ValidationType v : values())
if (v != ALL)
cts.add(v);
CONCRETE_TYPES = Collections.unmodifiableSet(cts);
}
} }
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) @Argument(fullName = "validationTypeToExclude", shortName = "Xtype", doc = "which validation type to exclude from a full strict validation", required = false)
protected ValidationType type = ValidationType.ALL; protected List<ValidationType> excludeTypes = new ArrayList<>();
/** /**
* By default, even filtered records are validated. * By default, even filtered records are validated.
@ -105,8 +185,14 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
private File file = null; private File file = null;
/**
* Contains final set of validation to apply.
*/
private Collection<ValidationType> validationTypes;
public void initialize() { public void initialize() {
file = new File(variantCollection.variants.getSource()); file = new File(variantCollection.variants.getSource());
validationTypes = calculateValidationTypesToApply(excludeTypes);
} }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -156,7 +242,45 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
} }
try { try {
switch( type ) { for (final ValidationType t : validationTypes)
applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
} catch (TribbleException e) {
if ( WARN_ON_ERROR ) {
numErrors++;
logger.warn("***** " + e.getMessage() + " *****");
} else {
throw new UserException.FailsStrictValidation(file, e.getMessage());
}
}
}
/**
* Given the validation type and exclusion type, calculate the final set of type to validate.
* @param excludeTypes types to exclude.
*
* @throws UserException.BadArgumentValue if the user combines any validation type except 'ALL' and some exclude types.
*
* @return never {@code null} but perhaps an empty set.
*/
private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) {
if (excludeTypes.size() == 0)
return Collections.singleton(ValidationType.ALL);
final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypes);
if (excludeTypes.size() != excludeTypeSet.size())
logger.warn("found repeat redundant validation types listed using the --validationTypeToExclude argument");
if (excludeTypeSet.contains(ValidationType.ALL)) {
if (excludeTypeSet.size() > 1)
logger.warn("found ALL in the --validationTypeToExclude list together with other concrete type exclusions that are redundant");
return Collections.emptyList();
} else {
final Set<ValidationType> result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES);
result.removeAll(excludeTypeSet);
return result;
}
}
private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) {
switch( t ) {
case ALL: case ALL:
vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs); vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
break; break;
@ -173,13 +297,5 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
vc.validateChromosomeCounts(); vc.validateChromosomeCounts();
break; break;
} }
} catch (TribbleException e) {
if ( WARN_ON_ERROR ) {
numErrors++;
logger.warn("***** " + e.getMessage() + " *****");
} else {
throw new UserException.FailsStrictValidation(file, e.getMessage());
}
}
} }
} }