Merge pull request #590 from broadinstitute/vrr_validate_variants_unused_alleles_fix

Addresses issue with strict validation on GVCF files.
This commit is contained in:
Eric Banks 2014-04-07 22:10:49 -04:00
commit ad336375dc
2 changed files with 186 additions and 39 deletions

View File

@ -55,9 +55,27 @@ import java.util.Arrays;
public class ValidateVariantsIntegrationTest extends WalkerTest {
protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e";
protected static final String defaultRegion = "1:10001292-10001303";
public static String baseTestString(String file, String type) {
return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + privateTestDir + file + " --validationType " + type;
public static String baseTestString(final String file, String type) {
return baseTestString(file,type,defaultRegion,b36KGReference);
}
public static String baseTestString(String file, String type, String region, String reference) {
final String typeArgString = type.startsWith("-") ? " --validationTypeToExclude " + type.substring(1) : excludeValidationTypesButString(type);
return "-T ValidateVariants -R " + reference + " -L " + region + " --variant:vcf " + privateTestDir + file + typeArgString;
}
private static String excludeValidationTypesButString(String type) {
if (type == "ALL")
return "";
final ValidateVariants.ValidationType vtype = ValidateVariants.ValidationType.valueOf(type);
final StringBuilder sbuilder = new StringBuilder();
for (final ValidateVariants.ValidationType t : ValidateVariants.ValidationType.CONCRETE_TYPES)
if (t != vtype)
sbuilder.append(" --validationTypeToExclude " + t.toString());
return sbuilder.toString();
}
@Test
@ -117,12 +135,11 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
@Test
public void testBadID() {
WalkerTestSpec spec = new WalkerTestSpec(
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129,
0,
UserException.FailsStrictValidation.class
);
executeTest("test bad RS ID", spec);
}
@ -151,7 +168,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
@Test
public void testNoValidation() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "NONE"),
baseTestString("validationExampleBad.vcf", "-ALL"),
0,
Arrays.asList(emptyMd5)
);
@ -169,4 +186,18 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
executeTest("test validating complex events", spec);
}
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
public void testUnusedAlleleFix() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationUnusedAllelesBugFix.vcf","-ALLELES","1:1-739000",b37KGReference),0,Arrays.asList(emptyMd5));
executeTest("test unused allele bug fix", spec);
}
@Test(description = "Checks '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
public void testUnusedAlleleError() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationUnusedAllelesBugFix.vcf","ALLELES","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class);
executeTest("test unused allele bug fix", spec);
}
}

View File

@ -26,26 +26,26 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFConstants;
import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.*;
/**
@ -53,19 +53,37 @@ import java.util.Set;
*
* <p>
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
* In addition to standard adherence to the VCF specification, this tool performs extra checks to make ensure
* the information contained within the file is correct. Checks include the correctness of the reference base(s),
* accuracy of AC & AN values, tests against rsIDs when a dbSNP file is provided, and that all alternate alleles
* are present in at least one sample.
* In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure
* the information contained within the file is correct. These include:
* </p><p>
* <dl>
* <dt>REF</dt><dd>the correctness of the reference base(s).</dd>
* <dt>CHR_COUNTS</dt><dd>accuracy of AC & AN values.</dd>
* <dt>IDS</dt><dd>tests against rsIDs when a dbSNP file is provided. Notice that for this one to work, you need
* to provide a reference to the dbsnp variant containing file using the <code>--dbsnp</code> as show in examples below.</dd>
* <dt>ALLELES</dt><dd>and that all alternate alleles are present in at least one sample.</dd>
* </dl>
*
* If you are looking simply to test the adherence to the VCF specification, use --validationType NONE.
* </p>
*
* <p>
* By default it will apply all the strict validations unless you indicate which one you want you want to exclude
* using <code>-Xtype|--validationTypeToExclude &lt;<i>code</i>&gt;</code>, where <i>code</i> is one of the listed above. You
* can exclude as many types as you want
* <p>
* Yo can exclude all strict validations with the special code <code><b>ALL</b></code>. In this case the tool will only
* test the adherence to the VCF specification.
* </p>
*
* <h3>Input</h3>
* <p>
* A variant set to validate.
* A variant set to validate using <code>-V</code> or <code>--variant</code> as shown below.
* </p>
*
* <h3>Examples</h3>
*
* <p>To perform VCF format and all strict validations: </p>
*
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
@ -74,6 +92,27 @@ import java.util.Set;
* --dbsnp dbsnp.vcf
* </pre>
*
* <p>To perform only VCF format tests:</p>
*
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidateVariants \
* <b>--validationTypeToExclude ALL</b> \
* --variant input.vcf
* </pre>
*
* <p>To perform all validations except the strict <i>ALLELE</i> validation:</p>
*
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidateVariants \
* <b>--validationTypeToExclude ALLELES</b>
* --variant input.vcf \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100))
@ -86,11 +125,52 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public enum ValidationType {
ALL, REF, IDS, ALLELES, CHR_COUNTS, NONE
/**
* Makes reference to all extra-strict tests listed below.
*/
ALL,
/**
* Check whether the reported reference base in the VCF is the same as the corresponding base in the
* actual reference.
*/
REF,
/**
* Checks whether the variant IDs exists, only relevant if the user indicates a DBSNP vcf file (see {@link #dbsnp}).
*/
IDS,
/**
* Check whether all alternative alleles participate in a genotype call of at least on sample.
*/
ALLELES,
/**
* Check that the AN and AC annotations are consistent with the number of calls, alleles and then number these
* are called across samples.
*/
CHR_COUNTS;
/**
* Unmodifiable set of concrete validation types.
*
* <p>These are all types except {@link #ALL}.</p>
*/
public final static Set<ValidationType> CONCRETE_TYPES;
static {
final Set<ValidationType> cts = new LinkedHashSet<>(values().length - 1);
for (final ValidationType v : values())
if (v != ALL)
cts.add(v);
CONCRETE_TYPES = Collections.unmodifiableSet(cts);
}
}
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
protected ValidationType type = ValidationType.ALL;
@Argument(fullName = "validationTypeToExclude", shortName = "Xtype", doc = "which validation type to exclude from a full strict validation", required = false)
protected List<ValidationType> excludeTypes = new ArrayList<>();
/**
* By default, even filtered records are validated.
@ -105,8 +185,14 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
private File file = null;
/**
* Contains final set of validation to apply.
*/
private Collection<ValidationType> validationTypes;
public void initialize() {
file = new File(variantCollection.variants.getSource());
validationTypes = calculateValidationTypesToApply(excludeTypes);
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -156,23 +242,8 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
}
try {
switch( type ) {
case ALL:
vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
break;
case REF:
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
break;
case IDS:
vc.validateRSIDs(rsIDs);
break;
case ALLELES:
vc.validateAlternateAlleles();
break;
case CHR_COUNTS:
vc.validateChromosomeCounts();
break;
}
for (final ValidationType t : validationTypes)
applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
} catch (TribbleException e) {
if ( WARN_ON_ERROR ) {
numErrors++;
@ -182,4 +253,49 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
}
}
}
/**
* Given the validation type and exclusion type, calculate the final set of type to validate.
* @param excludeTypes types to exclude.
*
* @throws UserException.BadArgumentValue if the user combines any validation type except 'ALL' and some exclude types.
*
* @return never {@code null} but perhaps an empty set.
*/
private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) {
if (excludeTypes.size() == 0)
return Collections.singleton(ValidationType.ALL);
final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypes);
if (excludeTypes.size() != excludeTypeSet.size())
logger.warn("found repeat redundant validation types listed using the --validationTypeToExclude argument");
if (excludeTypeSet.contains(ValidationType.ALL)) {
if (excludeTypeSet.size() > 1)
logger.warn("found ALL in the --validationTypeToExclude list together with other concrete type exclusions that are redundant");
return Collections.emptyList();
} else {
final Set<ValidationType> result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES);
result.removeAll(excludeTypeSet);
return result;
}
}
private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) {
switch( t ) {
case ALL:
vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
break;
case REF:
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
break;
case IDS:
vc.validateRSIDs(rsIDs);
break;
case ALLELES:
vc.validateAlternateAlleles();
break;
case CHR_COUNTS:
vc.validateChromosomeCounts();
break;
}
}
}