Merge pull request #590 from broadinstitute/vrr_validate_variants_unused_alleles_fix
Addresses issue with strict validation on GVCF files.
This commit is contained in:
commit
ad336375dc
|
|
@ -55,9 +55,27 @@ import java.util.Arrays;
|
|||
public class ValidateVariantsIntegrationTest extends WalkerTest {
|
||||
|
||||
protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e";
|
||||
protected static final String defaultRegion = "1:10001292-10001303";
|
||||
|
||||
public static String baseTestString(String file, String type) {
|
||||
return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + privateTestDir + file + " --validationType " + type;
|
||||
|
||||
public static String baseTestString(final String file, String type) {
|
||||
return baseTestString(file,type,defaultRegion,b36KGReference);
|
||||
}
|
||||
|
||||
public static String baseTestString(String file, String type, String region, String reference) {
|
||||
final String typeArgString = type.startsWith("-") ? " --validationTypeToExclude " + type.substring(1) : excludeValidationTypesButString(type);
|
||||
return "-T ValidateVariants -R " + reference + " -L " + region + " --variant:vcf " + privateTestDir + file + typeArgString;
|
||||
}
|
||||
|
||||
private static String excludeValidationTypesButString(String type) {
|
||||
if (type == "ALL")
|
||||
return "";
|
||||
final ValidateVariants.ValidationType vtype = ValidateVariants.ValidationType.valueOf(type);
|
||||
final StringBuilder sbuilder = new StringBuilder();
|
||||
for (final ValidateVariants.ValidationType t : ValidateVariants.ValidationType.CONCRETE_TYPES)
|
||||
if (t != vtype)
|
||||
sbuilder.append(" --validationTypeToExclude " + t.toString());
|
||||
return sbuilder.toString();
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -117,12 +135,11 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testBadID() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129,
|
||||
0,
|
||||
UserException.FailsStrictValidation.class
|
||||
);
|
||||
|
||||
executeTest("test bad RS ID", spec);
|
||||
}
|
||||
|
||||
|
|
@ -151,7 +168,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testNoValidation() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("validationExampleBad.vcf", "NONE"),
|
||||
baseTestString("validationExampleBad.vcf", "-ALL"),
|
||||
0,
|
||||
Arrays.asList(emptyMd5)
|
||||
);
|
||||
|
|
@ -169,4 +186,18 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
executeTest("test validating complex events", spec);
|
||||
}
|
||||
|
||||
@Test(description = "Fixes '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
|
||||
public void testUnusedAlleleFix() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("validationUnusedAllelesBugFix.vcf","-ALLELES","1:1-739000",b37KGReference),0,Arrays.asList(emptyMd5));
|
||||
executeTest("test unused allele bug fix", spec);
|
||||
}
|
||||
|
||||
@Test(description = "Checks '''bug''' reported in story https://www.pivotaltracker.com/story/show/68725164")
|
||||
public void testUnusedAlleleError() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("validationUnusedAllelesBugFix.vcf","ALLELES","1:1-739000",b37KGReference),0, UserException.FailsStrictValidation.class);
|
||||
executeTest("test unused allele bug fix", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,26 +26,26 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -53,19 +53,37 @@ import java.util.Set;
|
|||
*
|
||||
* <p>
|
||||
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
|
||||
* In addition to standard adherence to the VCF specification, this tool performs extra checks to make ensure
|
||||
* the information contained within the file is correct. Checks include the correctness of the reference base(s),
|
||||
* accuracy of AC & AN values, tests against rsIDs when a dbSNP file is provided, and that all alternate alleles
|
||||
* are present in at least one sample.
|
||||
* In addition to standard adherence to the VCF specification, this tool performs extra strict validations to ensure
|
||||
* the information contained within the file is correct. These include:
|
||||
* </p><p>
|
||||
* <dl>
|
||||
* <dt>REF</dt><dd>the correctness of the reference base(s).</dd>
|
||||
* <dt>CHR_COUNTS</dt><dd>accuracy of AC &amp; AN values.</dd>
|
||||
* <dt>IDS</dt><dd>tests against rsIDs when a dbSNP file is provided. Notice that for this one to work, you need
|
||||
* to provide the dbSNP variant file using the <code>--dbsnp</code> argument, as shown in the examples below.</dd>
|
||||
* <dt>ALLELES</dt><dd>checks that all alternate alleles are present in at least one sample.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* If you are looking simply to test the adherence to the VCF specification, use --validationType NONE.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* By default it will apply all the strict validations unless you indicate which ones you want to exclude
|
||||
* using <code>-Xtype|--validationTypeToExclude &lt;<i>code</i>&gt;</code>, where <i>code</i> is one of the codes listed above. You
|
||||
* can exclude as many types as you want.
|
||||
* <p>
|
||||
* You can exclude all strict validations with the special code <code><b>ALL</b></code>. In this case the tool will only
|
||||
* test the adherence to the VCF specification.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* A variant set to validate.
|
||||
* A variant set to validate using <code>-V</code> or <code>--variant</code> as shown below.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
*
|
||||
* <p>To perform VCF format and all strict validations: </p>
|
||||
*
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
@ -74,6 +92,27 @@ import java.util.Set;
|
|||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <p>To perform only VCF format tests:</p>
|
||||
*
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ValidateVariants \
|
||||
* <b>--validationTypeToExclude ALL</b> \
|
||||
* --variant input.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <p>To perform all validations except the strict <i>ALLELES</i> validation:</p>
|
||||
*
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ValidateVariants \
|
||||
* <b>--validationTypeToExclude ALLELES</b> \
|
||||
* --variant input.vcf \
|
||||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=0,stop=100))
|
||||
|
|
@ -86,11 +125,52 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
|
||||
public enum ValidationType {
|
||||
ALL, REF, IDS, ALLELES, CHR_COUNTS, NONE
|
||||
|
||||
/**
|
||||
* Makes reference to all extra-strict tests listed below.
|
||||
*/
|
||||
ALL,
|
||||
|
||||
/**
|
||||
* Check whether the reported reference base in the VCF is the same as the corresponding base in the
|
||||
* actual reference.
|
||||
*/
|
||||
REF,
|
||||
|
||||
/**
|
||||
* Checks whether the variant IDs exist, only relevant if the user indicates a DBSNP vcf file (see {@link #dbsnp}).
|
||||
*/
|
||||
IDS,
|
||||
|
||||
/**
|
||||
* Check whether all alternative alleles participate in a genotype call of at least one sample.
|
||||
*/
|
||||
ALLELES,
|
||||
|
||||
/**
|
||||
* Check that the AN and AC annotations are consistent with the number of calls, alleles and the number of times these
|
||||
* are called across samples.
|
||||
*/
|
||||
CHR_COUNTS;
|
||||
|
||||
/**
|
||||
* Unmodifiable set of concrete validation types.
|
||||
*
|
||||
* <p>These are all types except {@link #ALL}.</p>
|
||||
*/
|
||||
public final static Set<ValidationType> CONCRETE_TYPES;
|
||||
|
||||
static {
|
||||
final Set<ValidationType> cts = new LinkedHashSet<>(values().length - 1);
|
||||
for (final ValidationType v : values())
|
||||
if (v != ALL)
|
||||
cts.add(v);
|
||||
CONCRETE_TYPES = Collections.unmodifiableSet(cts);
|
||||
}
|
||||
}
|
||||
|
||||
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
|
||||
protected ValidationType type = ValidationType.ALL;
|
||||
@Argument(fullName = "validationTypeToExclude", shortName = "Xtype", doc = "which validation type to exclude from a full strict validation", required = false)
|
||||
protected List<ValidationType> excludeTypes = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* By default, even filtered records are validated.
|
||||
|
|
@ -105,8 +185,14 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
private File file = null;
|
||||
|
||||
/**
|
||||
* Contains the final set of validations to apply.
|
||||
*/
|
||||
private Collection<ValidationType> validationTypes;
|
||||
|
||||
public void initialize() {
|
||||
file = new File(variantCollection.variants.getSource());
|
||||
validationTypes = calculateValidationTypesToApply(excludeTypes);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
@ -156,23 +242,8 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
try {
|
||||
switch( type ) {
|
||||
case ALL:
|
||||
vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
|
||||
break;
|
||||
case REF:
|
||||
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
|
||||
break;
|
||||
case IDS:
|
||||
vc.validateRSIDs(rsIDs);
|
||||
break;
|
||||
case ALLELES:
|
||||
vc.validateAlternateAlleles();
|
||||
break;
|
||||
case CHR_COUNTS:
|
||||
vc.validateChromosomeCounts();
|
||||
break;
|
||||
}
|
||||
for (final ValidationType t : validationTypes)
|
||||
applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
|
||||
} catch (TribbleException e) {
|
||||
if ( WARN_ON_ERROR ) {
|
||||
numErrors++;
|
||||
|
|
@ -182,4 +253,49 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the validation type and exclusion type, calculate the final set of type to validate.
|
||||
* @param excludeTypes types to exclude.
|
||||
*
|
||||
* @throws UserException.BadArgumentValue if the user combines any validation type except 'ALL' and some exclude types.
|
||||
*
|
||||
* @return never {@code null} but perhaps an empty set.
|
||||
*/
|
||||
private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) {
|
||||
if (excludeTypes.size() == 0)
|
||||
return Collections.singleton(ValidationType.ALL);
|
||||
final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypes);
|
||||
if (excludeTypes.size() != excludeTypeSet.size())
|
||||
logger.warn("found repeat redundant validation types listed using the --validationTypeToExclude argument");
|
||||
if (excludeTypeSet.contains(ValidationType.ALL)) {
|
||||
if (excludeTypeSet.size() > 1)
|
||||
logger.warn("found ALL in the --validationTypeToExclude list together with other concrete type exclusions that are redundant");
|
||||
return Collections.emptyList();
|
||||
} else {
|
||||
final Set<ValidationType> result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES);
|
||||
result.removeAll(excludeTypeSet);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) {
|
||||
switch( t ) {
|
||||
case ALL:
|
||||
vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
|
||||
break;
|
||||
case REF:
|
||||
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
|
||||
break;
|
||||
case IDS:
|
||||
vc.validateRSIDs(rsIDs);
|
||||
break;
|
||||
case ALLELES:
|
||||
vc.validateAlternateAlleles();
|
||||
break;
|
||||
case CHR_COUNTS:
|
||||
vc.validateChromosomeCounts();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue