Added option to validate gvcf (for ValidateVariants) (#1379)
* with option --gvcf CLP will now put extra checks that a gvcf must adhere to (existance of <NON_REF> allele at every variant, and that the variants in total cover the entire requested intervals, or the whole genome if no intervals have been specified) * works on gvcf produced by HC when using either GVCF or BP_RESOLUTION mode * added positive and negative tests
This commit is contained in:
parent
e1fadae139
commit
25fa25b618
|
|
@ -51,16 +51,19 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools.walkers.variantutils;
|
||||
|
||||
import htsjdk.samtools.util.TestUtil;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.log4j.Level;
|
||||
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.broadinstitute.gatk.utils.io.IOUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
||||
public class ValidateVariantsIntegrationTest extends WalkerTest {
|
||||
|
||||
|
|
@ -279,7 +282,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ",
|
||||
0, Arrays.asList(EMPTY_MD5));
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec);
|
||||
}
|
||||
|
||||
|
|
@ -288,7 +291,78 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ",
|
||||
0, Arrays.asList(EMPTY_MD5));
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("test to allow wrong header contig length, no compatibility checks", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGoodGvcf() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12891.AS.chr20snippet.g.vcf", "ALL", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e"));
|
||||
executeTest("tests correct gvcf", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGoodGvcfExcludingAlleles() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12891.AS.chr20snippet.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e"));
|
||||
executeTest("tests correct gvcf", spec);
|
||||
}
|
||||
|
||||
|
||||
@Test(expectedExceptions = RuntimeException.class )
|
||||
public void testBadGvcfMissingNON_REF() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12891.AS.chr20snippet.BAD_MISSING_NON_REF.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("tests capture of missing NON_REF allele", spec);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = RuntimeException.class )
|
||||
public void testBadGvcfRegions() {
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("diploid-gvcf.bad-IncompleteRegion.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("tests capture of non-complete region", spec);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = RuntimeException.class )
|
||||
public void testNonOverlappingRegions() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12891.AS.chr20snippet_BAD_INCOMPLETE_REGION.g.vcf", "-ALLELES", "Y:4966254-4967190", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("tests capture of non-complete region", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNonOverlappingRegionsBP_RESOLUTION() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("gvcf.basepairResolution.vcf", "-ALLELES", "20:10000000-10010000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("tests capture of non-complete region, on BP_RESOLUTION gvcf", spec);
|
||||
}
|
||||
@Test
|
||||
public void testCorrectCreationOfBlocks() throws IOException {
|
||||
final File tempDir = IOUtils.tempDir("RefBlocks", "test", new File(privateTestDir));
|
||||
tempDir.mkdir();
|
||||
tempDir.deleteOnExit();
|
||||
final File output = File.createTempFile("RefBlocks", ".g.vcf", tempDir);
|
||||
String baseIntervals = " 1:1-100 -L 5:1-200 ";
|
||||
String intervalString = " -L " + baseIntervals;
|
||||
final WalkerTestSpec hc = new WalkerTestSpec("-T HaplotypeCaller " + intervalString + " -I " + privateTestDir + "NA12878.4.snippet.bam " +
|
||||
" -R /humgen/1kg/reference/human_g1k_v37_decoy.fasta -ERC GVCF -o " + output, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("running hc", hc);
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(tempDir.getName() + "/" + output.getName(), "-ALLELES", baseIntervals, b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
|
||||
0, Collections.singletonList(EMPTY_MD5));
|
||||
executeTest("testing the correct creation of reference blocks", spec);
|
||||
|
||||
TestUtil.recursiveDelete(tempDir);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,23 +26,26 @@
|
|||
package org.broadinstitute.gatk.tools.walkers.variantutils;
|
||||
|
||||
import htsjdk.tribble.TribbleException;
|
||||
import org.broadinstitute.gatk.utils.commandline.Argument;
|
||||
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.gatk.engine.CommandLineGATK;
|
||||
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
||||
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.gatk.engine.walkers.Reference;
|
||||
import org.broadinstitute.gatk.engine.walkers.RodWalker;
|
||||
import org.broadinstitute.gatk.engine.walkers.Window;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.gatk.utils.help.HelpConstants;
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import htsjdk.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.gatk.engine.CommandLineGATK;
|
||||
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.gatk.engine.walkers.Reference;
|
||||
import org.broadinstitute.gatk.engine.walkers.RodWalker;
|
||||
import org.broadinstitute.gatk.engine.walkers.Window;
|
||||
import org.broadinstitute.gatk.utils.GenomeLoc;
|
||||
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.gatk.utils.commandline.Argument;
|
||||
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.gatk.utils.help.HelpConstants;
|
||||
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -122,7 +125,7 @@ import java.util.*;
|
|||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=0,stop=100))
|
||||
public class ValidateVariants extends RodWalker<Integer, Integer> {
|
||||
public class ValidateVariants extends RodWalker<GenomeLoc, GenomeLocSortedSet> {
|
||||
|
||||
// Log message for a reference allele that is too long
|
||||
protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long";
|
||||
|
|
@ -184,12 +187,21 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
/**
|
||||
* By default, even filtered records are validated.
|
||||
*/
|
||||
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false)
|
||||
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false, exclusiveOf = "VALIDATE_GVCF")
|
||||
protected Boolean DO_NOT_VALIDATE_FILTERED = false;
|
||||
|
||||
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false)
|
||||
protected Boolean WARN_ON_ERROR = false;
|
||||
|
||||
/**
|
||||
* Validate this file as a gvcf. In particular, every variant must include a <NON_REF> allele, and that
|
||||
* every base in the territory under consideration is covered by a variant (or a reference block).
|
||||
* If you specifed intervals (using -L or -XL) to restrict analysis to a subset of genomic regions,
|
||||
* those intervals will need to be covered in a valid gvcf.
|
||||
*/
|
||||
@Argument(fullName = "validateGVCF", shortName = "gvcf", doc = "Validate this file as a GVCF", required = false, exclusiveOf = "DO_NOT_VALIDATE_FILTERED")
|
||||
protected Boolean VALIDATE_GVCF = false;
|
||||
|
||||
private long numErrors = 0;
|
||||
|
||||
private File file = null;
|
||||
|
|
@ -208,29 +220,60 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
referenceWindowStop = getToolkit().getArguments().reference_window_stop;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
return null;
|
||||
|
||||
int lastVcEnd = -1;
|
||||
|
||||
Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
|
||||
for ( VariantContext vc : VCs )
|
||||
validate(vc, tracker, ref);
|
||||
if (VALIDATE_GVCF && VCs.size() > 1) {
|
||||
logger.error("GVCF validation can only be performed one file at a time. Validation results are invalid.");
|
||||
}
|
||||
for ( VariantContext vc : VCs ) {
|
||||
validate(vc, tracker, ref, VALIDATE_GVCF);
|
||||
lastVcEnd = vc.getEnd();
|
||||
}
|
||||
|
||||
return VCs.size();
|
||||
return GenomeLoc.setStop(ref.getLocus(), lastVcEnd);
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
@Override
|
||||
public GenomeLocSortedSet reduce(GenomeLoc value, GenomeLocSortedSet sum) {
|
||||
sum.add(value, true);
|
||||
return sum;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) { return sum+value; }
|
||||
@Override
|
||||
public GenomeLocSortedSet reduceInit() {
|
||||
return new GenomeLocSortedSet(getToolkit().getGenomeLocParser());
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
if ( numErrors == 0 )
|
||||
System.out.println("Successfully validated the input file. Checked " + result + " records with no failures.");
|
||||
@Override
|
||||
public void onTraversalDone(GenomeLocSortedSet result) {
|
||||
if (VALIDATE_GVCF) {
|
||||
final GenomeLocSortedSet uncoveredIntervals = getToolkit().getIntervals().subtractRegions(result);
|
||||
if (uncoveredIntervals.coveredSize() > 0) {
|
||||
final UserException e = new UserException.FailsStrictValidation(file, "A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() +
|
||||
" loci with no VariantContext covering it. The first uncovered segment is:" +
|
||||
uncoveredIntervals.iterator().next());
|
||||
|
||||
if (WARN_ON_ERROR) {
|
||||
numErrors++;
|
||||
logger.warn("***** " + e.getMessage() + " *****");
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numErrors == 0)
|
||||
System.out.println("Successfully validated the input file. Checked " + result.size() + " records with no failures.");
|
||||
else
|
||||
System.out.println("Found " + numErrors + " records with failures.");
|
||||
}
|
||||
|
||||
private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) {
|
||||
private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, boolean gvcf) {
|
||||
if ( DO_NOT_VALIDATE_FILTERED && vc.isFiltered() )
|
||||
return;
|
||||
|
||||
|
|
@ -241,7 +284,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
// reference length is greater than the reference window stop before and after expansion
|
||||
if ( refLength > 100 && refLength > referenceWindowStop ) {
|
||||
logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d",
|
||||
REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength));
|
||||
REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getContig(), vc.getStart(), refLength));
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -252,7 +295,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
// get the RS IDs
|
||||
Set<String> rsIDs = null;
|
||||
if ( tracker.hasValues(dbsnp.dbsnp) ) {
|
||||
rsIDs = new HashSet<String>();
|
||||
rsIDs = new HashSet<>();
|
||||
for ( VariantContext rsID : tracker.getValues(dbsnp.dbsnp, ref.getLocus()) )
|
||||
rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR)));
|
||||
}
|
||||
|
|
@ -260,6 +303,10 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
try {
|
||||
for (final ValidationType t : validationTypes)
|
||||
applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
|
||||
|
||||
if (gvcf) {
|
||||
ValidateGVCFVariant(vc);
|
||||
}
|
||||
} catch (TribbleException e) {
|
||||
if ( WARN_ON_ERROR ) {
|
||||
numErrors++;
|
||||
|
|
@ -279,6 +326,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
* @return never {@code null} but perhaps an empty set.
|
||||
*/
|
||||
private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) {
|
||||
|
||||
if (VALIDATE_GVCF && !excludeTypes.contains(ValidationType.ALLELES)) {
|
||||
// Note: in a future version allele validation might be OK for GVCFs, if that happens
|
||||
// this will be more complicated.
|
||||
logger.warn("GVCF format is currently incompatible with allele validation. Not validating Alleles.");
|
||||
excludeTypes.add(ValidationType.ALLELES);
|
||||
}
|
||||
if (excludeTypes.isEmpty())
|
||||
return Collections.singleton(ValidationType.ALL);
|
||||
final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypes);
|
||||
|
|
@ -295,6 +349,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private void ValidateGVCFVariant(final VariantContext vc) {
|
||||
if (!vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) {
|
||||
throw new TribbleException.InternalCodecException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s",
|
||||
GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, vc.toStringWithoutGenotypes()));
|
||||
}
|
||||
}
|
||||
|
||||
private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) {
|
||||
switch( t ) {
|
||||
case ALL:
|
||||
|
|
|
|||
|
|
@ -540,7 +540,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
*
|
||||
* @return a newly allocated GenomeLoc as loc but with start == start
|
||||
*/
|
||||
public GenomeLoc setStart(GenomeLoc loc, int start) {
|
||||
public static GenomeLoc setStart(GenomeLoc loc, int start) {
|
||||
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop());
|
||||
}
|
||||
|
||||
|
|
@ -554,7 +554,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
*
|
||||
* @return a newly allocated GenomeLoc as loc but with stop == stop
|
||||
*/
|
||||
public GenomeLoc setStop(GenomeLoc loc, int stop) {
|
||||
public static GenomeLoc setStop(GenomeLoc loc, int stop) {
|
||||
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue