From 25fa25b61864ff6202c5dd7cc30caff54d880b35 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 26 May 2016 06:42:45 -0400 Subject: [PATCH] Added option to validate gvcf (for ValidateVariants) (#1379) * with option --gvcf CLP will now put extra checks that a gvcf must adhere to (existance of allele at every variant, and that the variants in total cover the entire requested intervals, or the whole genome if no intervals have been specified) * works on gvcf produced by HC when using either GVCF or BP_RESOLUTION mode * added positive and negative tests --- .../ValidateVariantsIntegrationTest.java | 78 +++++++++++- .../variantutils/ValidateVariants.java | 119 +++++++++++++----- .../broadinstitute/gatk/utils/GenomeLoc.java | 4 +- 3 files changed, 168 insertions(+), 33 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java index d9da8b821..8d4ccac28 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -51,16 +51,19 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import htsjdk.samtools.util.TestUtil; import org.apache.commons.io.FileUtils; import org.apache.log4j.Level; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; import java.io.IOException; import java.util.Arrays; +import java.util.Collections; public class ValidateVariantsIntegrationTest extends WalkerTest { @@ -279,7 +282,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ", - 0, Arrays.asList(EMPTY_MD5)); + 0, Collections.singletonList(EMPTY_MD5)); executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec); } @@ -288,7 +291,78 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ", - 0, Arrays.asList(EMPTY_MD5)); + 0, Collections.singletonList(EMPTY_MD5)); executeTest("test to allow wrong header contig length, no compatibility checks", spec); } + + @Test + public void testGoodGvcf() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet.g.vcf", "ALL", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e")); + executeTest("tests correct gvcf", spec); + } + + @Test + public void testGoodGvcfExcludingAlleles() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e")); + executeTest("tests correct gvcf", spec); + } + + + @Test(expectedExceptions = RuntimeException.class ) + public void testBadGvcfMissingNON_REF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet.BAD_MISSING_NON_REF.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of missing NON_REF allele", spec); + } + + @Test(expectedExceptions = RuntimeException.class ) + public void testBadGvcfRegions() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("diploid-gvcf.bad-IncompleteRegion.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of non-complete region", spec); + } + + @Test(expectedExceptions = RuntimeException.class ) + public void testNonOverlappingRegions() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet_BAD_INCOMPLETE_REGION.g.vcf", "-ALLELES", "Y:4966254-4967190", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of non-complete region", spec); + } + + @Test + public void testNonOverlappingRegionsBP_RESOLUTION() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("gvcf.basepairResolution.vcf", "-ALLELES", "20:10000000-10010000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of non-complete region, on BP_RESOLUTION gvcf", spec); + } + @Test + public void testCorrectCreationOfBlocks() throws IOException { + final File tempDir = IOUtils.tempDir("RefBlocks", "test", new File(privateTestDir)); + tempDir.mkdir(); + tempDir.deleteOnExit(); + final File output = File.createTempFile("RefBlocks", ".g.vcf", tempDir); + String baseIntervals = " 1:1-100 -L 5:1-200 "; + String intervalString = " -L " + baseIntervals; + final WalkerTestSpec hc = new WalkerTestSpec("-T HaplotypeCaller " + intervalString + " -I " + privateTestDir + "NA12878.4.snippet.bam " + + " -R /humgen/1kg/reference/human_g1k_v37_decoy.fasta -ERC GVCF -o " + output, Collections.singletonList(EMPTY_MD5)); + executeTest("running hc", hc); + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(tempDir.getName() + "/" + output.getName(), "-ALLELES", baseIntervals, b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("testing the correct creation of reference blocks", spec); + + TestUtil.recursiveDelete(tempDir); + } + } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index e34c78111..653bdaf50 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -26,23 +26,26 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import htsjdk.tribble.TribbleException; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.Reference; -import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.engine.walkers.Window; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; -import org.broadinstitute.gatk.utils.help.HelpConstants; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; +import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.gatk.engine.walkers.Reference; +import org.broadinstitute.gatk.engine.walkers.RodWalker; +import org.broadinstitute.gatk.engine.walkers.Window; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.io.File; import java.util.*; @@ -122,7 +125,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) -public class ValidateVariants extends RodWalker { +public class ValidateVariants extends RodWalker { // Log message for a reference allele that is too long protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long"; @@ -184,12 +187,21 @@ public class ValidateVariants extends RodWalker { /** * By default, even filtered records are validated. */ - @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false) + @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false, exclusiveOf = "VALIDATE_GVCF") protected Boolean DO_NOT_VALIDATE_FILTERED = false; @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false) protected Boolean WARN_ON_ERROR = false; + /** + * Validate this file as a gvcf. In particular, every variant must include a allele, and that + * every base in the territory under consideration is covered by a variant (or a reference block). + * If you specifed intervals (using -L or -XL) to restrict analysis to a subset of genomic regions, + * those intervals will need to be covered in a valid gvcf. + */ + @Argument(fullName = "validateGVCF", shortName = "gvcf", doc = "Validate this file as a GVCF", required = false, exclusiveOf = "DO_NOT_VALIDATE_FILTERED") + protected Boolean VALIDATE_GVCF = false; + private long numErrors = 0; private File file = null; @@ -208,29 +220,60 @@ public class ValidateVariants extends RodWalker { referenceWindowStop = getToolkit().getArguments().reference_window_stop; } - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) - return 0; + return null; + + int lastVcEnd = -1; Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - for ( VariantContext vc : VCs ) - validate(vc, tracker, ref); + if (VALIDATE_GVCF && VCs.size() > 1) { + logger.error("GVCF validation can only be performed one file at a time. Validation results are invalid."); + } + for ( VariantContext vc : VCs ) { + validate(vc, tracker, ref, VALIDATE_GVCF); + lastVcEnd = vc.getEnd(); + } - return VCs.size(); + return GenomeLoc.setStop(ref.getLocus(), lastVcEnd); } - public Integer reduceInit() { return 0; } + @Override + public GenomeLocSortedSet reduce(GenomeLoc value, GenomeLocSortedSet sum) { + sum.add(value, true); + return sum; + } - public Integer reduce(Integer value, Integer sum) { return sum+value; } + @Override + public GenomeLocSortedSet reduceInit() { + return new GenomeLocSortedSet(getToolkit().getGenomeLocParser()); + } - public void onTraversalDone(Integer result) { - if ( numErrors == 0 ) - System.out.println("Successfully validated the input file. Checked " + result + " records with no failures."); + @Override + public void onTraversalDone(GenomeLocSortedSet result) { + if (VALIDATE_GVCF) { + final GenomeLocSortedSet uncoveredIntervals = getToolkit().getIntervals().subtractRegions(result); + if (uncoveredIntervals.coveredSize() > 0) { + final UserException e = new UserException.FailsStrictValidation(file, "A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() + + " loci with no VariantContext covering it. The first uncovered segment is:" + + uncoveredIntervals.iterator().next()); + + if (WARN_ON_ERROR) { + numErrors++; + logger.warn("***** " + e.getMessage() + " *****"); + } else { + throw e; + } + } + } + + if (numErrors == 0) + System.out.println("Successfully validated the input file. Checked " + result.size() + " records with no failures."); else System.out.println("Found " + numErrors + " records with failures."); } - private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) { + private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, boolean gvcf) { if ( DO_NOT_VALIDATE_FILTERED && vc.isFiltered() ) return; @@ -241,7 +284,7 @@ public class ValidateVariants extends RodWalker { // reference length is greater than the reference window stop before and after expansion if ( refLength > 100 && refLength > referenceWindowStop ) { logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d", - REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength)); + REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getContig(), vc.getStart(), refLength)); return; } @@ -252,7 +295,7 @@ public class ValidateVariants extends RodWalker { // get the RS IDs Set rsIDs = null; if ( tracker.hasValues(dbsnp.dbsnp) ) { - rsIDs = new HashSet(); + rsIDs = new HashSet<>(); for ( VariantContext rsID : tracker.getValues(dbsnp.dbsnp, ref.getLocus()) ) rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR))); } @@ -260,6 +303,10 @@ public class ValidateVariants extends RodWalker { try { for (final ValidationType t : validationTypes) applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t); + + if (gvcf) { + ValidateGVCFVariant(vc); + } } catch (TribbleException e) { if ( WARN_ON_ERROR ) { numErrors++; @@ -279,6 +326,13 @@ public class ValidateVariants extends RodWalker { * @return never {@code null} but perhaps an empty set. */ private Collection calculateValidationTypesToApply(final List excludeTypes) { + + if (VALIDATE_GVCF && !excludeTypes.contains(ValidationType.ALLELES)) { + // Note: in a future version allele validation might be OK for GVCFs, if that happens + // this will be more complicated. + logger.warn("GVCF format is currently incompatible with allele validation. Not validating Alleles."); + excludeTypes.add(ValidationType.ALLELES); + } if (excludeTypes.isEmpty()) return Collections.singleton(ValidationType.ALL); final Set excludeTypeSet = new LinkedHashSet<>(excludeTypes); @@ -295,6 +349,13 @@ public class ValidateVariants extends RodWalker { } } + private void ValidateGVCFVariant(final VariantContext vc) { + if (!vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) { + throw new TribbleException.InternalCodecException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s", + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, vc.toStringWithoutGenotypes())); + } + } + private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set rsIDs, ValidationType t) { switch( t ) { case ALL: diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java index ec70eff8c..c64d5798f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java @@ -540,7 +540,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome * * @return a newly allocated GenomeLoc as loc but with start == start */ - public GenomeLoc setStart(GenomeLoc loc, int start) { + public static GenomeLoc setStart(GenomeLoc loc, int start) { return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); } @@ -554,7 +554,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome * * @return a newly allocated GenomeLoc as loc but with stop == stop */ - public GenomeLoc setStop(GenomeLoc loc, int stop) { + public static GenomeLoc setStop(GenomeLoc loc, int stop) { return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); }