diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java index d9da8b821..8d4ccac28 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -51,16 +51,19 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import htsjdk.samtools.util.TestUtil; import org.apache.commons.io.FileUtils; import org.apache.log4j.Level; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; import java.io.IOException; import java.util.Arrays; +import java.util.Collections; public class ValidateVariantsIntegrationTest extends WalkerTest { @@ -279,7 +282,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ", - 0, Arrays.asList(EMPTY_MD5)); + 0, Collections.singletonList(EMPTY_MD5)); executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec); } @@ -288,7 +291,78 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ", - 0, Arrays.asList(EMPTY_MD5)); + 0, Collections.singletonList(EMPTY_MD5)); executeTest("test to allow wrong header contig length, no compatibility checks", spec); } + + @Test + public void testGoodGvcf() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet.g.vcf", "ALL", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e")); + executeTest("tests correct gvcf", spec); + } + + @Test + public void testGoodGvcfExcludingAlleles() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e")); + executeTest("tests correct gvcf", spec); + } + + + @Test(expectedExceptions = RuntimeException.class ) + public void testBadGvcfMissingNON_REF() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet.BAD_MISSING_NON_REF.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of missing NON_REF allele", spec); + } + + @Test(expectedExceptions = RuntimeException.class ) + public void testBadGvcfRegions() { + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("diploid-gvcf.bad-IncompleteRegion.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of non-complete region", spec); + } + + @Test(expectedExceptions = RuntimeException.class ) + public void testNonOverlappingRegions() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12891.AS.chr20snippet_BAD_INCOMPLETE_REGION.g.vcf", "-ALLELES", "Y:4966254-4967190", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of non-complete region", spec); + } + + @Test + public void testNonOverlappingRegionsBP_RESOLUTION() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("gvcf.basepairResolution.vcf", "-ALLELES", "20:10000000-10010000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("tests capture of non-complete region, on BP_RESOLUTION gvcf", spec); + } + @Test + public void testCorrectCreationOfBlocks() throws IOException { + final File tempDir = IOUtils.tempDir("RefBlocks", "test", new File(privateTestDir)); + tempDir.mkdir(); + tempDir.deleteOnExit(); + final File output = File.createTempFile("RefBlocks", ".g.vcf", tempDir); + String baseIntervals = " 1:1-100 -L 5:1-200 "; + String intervalString = " -L " + baseIntervals; + final WalkerTestSpec hc = new WalkerTestSpec("-T HaplotypeCaller " + intervalString + " -I " + privateTestDir + "NA12878.4.snippet.bam " + + " -R /humgen/1kg/reference/human_g1k_v37_decoy.fasta -ERC GVCF -o " + output, Collections.singletonList(EMPTY_MD5)); + executeTest("running hc", hc); + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(tempDir.getName() + "/" + output.getName(), "-ALLELES", baseIntervals, b37KGReference) + " -gvcf --reference_window_stop 208 -U ", + 0, Collections.singletonList(EMPTY_MD5)); + executeTest("testing the correct creation of reference blocks", spec); + + TestUtil.recursiveDelete(tempDir); + } + } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index e34c78111..653bdaf50 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -26,23 +26,26 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import htsjdk.tribble.TribbleException; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.Reference; -import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.engine.walkers.Window; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; -import org.broadinstitute.gatk.utils.help.HelpConstants; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; +import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.gatk.engine.walkers.Reference; +import org.broadinstitute.gatk.engine.walkers.RodWalker; +import org.broadinstitute.gatk.engine.walkers.Window; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.io.File; import java.util.*; @@ -122,7 +125,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) -public class ValidateVariants extends RodWalker { +public class ValidateVariants extends RodWalker { // Log message for a reference allele that is too long protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long"; @@ -184,12 +187,21 @@ public class ValidateVariants extends RodWalker { /** * By default, even filtered records are validated. */ - @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false) + @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false, exclusiveOf = "VALIDATE_GVCF") protected Boolean DO_NOT_VALIDATE_FILTERED = false; @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false) protected Boolean WARN_ON_ERROR = false; + /** + * Validate this file as a gvcf. In particular, every variant must include a allele, and that + * every base in the territory under consideration is covered by a variant (or a reference block). + * If you specifed intervals (using -L or -XL) to restrict analysis to a subset of genomic regions, + * those intervals will need to be covered in a valid gvcf. + */ + @Argument(fullName = "validateGVCF", shortName = "gvcf", doc = "Validate this file as a GVCF", required = false, exclusiveOf = "DO_NOT_VALIDATE_FILTERED") + protected Boolean VALIDATE_GVCF = false; + private long numErrors = 0; private File file = null; @@ -208,29 +220,60 @@ public class ValidateVariants extends RodWalker { referenceWindowStop = getToolkit().getArguments().reference_window_stop; } - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) - return 0; + return null; + + int lastVcEnd = -1; Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - for ( VariantContext vc : VCs ) - validate(vc, tracker, ref); + if (VALIDATE_GVCF && VCs.size() > 1) { + logger.error("GVCF validation can only be performed one file at a time. Validation results are invalid."); + } + for ( VariantContext vc : VCs ) { + validate(vc, tracker, ref, VALIDATE_GVCF); + lastVcEnd = vc.getEnd(); + } - return VCs.size(); + return GenomeLoc.setStop(ref.getLocus(), lastVcEnd); } - public Integer reduceInit() { return 0; } + @Override + public GenomeLocSortedSet reduce(GenomeLoc value, GenomeLocSortedSet sum) { + sum.add(value, true); + return sum; + } - public Integer reduce(Integer value, Integer sum) { return sum+value; } + @Override + public GenomeLocSortedSet reduceInit() { + return new GenomeLocSortedSet(getToolkit().getGenomeLocParser()); + } - public void onTraversalDone(Integer result) { - if ( numErrors == 0 ) - System.out.println("Successfully validated the input file. Checked " + result + " records with no failures."); + @Override + public void onTraversalDone(GenomeLocSortedSet result) { + if (VALIDATE_GVCF) { + final GenomeLocSortedSet uncoveredIntervals = getToolkit().getIntervals().subtractRegions(result); + if (uncoveredIntervals.coveredSize() > 0) { + final UserException e = new UserException.FailsStrictValidation(file, "A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() + + " loci with no VariantContext covering it. The first uncovered segment is:" + + uncoveredIntervals.iterator().next()); + + if (WARN_ON_ERROR) { + numErrors++; + logger.warn("***** " + e.getMessage() + " *****"); + } else { + throw e; + } + } + } + + if (numErrors == 0) + System.out.println("Successfully validated the input file. Checked " + result.size() + " records with no failures."); else System.out.println("Found " + numErrors + " records with failures."); } - private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) { + private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, boolean gvcf) { if ( DO_NOT_VALIDATE_FILTERED && vc.isFiltered() ) return; @@ -241,7 +284,7 @@ public class ValidateVariants extends RodWalker { // reference length is greater than the reference window stop before and after expansion if ( refLength > 100 && refLength > referenceWindowStop ) { logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d", - REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength)); + REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getContig(), vc.getStart(), refLength)); return; } @@ -252,7 +295,7 @@ public class ValidateVariants extends RodWalker { // get the RS IDs Set rsIDs = null; if ( tracker.hasValues(dbsnp.dbsnp) ) { - rsIDs = new HashSet(); + rsIDs = new HashSet<>(); for ( VariantContext rsID : tracker.getValues(dbsnp.dbsnp, ref.getLocus()) ) rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR))); } @@ -260,6 +303,10 @@ public class ValidateVariants extends RodWalker { try { for (final ValidationType t : validationTypes) applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t); + + if (gvcf) { + ValidateGVCFVariant(vc); + } } catch (TribbleException e) { if ( WARN_ON_ERROR ) { numErrors++; @@ -279,6 +326,13 @@ public class ValidateVariants extends RodWalker { * @return never {@code null} but perhaps an empty set. */ private Collection calculateValidationTypesToApply(final List excludeTypes) { + + if (VALIDATE_GVCF && !excludeTypes.contains(ValidationType.ALLELES)) { + // Note: in a future version allele validation might be OK for GVCFs, if that happens + // this will be more complicated. + logger.warn("GVCF format is currently incompatible with allele validation. Not validating Alleles."); + excludeTypes.add(ValidationType.ALLELES); + } if (excludeTypes.isEmpty()) return Collections.singleton(ValidationType.ALL); final Set excludeTypeSet = new LinkedHashSet<>(excludeTypes); @@ -295,6 +349,13 @@ public class ValidateVariants extends RodWalker { } } + private void ValidateGVCFVariant(final VariantContext vc) { + if (!vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) { + throw new TribbleException.InternalCodecException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s", + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, vc.toStringWithoutGenotypes())); + } + } + private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set rsIDs, ValidationType t) { switch( t ) { case ALL: diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java index ec70eff8c..c64d5798f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLoc.java @@ -540,7 +540,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome * * @return a newly allocated GenomeLoc as loc but with start == start */ - public GenomeLoc setStart(GenomeLoc loc, int start) { + public static GenomeLoc setStart(GenomeLoc loc, int start) { return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); } @@ -554,7 +554,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome * * @return a newly allocated GenomeLoc as loc but with stop == stop */ - public GenomeLoc setStop(GenomeLoc loc, int stop) { + public static GenomeLoc setStop(GenomeLoc loc, int stop) { return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); }