Added option to validate gvcf (for ValidateVariants) (#1379)

* with option --gvcf CLP will now put extra checks that a gvcf must adhere to (existance of <NON_REF> allele at every variant, and that the variants in total cover the entire requested intervals, or the whole genome if no intervals have been specified)
* works on gvcf produced by HC when using either GVCF or BP_RESOLUTION mode
* added positive and negative tests
This commit is contained in:
Yossi Farjoun 2016-05-26 06:42:45 -04:00
parent e1fadae139
commit 25fa25b618
3 changed files with 168 additions and 33 deletions

View File

@ -51,16 +51,19 @@
package org.broadinstitute.gatk.tools.walkers.variantutils; package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.samtools.util.TestUtil;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level; import org.apache.log4j.Level;
import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.io.IOUtils;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
public class ValidateVariantsIntegrationTest extends WalkerTest { public class ValidateVariantsIntegrationTest extends WalkerTest {
@ -279,7 +282,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ", baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ALLOW_SEQ_DICT_INCOMPATIBILITY ",
0, Arrays.asList(EMPTY_MD5)); 0, Collections.singletonList(EMPTY_MD5));
executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec); executeTest("test to allow wrong header contig length, not checking dictionary incompatibility", spec);
} }
@ -288,7 +291,78 @@ public class ValidateVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ", baseTestString("longAlleles-wrongLength.vcf", "ALL", "1", b37KGReference) + " --reference_window_stop 208 -U ",
0, Arrays.asList(EMPTY_MD5)); 0, Collections.singletonList(EMPTY_MD5));
executeTest("test to allow wrong header contig length, no compatibility checks", spec); executeTest("test to allow wrong header contig length, no compatibility checks", spec);
} }
@Test
public void testGoodGvcf() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12891.AS.chr20snippet.g.vcf", "ALL", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e"));
executeTest("tests correct gvcf", spec);
}
@Test
public void testGoodGvcfExcludingAlleles() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12891.AS.chr20snippet.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList("d41d8cd98f00b204e9800998ecf8427e"));
executeTest("tests correct gvcf", spec);
}
@Test(expectedExceptions = RuntimeException.class )
public void testBadGvcfMissingNON_REF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12891.AS.chr20snippet.BAD_MISSING_NON_REF.g.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList(EMPTY_MD5));
executeTest("tests capture of missing NON_REF allele", spec);
}
@Test(expectedExceptions = RuntimeException.class )
public void testBadGvcfRegions() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("diploid-gvcf.bad-IncompleteRegion.vcf", "-ALLELES", "20:10433000-10437000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList(EMPTY_MD5));
executeTest("tests capture of non-complete region", spec);
}
@Test(expectedExceptions = RuntimeException.class )
public void testNonOverlappingRegions() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12891.AS.chr20snippet_BAD_INCOMPLETE_REGION.g.vcf", "-ALLELES", "Y:4966254-4967190", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList(EMPTY_MD5));
executeTest("tests capture of non-complete region", spec);
}
@Test
public void testNonOverlappingRegionsBP_RESOLUTION() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("gvcf.basepairResolution.vcf", "-ALLELES", "20:10000000-10010000", b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList(EMPTY_MD5));
executeTest("tests capture of non-complete region, on BP_RESOLUTION gvcf", spec);
}
@Test
public void testCorrectCreationOfBlocks() throws IOException {
final File tempDir = IOUtils.tempDir("RefBlocks", "test", new File(privateTestDir));
tempDir.mkdir();
tempDir.deleteOnExit();
final File output = File.createTempFile("RefBlocks", ".g.vcf", tempDir);
String baseIntervals = " 1:1-100 -L 5:1-200 ";
String intervalString = " -L " + baseIntervals;
final WalkerTestSpec hc = new WalkerTestSpec("-T HaplotypeCaller " + intervalString + " -I " + privateTestDir + "NA12878.4.snippet.bam " +
" -R /humgen/1kg/reference/human_g1k_v37_decoy.fasta -ERC GVCF -o " + output, Collections.singletonList(EMPTY_MD5));
executeTest("running hc", hc);
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(tempDir.getName() + "/" + output.getName(), "-ALLELES", baseIntervals, b37KGReference) + " -gvcf --reference_window_stop 208 -U ",
0, Collections.singletonList(EMPTY_MD5));
executeTest("testing the correct creation of reference blocks", spec);
TestUtil.recursiveDelete(tempDir);
}
} }

View File

@ -26,23 +26,26 @@
package org.broadinstitute.gatk.tools.walkers.variantutils; package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.tribble.TribbleException; import htsjdk.tribble.TribbleException;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.Reference;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.Window;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.gatk.engine.walkers.Reference;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.Window;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import java.io.File; import java.io.File;
import java.util.*; import java.util.*;
@ -122,7 +125,7 @@ import java.util.*;
*/ */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VALIDATION, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100)) @Reference(window=@Window(start=0,stop=100))
public class ValidateVariants extends RodWalker<Integer, Integer> { public class ValidateVariants extends RodWalker<GenomeLoc, GenomeLocSortedSet> {
// Log message for a reference allele that is too long // Log message for a reference allele that is too long
protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long"; protected static final String REFERENCE_ALLELE_TOO_LONG_MSG = "Reference allele is too long";
@ -184,12 +187,21 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
/** /**
* By default, even filtered records are validated. * By default, even filtered records are validated.
*/ */
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false) @Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false, exclusiveOf = "VALIDATE_GVCF")
protected Boolean DO_NOT_VALIDATE_FILTERED = false; protected Boolean DO_NOT_VALIDATE_FILTERED = false;
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false) @Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false)
protected Boolean WARN_ON_ERROR = false; protected Boolean WARN_ON_ERROR = false;
/**
* Validate this file as a gvcf. In particular, every variant must include a <NON_REF> allele, and that
* every base in the territory under consideration is covered by a variant (or a reference block).
* If you specifed intervals (using -L or -XL) to restrict analysis to a subset of genomic regions,
* those intervals will need to be covered in a valid gvcf.
*/
@Argument(fullName = "validateGVCF", shortName = "gvcf", doc = "Validate this file as a GVCF", required = false, exclusiveOf = "DO_NOT_VALIDATE_FILTERED")
protected Boolean VALIDATE_GVCF = false;
private long numErrors = 0; private long numErrors = 0;
private File file = null; private File file = null;
@ -208,29 +220,60 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
referenceWindowStop = getToolkit().getArguments().reference_window_stop; referenceWindowStop = getToolkit().getArguments().reference_window_stop;
} }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) if ( tracker == null )
return 0; return null;
int lastVcEnd = -1;
Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation()); Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
for ( VariantContext vc : VCs ) if (VALIDATE_GVCF && VCs.size() > 1) {
validate(vc, tracker, ref); logger.error("GVCF validation can only be performed one file at a time. Validation results are invalid.");
}
for ( VariantContext vc : VCs ) {
validate(vc, tracker, ref, VALIDATE_GVCF);
lastVcEnd = vc.getEnd();
}
return VCs.size(); return GenomeLoc.setStop(ref.getLocus(), lastVcEnd);
} }
public Integer reduceInit() { return 0; } @Override
public GenomeLocSortedSet reduce(GenomeLoc value, GenomeLocSortedSet sum) {
sum.add(value, true);
return sum;
}
public Integer reduce(Integer value, Integer sum) { return sum+value; } @Override
public GenomeLocSortedSet reduceInit() {
return new GenomeLocSortedSet(getToolkit().getGenomeLocParser());
}
public void onTraversalDone(Integer result) { @Override
if ( numErrors == 0 ) public void onTraversalDone(GenomeLocSortedSet result) {
System.out.println("Successfully validated the input file. Checked " + result + " records with no failures."); if (VALIDATE_GVCF) {
final GenomeLocSortedSet uncoveredIntervals = getToolkit().getIntervals().subtractRegions(result);
if (uncoveredIntervals.coveredSize() > 0) {
final UserException e = new UserException.FailsStrictValidation(file, "A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() +
" loci with no VariantContext covering it. The first uncovered segment is:" +
uncoveredIntervals.iterator().next());
if (WARN_ON_ERROR) {
numErrors++;
logger.warn("***** " + e.getMessage() + " *****");
} else {
throw e;
}
}
}
if (numErrors == 0)
System.out.println("Successfully validated the input file. Checked " + result.size() + " records with no failures.");
else else
System.out.println("Found " + numErrors + " records with failures."); System.out.println("Found " + numErrors + " records with failures.");
} }
private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) { private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, boolean gvcf) {
if ( DO_NOT_VALIDATE_FILTERED && vc.isFiltered() ) if ( DO_NOT_VALIDATE_FILTERED && vc.isFiltered() )
return; return;
@ -241,7 +284,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
// reference length is greater than the reference window stop before and after expansion // reference length is greater than the reference window stop before and after expansion
if ( refLength > 100 && refLength > referenceWindowStop ) { if ( refLength > 100 && refLength > referenceWindowStop ) {
logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d", logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --referenceWindowStop >= %d",
REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getChr(), vc.getStart(), refLength)); REFERENCE_ALLELE_TOO_LONG_MSG, refLength, vc.getContig(), vc.getStart(), refLength));
return; return;
} }
@ -252,7 +295,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
// get the RS IDs // get the RS IDs
Set<String> rsIDs = null; Set<String> rsIDs = null;
if ( tracker.hasValues(dbsnp.dbsnp) ) { if ( tracker.hasValues(dbsnp.dbsnp) ) {
rsIDs = new HashSet<String>(); rsIDs = new HashSet<>();
for ( VariantContext rsID : tracker.getValues(dbsnp.dbsnp, ref.getLocus()) ) for ( VariantContext rsID : tracker.getValues(dbsnp.dbsnp, ref.getLocus()) )
rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR))); rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR)));
} }
@ -260,6 +303,10 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
try { try {
for (final ValidationType t : validationTypes) for (final ValidationType t : validationTypes)
applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t); applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
if (gvcf) {
ValidateGVCFVariant(vc);
}
} catch (TribbleException e) { } catch (TribbleException e) {
if ( WARN_ON_ERROR ) { if ( WARN_ON_ERROR ) {
numErrors++; numErrors++;
@ -279,6 +326,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
* @return never {@code null} but perhaps an empty set. * @return never {@code null} but perhaps an empty set.
*/ */
private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) { private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) {
if (VALIDATE_GVCF && !excludeTypes.contains(ValidationType.ALLELES)) {
// Note: in a future version allele validation might be OK for GVCFs, if that happens
// this will be more complicated.
logger.warn("GVCF format is currently incompatible with allele validation. Not validating Alleles.");
excludeTypes.add(ValidationType.ALLELES);
}
if (excludeTypes.isEmpty()) if (excludeTypes.isEmpty())
return Collections.singleton(ValidationType.ALL); return Collections.singleton(ValidationType.ALL);
final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypes); final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypes);
@ -295,6 +349,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
} }
} }
private void ValidateGVCFVariant(final VariantContext vc) {
if (!vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) {
throw new TribbleException.InternalCodecException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s",
GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, vc.toStringWithoutGenotypes()));
}
}
private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) { private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) {
switch( t ) { switch( t ) {
case ALL: case ALL:

View File

@ -540,7 +540,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
* *
* @return a newly allocated GenomeLoc as loc but with start == start * @return a newly allocated GenomeLoc as loc but with start == start
*/ */
public GenomeLoc setStart(GenomeLoc loc, int start) { public static GenomeLoc setStart(GenomeLoc loc, int start) {
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop()); return new GenomeLoc(loc.getContig(), loc.getContigIndex(), start, loc.getStop());
} }
@ -554,7 +554,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
* *
* @return a newly allocated GenomeLoc as loc but with stop == stop * @return a newly allocated GenomeLoc as loc but with stop == stop
*/ */
public GenomeLoc setStop(GenomeLoc loc, int stop) { public static GenomeLoc setStop(GenomeLoc loc, int stop) {
return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop); return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start, stop);
} }