From 7a291a8ff3968144458f62870bc657ede371ba16 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 19 Oct 2010 19:55:49 +0000 Subject: [PATCH] First pass at a VCF validator. Will test more tonight. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4524 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantutils/ValidateVariants.java | 155 ++++++++++++++++++ .../ValidateVariantsIntegrationTest.java | 116 +++++++++++++ 2 files changed, 271 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java new file mode 100755 index 000000000..c526f5415 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.dbsnp.DbSNPFeature; +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; + +import java.util.*; +import java.io.File; + + +/** + * Validates a variants file. + */ +@Reference(window=@Window(start=0,stop=100)) +@Requires(value={},referenceMetaData=@RMD(name=ValidateVariants.TARGET_ROD_NAME, type=VariantContext.class)) +public class ValidateVariants extends RodWalker { + + protected static final String TARGET_ROD_NAME = "variant"; + + public enum ValidationType { + ALL, REF, IDS, ALLELES, CHR_COUNTS + } + + @Hidden + @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) + protected ValidationType type = ValidationType.ALL; + + private File file = null; + + public void initialize() { + for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) { + if ( source.getName().equals(TARGET_ROD_NAME) ) { + file = source.getReferenceOrderedData().getFile(); + break; + } + } + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + for ( VariantContext vc : VCs ) + validate(vc, tracker, ref); + + return VCs.size(); + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { return sum+value; } + + public void onTraversalDone(Integer result) { + System.out.println("Successfully validated the input file. Checked " + result + " records with no failures."); + } + + private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) { + // get the true reference allele + Allele reportedRefAllele = vc.getReference(); + Allele observedRefAllele; + // insertions + if ( reportedRefAllele.isNull() ) { + observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING); + } + // SNPs + else if ( reportedRefAllele.length() == 1 ) { + byte[] refByte = new byte[1]; + refByte[0] = ref.getBase(); + observedRefAllele = Allele.create(refByte, true); + } + // deletions + else { + // we can't validate arbitrarily long deletions + if ( reportedRefAllele.length() > 100 ) { + logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart())); + return; + } + + byte[] refBytes = ref.getBases(); + byte[] trueRef = new byte[reportedRefAllele.length()]; + for (int i = 0; i < reportedRefAllele.length(); i++) + trueRef[i] = refBytes[i+1]; + observedRefAllele = Allele.create(trueRef, true); + } + + // get the RS IDs + Set rsIDs = null; + if ( tracker.hasROD(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + List dbsnpList = tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME); + rsIDs = new HashSet(); + for ( Object d : dbsnpList ) { + if (d instanceof DbSNPFeature ) + rsIDs.add(((DbSNPFeature)d).getRsID()); + } + } + + try { + switch( type ) { + case ALL: + vc.extraStrictValidation(observedRefAllele, rsIDs); + break; + case REF: + vc.validateReferenceBases(observedRefAllele); + break; + case IDS: + vc.validateRSIDs(rsIDs); + break; + case ALLELES: + vc.validateAlternateAlleles(); + break; + case CHR_COUNTS: + vc.validateChromosomeCounts(); + break; + } + } catch (TribbleException e) { + throw new UserException.MalformedFile(file, e.getMessage()); + } + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java new file mode 100755 index 000000000..b50fd09a4 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.junit.Test; + +import java.util.Arrays; + +public class ValidateVariantsIntegrationTest extends WalkerTest { + + public static String baseTestString(String file, String type) { + return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 -B:variant,VCF " + validationDataLocation + file + " --validationType " + type; + } + + @Test + public void testGoodFile() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleGood.vcf", "ALL"), + 0, + Arrays.asList("") + ); + + executeTest("test good file", spec); + } + + @Test + public void testBadRefBase1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad.vcf", "REF"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad ref base #1", spec); + } + + @Test + public void testBadRefBase2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad2.vcf", "REF"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad ref base #2", spec); + } + + @Test + public void testBadChrCount1() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad.vcf", "CHR_COUNTS"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad chr counts #1", spec); + } + + @Test + public void testBadChrCount2() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad2.vcf", "CHR_COUNTS"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad chr counts #2", spec); + } + + @Test + public void testBadID() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad.vcf", "IDS") + " -D " + GATKDataLocation + "dbsnp_129_b36.rod", + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad RS ID", spec); + } + + @Test + public void testBadAllele() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad.vcf", "ALLELES"), + 0, + UserException.MalformedFile.class + ); + + executeTest("test bad alt allele", spec); + } +} \ No newline at end of file