First pass at a VCF validator. Will test more tonight.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4524 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-10-19 19:55:49 +00:00
parent 341e93ee12
commit 7a291a8ff3
2 changed files with 271 additions and 0 deletions

View File

@ -0,0 +1,155 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import java.util.*;
import java.io.File;
/**
* Validates a variants file.
*/
@Reference(window=@Window(start=0,stop=100))
@Requires(value={},referenceMetaData=@RMD(name=ValidateVariants.TARGET_ROD_NAME, type=VariantContext.class))
public class ValidateVariants extends RodWalker<Integer, Integer> {
protected static final String TARGET_ROD_NAME = "variant";
public enum ValidationType {
ALL, REF, IDS, ALLELES, CHR_COUNTS
}
@Hidden
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
protected ValidationType type = ValidationType.ALL;
private File file = null;
public void initialize() {
for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) {
if ( source.getName().equals(TARGET_ROD_NAME) ) {
file = source.getReferenceOrderedData().getFile();
break;
}
}
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
Collection<VariantContext> VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false);
for ( VariantContext vc : VCs )
validate(vc, tracker, ref);
return VCs.size();
}
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) { return sum+value; }
public void onTraversalDone(Integer result) {
System.out.println("Successfully validated the input file. Checked " + result + " records with no failures.");
}
private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) {
// get the true reference allele
Allele reportedRefAllele = vc.getReference();
Allele observedRefAllele;
// insertions
if ( reportedRefAllele.isNull() ) {
observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING);
}
// SNPs
else if ( reportedRefAllele.length() == 1 ) {
byte[] refByte = new byte[1];
refByte[0] = ref.getBase();
observedRefAllele = Allele.create(refByte, true);
}
// deletions
else {
// we can't validate arbitrarily long deletions
if ( reportedRefAllele.length() > 100 ) {
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart()));
return;
}
byte[] refBytes = ref.getBases();
byte[] trueRef = new byte[reportedRefAllele.length()];
for (int i = 0; i < reportedRefAllele.length(); i++)
trueRef[i] = refBytes[i+1];
observedRefAllele = Allele.create(trueRef, true);
}
// get the RS IDs
Set<String> rsIDs = null;
if ( tracker.hasROD(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
List<Object> dbsnpList = tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME);
rsIDs = new HashSet<String>();
for ( Object d : dbsnpList ) {
if (d instanceof DbSNPFeature )
rsIDs.add(((DbSNPFeature)d).getRsID());
}
}
try {
switch( type ) {
case ALL:
vc.extraStrictValidation(observedRefAllele, rsIDs);
break;
case REF:
vc.validateReferenceBases(observedRefAllele);
break;
case IDS:
vc.validateRSIDs(rsIDs);
break;
case ALLELES:
vc.validateAlternateAlleles();
break;
case CHR_COUNTS:
vc.validateChromosomeCounts();
break;
}
} catch (TribbleException e) {
throw new UserException.MalformedFile(file, e.getMessage());
}
}
}

View File

@ -0,0 +1,116 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.junit.Test;
import java.util.Arrays;
public class ValidateVariantsIntegrationTest extends WalkerTest {
public static String baseTestString(String file, String type) {
return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 -B:variant,VCF " + validationDataLocation + file + " --validationType " + type;
}
@Test
public void testGoodFile() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleGood.vcf", "ALL"),
0,
Arrays.asList("")
);
executeTest("test good file", spec);
}
@Test
public void testBadRefBase1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "REF"),
0,
UserException.MalformedFile.class
);
executeTest("test bad ref base #1", spec);
}
@Test
public void testBadRefBase2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad2.vcf", "REF"),
0,
UserException.MalformedFile.class
);
executeTest("test bad ref base #2", spec);
}
@Test
public void testBadChrCount1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "CHR_COUNTS"),
0,
UserException.MalformedFile.class
);
executeTest("test bad chr counts #1", spec);
}
@Test
public void testBadChrCount2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad2.vcf", "CHR_COUNTS"),
0,
UserException.MalformedFile.class
);
executeTest("test bad chr counts #2", spec);
}
@Test
public void testBadID() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "IDS") + " -D " + GATKDataLocation + "dbsnp_129_b36.rod",
0,
UserException.MalformedFile.class
);
executeTest("test bad RS ID", spec);
}
@Test
public void testBadAllele() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("validationExampleBad.vcf", "ALLELES"),
0,
UserException.MalformedFile.class
);
executeTest("test bad alt allele", spec);
}
}