First pass at a VCF validator. Will test more tonight.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4524 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
341e93ee12
commit
7a291a8ff3
|
|
@ -0,0 +1,155 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||||
|
import org.broad.tribble.TribbleException;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.Hidden;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates a variants file.
|
||||||
|
*/
|
||||||
|
@Reference(window=@Window(start=0,stop=100))
|
||||||
|
@Requires(value={},referenceMetaData=@RMD(name=ValidateVariants.TARGET_ROD_NAME, type=VariantContext.class))
|
||||||
|
public class ValidateVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
|
protected static final String TARGET_ROD_NAME = "variant";
|
||||||
|
|
||||||
|
public enum ValidationType {
|
||||||
|
ALL, REF, IDS, ALLELES, CHR_COUNTS
|
||||||
|
}
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
|
||||||
|
protected ValidationType type = ValidationType.ALL;
|
||||||
|
|
||||||
|
private File file = null;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
for ( ReferenceOrderedDataSource source : getToolkit().getRodDataSources() ) {
|
||||||
|
if ( source.getName().equals(TARGET_ROD_NAME) ) {
|
||||||
|
file = source.getReferenceOrderedData().getFile();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
Collection<VariantContext> VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false);
|
||||||
|
for ( VariantContext vc : VCs )
|
||||||
|
validate(vc, tracker, ref);
|
||||||
|
|
||||||
|
return VCs.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
public Integer reduce(Integer value, Integer sum) { return sum+value; }
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
System.out.println("Successfully validated the input file. Checked " + result + " records with no failures.");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validate(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref) {
|
||||||
|
// get the true reference allele
|
||||||
|
Allele reportedRefAllele = vc.getReference();
|
||||||
|
Allele observedRefAllele;
|
||||||
|
// insertions
|
||||||
|
if ( reportedRefAllele.isNull() ) {
|
||||||
|
observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING);
|
||||||
|
}
|
||||||
|
// SNPs
|
||||||
|
else if ( reportedRefAllele.length() == 1 ) {
|
||||||
|
byte[] refByte = new byte[1];
|
||||||
|
refByte[0] = ref.getBase();
|
||||||
|
observedRefAllele = Allele.create(refByte, true);
|
||||||
|
}
|
||||||
|
// deletions
|
||||||
|
else {
|
||||||
|
// we can't validate arbitrarily long deletions
|
||||||
|
if ( reportedRefAllele.length() > 100 ) {
|
||||||
|
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart()));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte[] refBytes = ref.getBases();
|
||||||
|
byte[] trueRef = new byte[reportedRefAllele.length()];
|
||||||
|
for (int i = 0; i < reportedRefAllele.length(); i++)
|
||||||
|
trueRef[i] = refBytes[i+1];
|
||||||
|
observedRefAllele = Allele.create(trueRef, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the RS IDs
|
||||||
|
Set<String> rsIDs = null;
|
||||||
|
if ( tracker.hasROD(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||||
|
List<Object> dbsnpList = tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME);
|
||||||
|
rsIDs = new HashSet<String>();
|
||||||
|
for ( Object d : dbsnpList ) {
|
||||||
|
if (d instanceof DbSNPFeature )
|
||||||
|
rsIDs.add(((DbSNPFeature)d).getRsID());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
switch( type ) {
|
||||||
|
case ALL:
|
||||||
|
vc.extraStrictValidation(observedRefAllele, rsIDs);
|
||||||
|
break;
|
||||||
|
case REF:
|
||||||
|
vc.validateReferenceBases(observedRefAllele);
|
||||||
|
break;
|
||||||
|
case IDS:
|
||||||
|
vc.validateRSIDs(rsIDs);
|
||||||
|
break;
|
||||||
|
case ALLELES:
|
||||||
|
vc.validateAlternateAlleles();
|
||||||
|
break;
|
||||||
|
case CHR_COUNTS:
|
||||||
|
vc.validateChromosomeCounts();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (TribbleException e) {
|
||||||
|
throw new UserException.MalformedFile(file, e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,116 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public class ValidateVariantsIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
|
public static String baseTestString(String file, String type) {
|
||||||
|
return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 -B:variant,VCF " + validationDataLocation + file + " --validationType " + type;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGoodFile() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleGood.vcf", "ALL"),
|
||||||
|
0,
|
||||||
|
Arrays.asList("")
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test good file", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadRefBase1() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleBad.vcf", "REF"),
|
||||||
|
0,
|
||||||
|
UserException.MalformedFile.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test bad ref base #1", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadRefBase2() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleBad2.vcf", "REF"),
|
||||||
|
0,
|
||||||
|
UserException.MalformedFile.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test bad ref base #2", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadChrCount1() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleBad.vcf", "CHR_COUNTS"),
|
||||||
|
0,
|
||||||
|
UserException.MalformedFile.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test bad chr counts #1", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadChrCount2() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleBad2.vcf", "CHR_COUNTS"),
|
||||||
|
0,
|
||||||
|
UserException.MalformedFile.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test bad chr counts #2", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadID() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleBad.vcf", "IDS") + " -D " + GATKDataLocation + "dbsnp_129_b36.rod",
|
||||||
|
0,
|
||||||
|
UserException.MalformedFile.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test bad RS ID", spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadAllele() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString("validationExampleBad.vcf", "ALLELES"),
|
||||||
|
0,
|
||||||
|
UserException.MalformedFile.class
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("test bad alt allele", spec);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue