From a1a3d5fcb08c80af1590df78f9aacfd000d28db2 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 12 Feb 2010 22:40:55 +0000 Subject: [PATCH] Support for reading in table of rsIDs -> dbSNP builds to back generate a dbSNP build X from a single file. Very useful indeed. dbSNP -> VC now captures the rsID in the context git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2837 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContext.java | 2 +- .../gatk/refdata/VariantContextAdaptors.java | 5 +- .../MendelianViolationEvaluator.java | 3 + .../varianteval2/VariantEval2Walker.java | 61 +++++++++++++++++-- 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index f1c874116..ab56b964b 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -215,7 +215,7 @@ public class VariantContext { * @param attributes */ public VariantContext(String name, GenomeLoc loc, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(name, loc, alleles, genotypeCollectionToMap(new HashMap(), genotypes), negLog10PError, filters, attributes); + this(name, loc, alleles, genotypes != null ? genotypeCollectionToMap(new HashMap(), genotypes) : null, negLog10PError, filters, attributes); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 502e139b8..91a870dcc 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -93,7 +93,10 @@ public class VariantContextAdaptors { alleles.add(new Allele(alt, false)); } - VariantContext vc = new VariantContext(name, dbsnp.getLocation(), alleles); + Map attributes = new HashMap(); + attributes.put("ID", dbsnp.getRS_ID()); + Collection genotypes = null; + VariantContext vc = new VariantContext(name, dbsnp.getLocation(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes); vc.validate(); return vc; } else diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java index 1e9909ba3..00e0cd95e 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java @@ -100,6 +100,9 @@ public class MendelianViolationEvaluator extends VariantEvaluator { Genotype dadG = vc.getGenotype(trio.dad); Genotype childG = vc.getGenotype(trio.child); + if ( momG == null || dadG == null || childG == null ) + throw new IllegalArgumentException(String.format("VariantContext didn't contain genotypes for expected trio members: mom=%s dad=%s child=%s", trio.mom, trio.dad, trio.child)); + if ( includeGenotype(momG) && includeGenotype(dadG) && includeGenotype(childG) ) { // all genotypes are good, so let's see if child is a violation diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java index eaff8adf5..c637cd076 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -17,6 +17,7 @@ import java.util.*; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.io.File; +import java.io.FileNotFoundException; // todo -- evalations should support comment lines // todo -- add Mendelian variable explanations (nDeNovo and nMissingTransmissions) @@ -120,6 +121,14 @@ public class VariantEval2Walker extends RodWalker { /** Right now we will only be looking at SNPS */ EnumSet ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP); + @Argument(shortName="rsID", fullName="rsID", doc="If provided, list of rsID and build number for capping known snps by their build date", required=false) + protected String rsIDFile = null; + + @Argument(shortName="maxRsIDBuild", fullName="maxRsIDBuild", doc="If provided, only variants with rsIDs <= maxRsIDBuild will be included in the set of knwon snps", required=false) + protected int maxRsIDBuild = Integer.MAX_VALUE; + + Set rsIDsToExclude = null; + // -------------------------------------------------------------------------------------------------------------- // // private walker data @@ -218,8 +227,50 @@ public class VariantEval2Walker extends RodWalker { if ( outputVCF != null ) writer = new VCFWriter(new File(outputVCF)); + + if ( rsIDFile != null ) { + if ( maxRsIDBuild == Integer.MAX_VALUE ) + throw new IllegalArgumentException("rsIDFile " + rsIDFile + " was given but associated max RSID build parameter wasn't available"); + rsIDsToExclude = getrsIDsToExclude(new File(rsIDFile), maxRsIDBuild); + } } + private static Set getrsIDsToExclude(File rsIDFile, int maxRsIDBuild) { + List toExclude = new LinkedList(); + + int n = 1; + try { + for ( String line : new xReadLines(rsIDFile) ) { + String parts[] = line.split(" "); + if ( parts.length != 2 ) + throw new StingException("Invalid rsID / build pair at " + n + " line = " + line ); + //System.out.printf("line %s %s %s%n", line, parts[0], parts[1]); + if ( Integer.valueOf(parts[1]) > maxRsIDBuild ) { + //System.out.printf("Excluding %s%n", line); + toExclude.add("rs"+parts[0]); + } + n++; + + if ( n % 1000000 == 0 ) + logger.info(String.format("Read %d rsIDs from rsID -> build file", n)); + } + } catch (FileNotFoundException e) { + throw new StingException(e.getMessage()); + } + + logger.info(String.format("Excluding %d of %d (%.2f%%) rsIDs found from builds > %d", + toExclude.size(), n, ((100.0 * toExclude.size())/n), maxRsIDBuild)); + + return new HashSet(toExclude); + } + + private final static String ID = "ID"; + private boolean excludeComp(VariantContext vc) { + String id = vc != null && vc.hasAttribute(ID) ? vc.getAttributeAsString(ID) : null; + boolean ex = rsIDsToExclude != null && id != null && rsIDsToExclude.contains(id); + //System.out.printf("Testing id %s ex=%b against %s%n", id, ex, vc); + return ex; + } private void determineAllEvalations() { evaluationClasses = PackageUtils.getClassesImplementingInterface(VariantEvaluator.class); @@ -426,18 +477,20 @@ public class VariantEval2Walker extends RodWalker { // todo -- we need to deal with dbSNP where there can be multiple records at the same start site. A potential solution is to // todo -- allow the variant evaluation to specify the type of variants it wants to see and only take the first such record at a site Map bindings = new HashMap(); - bindVariantContexts(bindings, evalNames, tracker, context); - bindVariantContexts(bindings, compNames, tracker, context); + bindVariantContexts(bindings, evalNames, tracker, context, false); + bindVariantContexts(bindings, compNames, tracker, context, true); return bindings; } private void bindVariantContexts(Map map, Collection names, - RefMetaDataTracker tracker, AlignmentContext context ) { + RefMetaDataTracker tracker, AlignmentContext context, boolean allowExcludes ) { for ( String name : names ) { Collection contexts = tracker.getVariantContexts(name, ALLOW_VARIANT_CONTEXT_TYPES, context.getLocation(), true, true); if ( context.size() > 1 ) throw new StingException("Found multiple variant contexts at " + context.getLocation()); - map.put(name, contexts.size() == 1 ? contexts.iterator().next() : null); + + VariantContext vc = contexts.size() == 1 ? contexts.iterator().next() : null; + map.put(name, allowExcludes && excludeComp(vc) ? null : vc); } }