From 04fdbbfa650a9c204e7ab57c5182d85ab2940e4e Mon Sep 17 00:00:00 2001 From: kiran Date: Thu, 14 Jan 2010 17:45:58 +0000 Subject: [PATCH] This is the beginning of a new version of VariantEval that can cut VCF files up in a variety of ways with JEXL expressions, select one sample out of a multi-sample VCF, and can load analysis modules dynamically. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2584 348d0f76-0448-11de-a6fe-93d51630548a --- .../newvarianteval/CounterEvaluation.java | 42 +++++ .../newvarianteval/NewVariantEval.java | 153 ++++++++++++++++++ .../SubstitutionEvaluation.java | 65 ++++++++ .../newvarianteval/VariantEvaluation.java | 18 +++ 4 files changed, 278 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/CounterEvaluation.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/NewVariantEval.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/SubstitutionEvaluation.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/VariantEvaluation.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/CounterEvaluation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/CounterEvaluation.java new file mode 100755 index 000000000..1078838f8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/CounterEvaluation.java @@ -0,0 +1,42 @@ +package org.broadinstitute.sting.playground.gatk.walkers.diagnostics.newvarianteval; + +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; + +import java.util.List; +import java.util.ArrayList; + +public class CounterEvaluation implements VariantEvaluation { + private int numVariants = 0; + private int numCalls = 0; + private int numKnown = 0; + + public CounterEvaluation() {} + + public void update(RodVCF variant, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + numVariants += (variant != null && variant.isSNP()) ? 1 : 0; + numKnown += (variant != null && variant.isSNP() && !variant.getID().equals(".")) ? 1 : 0; + numCalls++; + } + + public String getName() { + return "Counter"; + } + + public List getResult() { + ArrayList results = new ArrayList(); + + double variantRate = ((double) numVariants)/((double) numCalls); + double knownVariantsPct = 100.0*((double) numKnown)/((double) numVariants); + + results.add(String.format("regionSize=%d", numCalls)); + results.add(String.format("variants=%d", numVariants)); + results.add(String.format("variantsKnown=%d", numKnown)); + results.add(String.format("knownVariantsPct=%f", knownVariantsPct)); + results.add(String.format("variantRate=1/%d", Math.round(1.0/variantRate))); + + return results; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/NewVariantEval.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/NewVariantEval.java new file mode 100755 index 000000000..0667dbd6a --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/NewVariantEval.java @@ -0,0 +1,153 @@ +package org.broadinstitute.sting.playground.gatk.walkers.diagnostics.newvarianteval; + +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RODRecordList; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.PackageUtils; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import org.broadinstitute.sting.playground.gatk.walkers.varianteval.VariantAnalysis; +import org.apache.commons.jexl.ExpressionFactory; +import org.apache.commons.jexl.Expression; +import org.apache.commons.jexl.JexlHelper; +import org.apache.commons.jexl.JexlContext; + +import java.util.*; + +@Requires(value={},referenceMetaData=@RMD(name="eval",type=RodVCF.class)) +public class NewVariantEval extends RodWalker { + @Argument(fullName="filterExpression", shortName="filter", doc="Expression used with INFO fields to filter (see wiki docs for more info)", required=false) + private String FILTER_STRING = "FILTER == false"; + + @Argument(fullName="sampleName", shortName="sample", doc="If the VCF file has multiple samples, evaluate only the specified sample", required=false) + private String SAMPLE_NAME = null; + + private Expression filterExpression; + private HashSet evals; + + public void initialize() { + try { + filterExpression = ExpressionFactory.createExpression(FILTER_STRING); + } catch (Exception e) { + throw new StingException("Invalid expression used (" + FILTER_STRING + "). Please see the JEXL docs for correct syntax."); + } + + try { + evals = new HashSet(); + + List> cevals = PackageUtils.getClassesImplementingInterface(VariantEvaluation.class); + for (Class ceval : cevals) { + out.printf("Analysis: %s%n", ceval.getName()); + + VariantEvaluation eval = (VariantEvaluation) ceval.newInstance(); + evals.add(eval); + } + } catch (InstantiationException e) { + throw new StingException(e.getMessage()); + } catch (IllegalAccessException e) { + throw new StingException(e.getMessage()); + } + } + + public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + RODRecordList rods = tracker.getTrackData("eval", null); + + if ( rods != null ) { + RodVCF variant = (RodVCF) rods.getRecords().get(0); + + Map infoMap = new HashMap(variant.mCurrentRecord.getInfoValues()); + infoMap.put("QUAL", String.valueOf(variant.mCurrentRecord.getQual())); + infoMap.put("FILTER", String.valueOf(variant.mCurrentRecord.isFiltered())); + infoMap.put("ID", String.valueOf(variant.mCurrentRecord.getID())); + + for (String filterCode : variant.mCurrentRecord.getFilteringCodes()) { + infoMap.put(filterCode, "1"); + } + + JexlContext jContext = JexlHelper.createContext(); + jContext.setVars(infoMap); + + try { + return (Boolean) filterExpression.evaluate(jContext); + } catch (Exception e) { + throw new StingException(e.getMessage()); + } + } + + return true; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + RODRecordList rods = tracker.getTrackData("eval", null); + RodVCF variant = (rods != null) ? (RodVCF) rods.getRecords().get(0) : null; + + if (variant != null && SAMPLE_NAME != null && variant.hasGenotypeData()) { + variant = selectSample(SAMPLE_NAME, variant); + + if (variant == null) { + throw new StingException(String.format("The sample '%s' is not present in the specified VCF", SAMPLE_NAME)); + } + } + + for (VariantEvaluation eval : evals) { + eval.update(variant, tracker, ref, context); + } + + return 1; + } + + private RodVCF selectSample(String sample, RodVCF variant) { + String[] samples = variant.getSampleNames(); + + for (int i = 0; i < samples.length; i++) { + if (samples[i].equalsIgnoreCase(sample)) { + List genotypeRecs = new ArrayList(); + genotypeRecs.add(variant.mCurrentRecord.getVCFGenotypeRecords().get(i)); + + variant.mCurrentRecord = new VCFRecord(variant.getReferenceForSNP(), + variant.getLocation().getContig(), + (int) variant.getLocation().getStart(), + variant.getID(), + variant.mCurrentRecord.getAlternateAlleles(), + variant.getQual(), + variant.getFilterString(), + variant.getInfoValues(), + variant.mCurrentRecord.getGenotypeFormatString(), + genotypeRecs); + + return variant; + } + } + + return null; + } + + public Integer reduceInit() { + return null; + } + + public Integer reduce(Integer value, Integer sum) { + return null; + } + + public void onTraversalDone(Integer sum) { + for (VariantEvaluation eval : evals) { + List results = eval.getResult(); + + for (String result : results) { + out.printf("%s:%s%n", eval.getName(), result); + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/SubstitutionEvaluation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/SubstitutionEvaluation.java new file mode 100755 index 000000000..15a6c9cd1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/SubstitutionEvaluation.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.playground.gatk.walkers.diagnostics.newvarianteval; + +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.playground.utils.NamedTable; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.List; +import java.util.ArrayList; + +public class SubstitutionEvaluation implements VariantEvaluation { + private NamedTable substitutions; + + public SubstitutionEvaluation() { + substitutions = new NamedTable(); + } + + public void update(RodVCF variant, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (variant != null) { + List altAlleles = variant.getAlternateAlleleList(); + String altAllele = altAlleles.get(0); + String refAllele = String.format("%c", BaseUtils.baseIndexToSimpleBase(ref.getBaseIndex())); + + substitutions.increment(refAllele, altAllele); + } + } + + public String getName() { + return "Substitution"; + } + + public List getResult() { + ArrayList results = new ArrayList(); + + int transitions = 0; + int transversions = 0; + int unknown = 0; + + for (char rbase : BaseUtils.BASES) { + String refAllele = String.format("%c", rbase); + + for (char abase : BaseUtils.BASES) { + String altAllele = String.format("%c", abase); + + if (BaseUtils.isTransition(rbase, abase)) { + transitions += substitutions.get(refAllele, altAllele); + } else if (BaseUtils.isTransversion(rbase, abase)) { + transversions += substitutions.get(refAllele, altAllele); + } else { + unknown += substitutions.get(refAllele, altAllele); + } + } + } + + results.add(String.format("%n%s", substitutions.toString())); + results.add(String.format("transitions=%d", transitions)); + results.add(String.format("transversions=%d", transversions)); + results.add(String.format("unknown=%d", unknown)); + results.add(String.format("titv=%f", ((double) transitions)/((double) transversions))); + + return results; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/VariantEvaluation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/VariantEvaluation.java new file mode 100755 index 000000000..92b4b1d1e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/diagnostics/newvarianteval/VariantEvaluation.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.playground.gatk.walkers.diagnostics.newvarianteval; + +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; + +import java.util.List; + +public interface VariantEvaluation { + public void update(RodVCF variant, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context); + + //public Integer map( + + public String getName(); + + public List getResult(); +}