From c5c11d5d1c9fdb3574e3198bcd2b2a8be021fbcd Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 29 Jul 2009 19:29:03 +0000 Subject: [PATCH] First attempt at modifying the VFW interfaces to support direct emission of relevant training data per feature and exclusion criterion. This way, you could run the program once, get the training sets, and then feed that training set back to the filters and have them automatically choose the optimal thresholds for themselves. This current version is pretty ugly right now... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1334 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variants/IVFBinomialStrand.java | 16 ++++- .../gatk/walkers/variants/IVFNull.java | 13 +++- .../walkers/variants/IVFPrimaryBases.java | 16 ++++- .../walkers/variants/IVFSecondaryBases.java | 16 ++++- .../variants/IndependentVariantFeature.java | 9 ++- .../gatk/walkers/variants/RatioFilter.java | 13 +++- .../walkers/variants/VECDepthOfCoverage.java | 23 ++++++- .../walkers/variants/VECFisherStrand.java | 20 ++++-- .../walkers/variants/VECLodThreshold.java | 19 ++++++ .../gatk/walkers/variants/VECNull.java | 28 ++++++++ .../variants/VECOnOffGenotypeRatio.java | 1 + .../variants/VariantExclusionCriterion.java | 8 ++- .../variants/VariantFiltrationWalker.java | 64 ++++++++++++++++--- 13 files changed, 220 insertions(+), 26 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECNull.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFBinomialStrand.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFBinomialStrand.java index 6a3d41b8c..8a20ba58a 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFBinomialStrand.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFBinomialStrand.java @@ -10,14 +10,16 @@ import java.util.List; public class IVFBinomialStrand implements IndependentVariantFeature { private double strandBalance = 0.5; + private double[] likelihoods; + public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { strandBalance = Double.valueOf(arguments); } } - public double[] compute(char ref, LocusContext context) { - double[] likelihoods = new double[10]; + public void compute(char ref, LocusContext context) { + likelihoods = new double[10]; ReadBackedPileup pileup = new ReadBackedPileup(ref, context); List reads = context.getReads(); @@ -43,7 +45,17 @@ public class IVFBinomialStrand implements IndependentVariantFeature { likelihoods[genotypeIndex] = Math.log10(MathUtils.binomialProbability(strandCounts[0], strandCounts[0] + strandCounts[1], strandBalance)); } + } + public double[] getLikelihoods() { return likelihoods; } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFNull.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFNull.java index 32d91f446..d859b81cf 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFNull.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFNull.java @@ -5,7 +5,18 @@ import org.broadinstitute.sting.gatk.LocusContext; public class IVFNull implements IndependentVariantFeature { public void initialize(String arguments) {} - public double[] compute(char ref, LocusContext context) { + public void compute(char ref, LocusContext context) { + } + + public double[] getLikelihoods() { return new double[10]; } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFPrimaryBases.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFPrimaryBases.java index fc05e45e5..ef12fc5bb 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFPrimaryBases.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFPrimaryBases.java @@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.ReadBackedPileup; public class IVFPrimaryBases implements IndependentVariantFeature { private double[] p = { 0.933, 0.972, 0.970, 0.960, 0.945, 0.990, 0.971, 0.943, 0.978, 0.928 }; + private double[] likelihoods; + /** * Method so that features can initialize themselves based on a short argument string. At the moment, each feature is * responsible for interpreting their own argument string. @@ -31,8 +33,8 @@ public class IVFPrimaryBases implements IndependentVariantFeature { * @param context the context for the given locus * @return a ten-element array of log-likelihood result of the feature applied to each genotype */ - public double[] compute(char ref, LocusContext context) { - double[] likelihoods = new double[10]; + public void compute(char ref, LocusContext context) { + likelihoods = new double[10]; ReadBackedPileup pileup = new ReadBackedPileup(ref, context); String primaryBases = pileup.getBases(); @@ -62,7 +64,17 @@ public class IVFPrimaryBases implements IndependentVariantFeature { likelihoods[genotypeIndex] = Math.log10(Prior); } + } + public double[] getLikelihoods() { return likelihoods; } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFSecondaryBases.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFSecondaryBases.java index 51b2e609c..611f6a2b7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFSecondaryBases.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IVFSecondaryBases.java @@ -8,6 +8,8 @@ public class IVFSecondaryBases implements IndependentVariantFeature { private double[] p2on = { 0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000 }; private double[] p2off = { 0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505 }; + private double[] likelihoods; + /** * Method so that features can initialize themselves based on a short argument string. At the moment, each feature is * responsible for interpreting their own argument string. @@ -58,8 +60,8 @@ public class IVFSecondaryBases implements IndependentVariantFeature { * @param context the context for the given locus * @return a ten-element array of log-likelihood result of the feature applied to each genotype */ - public double[] compute(char ref, LocusContext context) { - double[] likelihoods = new double[10]; + public void compute(char ref, LocusContext context) { + likelihoods = new double[10]; ReadBackedPileup pileup = new ReadBackedPileup(ref, context); String primaryBases = pileup.getBases(); @@ -109,7 +111,17 @@ public class IVFSecondaryBases implements IndependentVariantFeature { likelihoods[genotypeIndex] = logOffPrior + logOnPrior; } + } + public double[] getLikelihoods() { return likelihoods; } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IndependentVariantFeature.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IndependentVariantFeature.java index 3e5e6c3fb..3943c6c46 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IndependentVariantFeature.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/IndependentVariantFeature.java @@ -25,7 +25,12 @@ public interface IndependentVariantFeature { * * @param ref the reference base * @param context the context for the given locus - * @return a ten-element array of log-likelihood result of the feature applied to each genotype */ - public double[] compute(char ref, LocusContext context); + public void compute(char ref, LocusContext context); + + public double[] getLikelihoods(); + + public String getStudyHeader(); + + public String getStudyInfo(); } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java index 177d9097d..d912599a7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java @@ -207,6 +207,7 @@ public abstract class RatioFilter implements VariantExclusionCriterion { protected String name = null; protected GenotypeFeatureData.Tail tail = null; + protected boolean exclude = false; /** * A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption @@ -256,7 +257,7 @@ public abstract class RatioFilter implements VariantExclusionCriterion { public boolean useZeroQualityReads() { return false; } - public boolean exclude(char ref, LocusContext context, rodVariants variant) { + public void compute(char ref, LocusContext context, rodVariants variant) { boolean exclude = false; // @@ -285,10 +286,20 @@ public abstract class RatioFilter implements VariantExclusionCriterion { name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n, value, gfd, exclude, pileup.getBases())); } + } + public boolean isExcludable() { return exclude; } + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } + private final static double SEARCH_INCREMENT = 0.01; private final static double integralPValueThreshold = 0.05; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECDepthOfCoverage.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECDepthOfCoverage.java index 2d107754e..1dc2a0d28 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECDepthOfCoverage.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECDepthOfCoverage.java @@ -13,15 +13,32 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants; public class VECDepthOfCoverage implements VariantExclusionCriterion { private int maximum = 200; + private boolean exclude = false; + private int depth; + public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { maximum = Integer.valueOf(arguments); } } - public boolean useZeroQualityReads() { return false; } + public void compute(char ref, LocusContext context, rodVariants variant) { + exclude = context.getReads().size() > maximum; - public boolean exclude(char ref, LocusContext context, rodVariants variant) { - return context.getReads().size() > maximum; + depth = context.getReads().size(); } + + public boolean isExcludable() { + return exclude; + } + + public String getStudyHeader() { + return "#depth"; + } + + public String getStudyInfo() { + return Integer.toString(depth); + } + + public boolean useZeroQualityReads() { return false; } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java index be57d5d5a..91415685f 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java @@ -10,8 +10,8 @@ import java.util.List; import cern.jet.math.Arithmetic; public class VECFisherStrand implements VariantExclusionCriterion { - private double pvalueLimit = 0.0001; + private boolean exclude; public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { @@ -19,15 +19,27 @@ public class VECFisherStrand implements VariantExclusionCriterion { } } - public boolean exclude(char ref, LocusContext context, rodVariants variant) { + public void compute(char ref, LocusContext context, rodVariants variant) { int allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0)); int allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1)); if (allele1 != allele2) { - return strandTest(ref, context, allele1, allele2, pvalueLimit, null); + exclude = strandTest(ref, context, allele1, allele2, pvalueLimit, null); + } else { + exclude = false; } + } - return false; + public boolean isExcludable() { + return exclude; + } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; } public boolean useZeroQualityReads() { return false; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECLodThreshold.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECLodThreshold.java index 9415c9bce..3eeb2932f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECLodThreshold.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECLodThreshold.java @@ -6,15 +6,34 @@ import org.broadinstitute.sting.gatk.LocusContext; public class VECLodThreshold implements VariantExclusionCriterion { private double lodThreshold = 5.0; + private boolean exclude; + public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { lodThreshold = Double.valueOf(arguments); } } + public void compute(char ref, LocusContext context, rodVariants variant) { + exclude = (variant.getLodBtr() < lodThreshold); + } + public boolean useZeroQualityReads() { return false; } public boolean exclude(char ref, LocusContext context, rodVariants variant) { return (variant.getLodBtr() < lodThreshold); } + + public boolean isExcludable() { + return exclude; + } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } + } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECNull.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECNull.java new file mode 100755 index 000000000..acb2f66df --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECNull.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variants; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.rodVariants; + +public class VECNull implements VariantExclusionCriterion { + public void initialize(String arguments) { + } + + public void compute(char ref, LocusContext context, rodVariants variant) { + } + + public boolean isExcludable() { + return false; + } + + public String getStudyHeader() { + return ""; + } + + public String getStudyInfo() { + return ""; + } + + public boolean useZeroQualityReads() { + return false; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java index 560decb8d..33bf995c2 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants; import org.broadinstitute.sting.gatk.refdata.rodVariants; +import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.*; public class VECOnOffGenotypeRatio extends RatioFilter { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantExclusionCriterion.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantExclusionCriterion.java index 3b2a8f504..41220572d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantExclusionCriterion.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantExclusionCriterion.java @@ -6,6 +6,12 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants; public interface VariantExclusionCriterion { public void initialize(String arguments); - public boolean exclude(char ref, LocusContext context, rodVariants variant); + public void compute(char ref, LocusContext context, rodVariants variant); + + public boolean isExcludable(); + + public String getStudyHeader(); + + public String getStudyInfo(); public boolean useZeroQualityReads(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java index d68d96fe2..459716e37 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java @@ -21,8 +21,9 @@ import java.util.*; import net.sf.samtools.SAMRecord; /** - * VariantFiltrationWalker applies specified conditionally independent features to pre-called variants, thus modifying - * the likelihoods of each genotype. At the moment, the variants are expected to be in gelitext format. + * VariantFiltrationWalker applies specified conditionally independent features and filters to pre-called variants. + * The former modifiesthe likelihoods of each genotype, while the latter makes a decision to include or exclude a + * variant outright. At the moment, the variants are expected to be in gelitext format. */ @Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=rodVariants.class)) public class VariantFiltrationWalker extends LocusWalker { @@ -31,12 +32,15 @@ public class VariantFiltrationWalker extends LocusWalker { @Argument(fullName="exclusion_criterion", shortName="X", doc="Exclusion test (optionally with arguments) to apply to variant call. Syntax: 'testname[:arguments]'", required=false) public String[] EXCLUSIONS; @Argument(fullName="verbose", shortName="V", doc="Show how the variant likelihoods are changing with the application of each feature") public Boolean VERBOSE = false; @Argument(fullName="list", shortName="ls", doc="List the available features and exclusion criteria and exit") public Boolean LIST = false; + @Argument(fullName="learning_mode", shortName="LM", doc="Output parseable information on each filter that can then be fed back to the filter as a training set") public Boolean LEARNING = false; @Argument(fullName="truth", shortName="truth", doc="Operate on truth set only") public Boolean TRUTH = false; private List> featureClasses; private List> exclusionClasses; + private PrintWriter vwriter; private HashMap ewriters; + private HashMap swriters; private ArrayList requestedFeatures; private ArrayList requestedExclusions; @@ -61,6 +65,8 @@ public class VariantFiltrationWalker extends LocusWalker { requestedFeatures = new ArrayList(); requestedExclusions = new ArrayList(); + swriters = new HashMap(); + // Initialize requested features if (FEATURES != null) { for (String requestedFeatureString : FEATURES) { @@ -76,6 +82,13 @@ public class VariantFiltrationWalker extends LocusWalker { IndependentVariantFeature ivf = (IndependentVariantFeature) featureClass.newInstance(); ivf.initialize(requestedFeatureArgs); + if (LEARNING) { + PrintWriter studyWriter = new PrintWriter(VARIANTS_OUT_HEAD + "." + featureClassName + ".study"); + studyWriter.println(ivf.getStudyHeader()); + + swriters.put(featureClassName, studyWriter); + } + requestedFeatures.add(ivf); } catch (InstantiationException e) { throw new StingException(String.format("Cannot instantiate feature class '%s': must be concrete class", featureClass.getSimpleName())); @@ -104,6 +117,13 @@ public class VariantFiltrationWalker extends LocusWalker { VariantExclusionCriterion vec = (VariantExclusionCriterion) exclusionClass.newInstance(); vec.initialize(requestedExclusionArgs); + if (LEARNING) { + PrintWriter studyWriter = new PrintWriter(VARIANTS_OUT_HEAD + "." + exclusionClassName + ".study"); + studyWriter.println(vec.getStudyHeader()); + + swriters.put(exclusionClassName, studyWriter); + } + requestedExclusions.add(vec); PrintWriter writer = new PrintWriter(VARIANTS_OUT_HEAD + ".excluded." + exclusionClassName + ".geli.calls"); @@ -185,9 +205,22 @@ public class VariantFiltrationWalker extends LocusWalker { // Apply features that modify the likelihoods and LOD scores for ( IndependentVariantFeature ivf : requestedFeatures ) { - variant.adjustLikelihoods(ivf.compute(ref, context)); + ivf.compute(ref, context); + + String featureClassName = rationalizeClassName(ivf.getClass()); + + double[] weights = ivf.getLikelihoods(); + + variant.adjustLikelihoods(weights); if (VERBOSE) { out.println(rationalizeClassName(ivf.getClass()) + ":\n " + variant); } + + if (LEARNING) { + PrintWriter swriter = swriters.get(featureClassName); + if (swriter != null) { + swriter.println(ivf.getStudyInfo()); + } + } } // Apply exclusion tests that accept or reject the variant call @@ -198,10 +231,21 @@ public class VariantFiltrationWalker extends LocusWalker { LocusContext Q0freeContext = removeQ0reads(context); for ( VariantExclusionCriterion vec : requestedExclusions ) { - boolean excludeResult = vec.exclude(ref, (vec.useZeroQualityReads() ? context : Q0freeContext), variant); + vec.compute(ref, (vec.useZeroQualityReads() ? context : Q0freeContext), variant); - if (excludeResult) { - exclusionResults.add(rationalizeClassName(vec.getClass())); + String exclusionClassName = rationalizeClassName(vec.getClass()); + + if (vec.isExcludable()) { + exclusionResults.add(exclusionClassName); + } + + if (LEARNING) { + PrintWriter swriter = swriters.get(exclusionClassName); + + if (swriter != null) { + swriter.println(vec.getStudyInfo()); + swriter.flush(); + } } } @@ -272,8 +316,12 @@ public class VariantFiltrationWalker extends LocusWalker { vwriter.close(); - for (PrintWriter writer : ewriters.values()) { - writer.close(); + for (PrintWriter ewriter : ewriters.values()) { + ewriter.close(); + } + + for (PrintWriter swriter : swriters.values()) { + swriter.close(); } } }