First attempt at modifying the VFW interfaces to support direct emission of relevant training data per feature and exclusion criterion. This way, you could run the program once, get the training sets, and then feed that training set back to the filters and have them automatically choose the optimal thresholds for themselves. This current version is pretty ugly right now...

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1334 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-07-29 19:29:03 +00:00
parent 3554897222
commit c5c11d5d1c
13 changed files with 220 additions and 26 deletions

View File

@ -10,14 +10,16 @@ import java.util.List;
public class IVFBinomialStrand implements IndependentVariantFeature {
private double strandBalance = 0.5;
private double[] likelihoods;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
strandBalance = Double.valueOf(arguments);
}
}
public double[] compute(char ref, LocusContext context) {
double[] likelihoods = new double[10];
public void compute(char ref, LocusContext context) {
likelihoods = new double[10];
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
List<SAMRecord> reads = context.getReads();
@ -43,7 +45,17 @@ public class IVFBinomialStrand implements IndependentVariantFeature {
likelihoods[genotypeIndex] = Math.log10(MathUtils.binomialProbability(strandCounts[0], strandCounts[0] + strandCounts[1], strandBalance));
}
}
public double[] getLikelihoods() {
return likelihoods;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
}

View File

@ -5,7 +5,18 @@ import org.broadinstitute.sting.gatk.LocusContext;
public class IVFNull implements IndependentVariantFeature {
public void initialize(String arguments) {}
public double[] compute(char ref, LocusContext context) {
public void compute(char ref, LocusContext context) {
}
public double[] getLikelihoods() {
return new double[10];
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
}

View File

@ -7,6 +7,8 @@ import org.broadinstitute.sting.utils.ReadBackedPileup;
public class IVFPrimaryBases implements IndependentVariantFeature {
private double[] p = { 0.933, 0.972, 0.970, 0.960, 0.945, 0.990, 0.971, 0.943, 0.978, 0.928 };
private double[] likelihoods;
/**
* Method so that features can initialize themselves based on a short argument string. At the moment, each feature is
* responsible for interpreting their own argument string.
@ -31,8 +33,8 @@ public class IVFPrimaryBases implements IndependentVariantFeature {
* @param context the context for the given locus
* @return a ten-element array of log-likelihood result of the feature applied to each genotype
*/
public double[] compute(char ref, LocusContext context) {
double[] likelihoods = new double[10];
public void compute(char ref, LocusContext context) {
likelihoods = new double[10];
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String primaryBases = pileup.getBases();
@ -62,7 +64,17 @@ public class IVFPrimaryBases implements IndependentVariantFeature {
likelihoods[genotypeIndex] = Math.log10(Prior);
}
}
public double[] getLikelihoods() {
return likelihoods;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
}

View File

@ -8,6 +8,8 @@ public class IVFSecondaryBases implements IndependentVariantFeature {
private double[] p2on = { 0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000 };
private double[] p2off = { 0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505 };
private double[] likelihoods;
/**
* Method so that features can initialize themselves based on a short argument string. At the moment, each feature is
* responsible for interpreting their own argument string.
@ -58,8 +60,8 @@ public class IVFSecondaryBases implements IndependentVariantFeature {
* @param context the context for the given locus
* @return a ten-element array of log-likelihood result of the feature applied to each genotype
*/
public double[] compute(char ref, LocusContext context) {
double[] likelihoods = new double[10];
public void compute(char ref, LocusContext context) {
likelihoods = new double[10];
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String primaryBases = pileup.getBases();
@ -109,7 +111,17 @@ public class IVFSecondaryBases implements IndependentVariantFeature {
likelihoods[genotypeIndex] = logOffPrior + logOnPrior;
}
}
public double[] getLikelihoods() {
return likelihoods;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
}

View File

@ -25,7 +25,12 @@ public interface IndependentVariantFeature {
*
* @param ref the reference base
* @param context the context for the given locus
* @return a ten-element array of log-likelihood result of the feature applied to each genotype
*/
public double[] compute(char ref, LocusContext context);
public void compute(char ref, LocusContext context);
public double[] getLikelihoods();
public String getStudyHeader();
public String getStudyInfo();
}

View File

@ -207,6 +207,7 @@ public abstract class RatioFilter implements VariantExclusionCriterion {
protected String name = null;
protected GenotypeFeatureData.Tail tail = null;
protected boolean exclude = false;
/**
* A short-term hack to stop the system from rejecting poorly covered sites, under the assumption
@ -256,7 +257,7 @@ public abstract class RatioFilter implements VariantExclusionCriterion {
public boolean useZeroQualityReads() { return false; }
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
public void compute(char ref, LocusContext context, rodVariants variant) {
boolean exclude = false;
//
@ -285,10 +286,20 @@ public abstract class RatioFilter implements VariantExclusionCriterion {
name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n,
value, gfd, exclude, pileup.getBases()));
}
}
public boolean isExcludable() {
return exclude;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
private final static double SEARCH_INCREMENT = 0.01;
private final static double integralPValueThreshold = 0.05;

View File

@ -13,15 +13,32 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants;
public class VECDepthOfCoverage implements VariantExclusionCriterion {
private int maximum = 200;
private boolean exclude = false;
private int depth;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
maximum = Integer.valueOf(arguments);
}
}
public boolean useZeroQualityReads() { return false; }
public void compute(char ref, LocusContext context, rodVariants variant) {
exclude = context.getReads().size() > maximum;
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
return context.getReads().size() > maximum;
depth = context.getReads().size();
}
public boolean isExcludable() {
return exclude;
}
public String getStudyHeader() {
return "#depth";
}
public String getStudyInfo() {
return Integer.toString(depth);
}
public boolean useZeroQualityReads() { return false; }
}

View File

@ -10,8 +10,8 @@ import java.util.List;
import cern.jet.math.Arithmetic;
public class VECFisherStrand implements VariantExclusionCriterion {
private double pvalueLimit = 0.0001;
private boolean exclude;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
@ -19,15 +19,27 @@ public class VECFisherStrand implements VariantExclusionCriterion {
}
}
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
public void compute(char ref, LocusContext context, rodVariants variant) {
int allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0));
int allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1));
if (allele1 != allele2) {
return strandTest(ref, context, allele1, allele2, pvalueLimit, null);
exclude = strandTest(ref, context, allele1, allele2, pvalueLimit, null);
} else {
exclude = false;
}
}
return false;
public boolean isExcludable() {
return exclude;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
public boolean useZeroQualityReads() { return false; }

View File

@ -6,15 +6,34 @@ import org.broadinstitute.sting.gatk.LocusContext;
public class VECLodThreshold implements VariantExclusionCriterion {
private double lodThreshold = 5.0;
private boolean exclude;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
lodThreshold = Double.valueOf(arguments);
}
}
public void compute(char ref, LocusContext context, rodVariants variant) {
exclude = (variant.getLodBtr() < lodThreshold);
}
public boolean useZeroQualityReads() { return false; }
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
return (variant.getLodBtr() < lodThreshold);
}
public boolean isExcludable() {
return exclude;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
}

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
/**
 * No-op variant exclusion criterion: never excludes any variant.
 * Useful as a placeholder/baseline implementation of the VariantExclusionCriterion
 * interface (e.g. for testing the filtration pipeline without applying any filter).
 */
public class VECNull implements VariantExclusionCriterion {
    // No arguments are needed; intentionally a no-op.
    public void initialize(String arguments) {
    }
    // Nothing to compute; the exclusion decision is a constant (see isExcludable).
    public void compute(char ref, LocusContext context, rodVariants variant) {
    }
    // Always include the variant — this criterion never excludes anything.
    public boolean isExcludable() {
        return false;
    }
    // No study/training-set output for the null criterion.
    public String getStudyHeader() {
        return "";
    }
    public String getStudyInfo() {
        return "";
    }
    // Operate on the Q0-free context (zero-quality reads removed), matching the
    // other criteria's default; irrelevant here since compute() is a no-op.
    public boolean useZeroQualityReads() {
        return false;
    }
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.*;
public class VECOnOffGenotypeRatio extends RatioFilter {

View File

@ -6,6 +6,12 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants;
public interface VariantExclusionCriterion {
public void initialize(String arguments);
public boolean exclude(char ref, LocusContext context, rodVariants variant);
public void compute(char ref, LocusContext context, rodVariants variant);
public boolean isExcludable();
public String getStudyHeader();
public String getStudyInfo();
public boolean useZeroQualityReads();
}

View File

@ -21,8 +21,9 @@ import java.util.*;
import net.sf.samtools.SAMRecord;
/**
* VariantFiltrationWalker applies specified conditionally independent features to pre-called variants, thus modifying
* the likelihoods of each genotype. At the moment, the variants are expected to be in gelitext format.
* VariantFiltrationWalker applies specified conditionally independent features and filters to pre-called variants.
* The former modifies the likelihoods of each genotype, while the latter makes a decision to include or exclude a
* variant outright. At the moment, the variants are expected to be in gelitext format.
*/
@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=rodVariants.class))
public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
@ -31,12 +32,15 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="exclusion_criterion", shortName="X", doc="Exclusion test (optionally with arguments) to apply to variant call. Syntax: 'testname[:arguments]'", required=false) public String[] EXCLUSIONS;
@Argument(fullName="verbose", shortName="V", doc="Show how the variant likelihoods are changing with the application of each feature") public Boolean VERBOSE = false;
@Argument(fullName="list", shortName="ls", doc="List the available features and exclusion criteria and exit") public Boolean LIST = false;
@Argument(fullName="learning_mode", shortName="LM", doc="Output parseable information on each filter that can then be fed back to the filter as a training set") public Boolean LEARNING = false;
@Argument(fullName="truth", shortName="truth", doc="Operate on truth set only") public Boolean TRUTH = false;
private List<Class<? extends IndependentVariantFeature>> featureClasses;
private List<Class<? extends VariantExclusionCriterion>> exclusionClasses;
private PrintWriter vwriter;
private HashMap<String, PrintWriter> ewriters;
private HashMap<String, PrintWriter> swriters;
private ArrayList<IndependentVariantFeature> requestedFeatures;
private ArrayList<VariantExclusionCriterion> requestedExclusions;
@ -61,6 +65,8 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
requestedFeatures = new ArrayList<IndependentVariantFeature>();
requestedExclusions = new ArrayList<VariantExclusionCriterion>();
swriters = new HashMap<String, PrintWriter>();
// Initialize requested features
if (FEATURES != null) {
for (String requestedFeatureString : FEATURES) {
@ -76,6 +82,13 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
IndependentVariantFeature ivf = (IndependentVariantFeature) featureClass.newInstance();
ivf.initialize(requestedFeatureArgs);
if (LEARNING) {
PrintWriter studyWriter = new PrintWriter(VARIANTS_OUT_HEAD + "." + featureClassName + ".study");
studyWriter.println(ivf.getStudyHeader());
swriters.put(featureClassName, studyWriter);
}
requestedFeatures.add(ivf);
} catch (InstantiationException e) {
throw new StingException(String.format("Cannot instantiate feature class '%s': must be concrete class", featureClass.getSimpleName()));
@ -104,6 +117,13 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
VariantExclusionCriterion vec = (VariantExclusionCriterion) exclusionClass.newInstance();
vec.initialize(requestedExclusionArgs);
if (LEARNING) {
PrintWriter studyWriter = new PrintWriter(VARIANTS_OUT_HEAD + "." + exclusionClassName + ".study");
studyWriter.println(vec.getStudyHeader());
swriters.put(exclusionClassName, studyWriter);
}
requestedExclusions.add(vec);
PrintWriter writer = new PrintWriter(VARIANTS_OUT_HEAD + ".excluded." + exclusionClassName + ".geli.calls");
@ -185,9 +205,22 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
// Apply features that modify the likelihoods and LOD scores
for ( IndependentVariantFeature ivf : requestedFeatures ) {
variant.adjustLikelihoods(ivf.compute(ref, context));
ivf.compute(ref, context);
String featureClassName = rationalizeClassName(ivf.getClass());
double[] weights = ivf.getLikelihoods();
variant.adjustLikelihoods(weights);
if (VERBOSE) { out.println(rationalizeClassName(ivf.getClass()) + ":\n " + variant); }
if (LEARNING) {
PrintWriter swriter = swriters.get(featureClassName);
if (swriter != null) {
swriter.println(ivf.getStudyInfo());
}
}
}
// Apply exclusion tests that accept or reject the variant call
@ -198,10 +231,21 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
LocusContext Q0freeContext = removeQ0reads(context);
for ( VariantExclusionCriterion vec : requestedExclusions ) {
boolean excludeResult = vec.exclude(ref, (vec.useZeroQualityReads() ? context : Q0freeContext), variant);
vec.compute(ref, (vec.useZeroQualityReads() ? context : Q0freeContext), variant);
if (excludeResult) {
exclusionResults.add(rationalizeClassName(vec.getClass()));
String exclusionClassName = rationalizeClassName(vec.getClass());
if (vec.isExcludable()) {
exclusionResults.add(exclusionClassName);
}
if (LEARNING) {
PrintWriter swriter = swriters.get(exclusionClassName);
if (swriter != null) {
swriter.println(vec.getStudyInfo());
swriter.flush();
}
}
}
@ -272,8 +316,12 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
vwriter.close();
for (PrintWriter writer : ewriters.values()) {
writer.close();
for (PrintWriter ewriter : ewriters.values()) {
ewriter.close();
}
for (PrintWriter swriter : swriters.values()) {
swriter.close();
}
}
}