Revert to having these filters use integration over binomial probs

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1499 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-02 01:40:22 +00:00
parent 05c164ec69
commit 91ccb0f8c5
4 changed files with 72 additions and 330 deletions

View File

@ -2,329 +2,95 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.contexts.VariantContext; import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.gatk.refdata.rodVariants; import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.gatk.refdata.TabularROD;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
/**
 * The list of observed values of one statistic for a single genotype, plus the
 * low/high cut-offs derived from those observations for a requested p-value.
 *
 * Usage, as exercised by the visible callers: add observations, call
 * {@link #finalizeData()}, call {@link #determineThresholds(double)}, then query
 * {@link #passesThreshold(double)}.
 */
class GenotypeFeatureData extends ArrayList<Double> {
    /** Which side(s) of the observed distribution are rejected. */
    public enum Tail { LeftTailed, RightTailed, TwoTailed }

    private String genotype = null;
    // Result of Utils.SortPermutation(this); maps a sorted rank to an index in this list.
    // NOTE(review): presumably ascending order -- confirm against Utils.SortPermutation.
    private Integer[] permutation = null;
    private Logger logger = null;
    private Tail tail = null;
    // -1 until determineThresholds() or a setter assigns a value.
    double lowThreshold = -1;
    double highThreshold = -1;

    /**
     * @param genotype label used in log messages and toString()
     * @param logger   logger receiving the debug trace of the threshold computation
     * @param tail     which tail(s) the thresholds apply to
     */
    public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) {
        super();
        this.genotype = genotype;
        this.logger = logger;
        this.tail = tail;
    }

    public Tail getTail() {
        return tail;
    }

    public double getLowThreshold() {
        return lowThreshold;
    }

    public void setLowThreshold(double lowThreshold) {
        this.lowThreshold = lowThreshold;
    }

    public double getHighThreshold() {
        return highThreshold;
    }

    public void setHighThreshold(double highThreshold) {
        this.highThreshold = highThreshold;
    }

    /**
     * Computes the sort permutation of the current contents. Call again if
     * observations are added afterwards, otherwise the permutation is stale.
     */
    public void finalizeData() {
        permutation = Utils.SortPermutation(this);
    }

    /**
     * Derives lowThreshold and highThreshold from the observations.
     *
     * @param pvalueLimit fraction of observations to place in the rejected tail(s);
     *                    must lie strictly between 0 and 1
     * @throws IllegalArgumentException if pvalueLimit is outside (0, 1)
     * @throws IllegalStateException    if there are no observations
     */
    public void determineThresholds(final double pvalueLimit) {
        if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 )
            throw new IllegalArgumentException(String.format("Invalid pValue limit = %f", pvalueLimit));

        switch ( tail ) {
            case LeftTailed:
                lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit);
                break;
            case RightTailed:
                lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit);
                break;
            case TwoTailed:
                // Split the rejected mass evenly between the two tails.
                lowThreshold = determineOneTailThreshold(pvalueLimit/2);
                highThreshold = determineOneTailThreshold(1-pvalueLimit/2);
                break;
        }
    }

    /**
     * Returns the observation found at rank pvalueLimit * nElements() in sorted order.
     * The rank is rounded up for limits above 0.5 and down otherwise, then clamped to
     * the valid range: rounding up yields nElements() whenever
     * pvalueLimit > 1 - 1/nElements(), which would otherwise index past the end.
     */
    private double determineOneTailThreshold(final double pvalueLimit) {
        final int n = nElements();
        if ( n == 0 )
            throw new IllegalStateException(String.format("No observations for genotype %s; cannot determine a threshold", genotype));

        double trueSortedIndexLimit = pvalueLimit * n;
        int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit));
        sortedIndexLimit = Math.max(0, Math.min(sortedIndexLimit, n - 1));
        double threshold = get(itemIndex(sortedIndexLimit));
        logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold));
        return threshold;
    }

    public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); }
    public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); }

    /** Counts the observations for which passesThreshold(stat, keepBelow, keepAbove) holds. */
    public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) {
        int count = 0;
        for ( double stat : this ) {
            if ( passesThreshold(stat, keepBelow, keepAbove) ) count++;
        }
        return count;
    }

    /**
     * @return true if value is at or below the high threshold and keepBelow is set,
     *         or at or above the low threshold and keepAbove is set
     */
    public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) {
        return (value <= getHighThreshold() && keepBelow) || (value >= getLowThreshold() && keepAbove);
    }

    /**
     * Tests value against the thresholds according to this object's tail.
     * Note that the TwoTailed upper bound is exclusive while the RightTailed one is inclusive.
     */
    public boolean passesThreshold(double value) {
        switch ( tail ) {
            case LeftTailed:
                return value >= getLowThreshold();
            case RightTailed:
                return value <= getHighThreshold();
            case TwoTailed:
                return value >= getLowThreshold() && value < getHighThreshold();
            default:
                return true;
        }
    }

    /** Placeholder: always returns 1.0 regardless of value. */
    public double pValue(double value) {
        return 1.0;
    }

    /**
     * Maps a rank in sorted order to the index of that observation in this list.
     * Sorts on first use if finalizeData() has not been called yet.
     */
    public int itemIndex(int sortedIndex) {
        if ( permutation == null )
            finalizeData();
        return permutation[sortedIndex];
    }

    public int nElements() { return size(); }

    @Override
    public String toString() {
        return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold);
    }
}
/**
 * Map from genotype string to the observations of one statistic for that genotype,
 * populated from a tabular observations file when the table is constructed.
 */
class ObservationTable extends HashMap<String, GenotypeFeatureData> {
    String statField = null;
    File observationsFile = null;
    Logger logger = null;
    GenotypeFeatureData.Tail tail;

    /**
     * Stores the arguments and immediately loads every observation from the file.
     *
     * @param observationsFile table to read
     * @param statField        name of the column holding the statistic
     * @param logger           logger handed to each GenotypeFeatureData created
     * @param tail             tail setting handed to each GenotypeFeatureData created
     * @throws NoSuchFieldException if a parsed line lacks the statField column
     * @throws IOException          propagated from reading the file
     */
    public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, IOException {
        this.tail = tail;
        this.logger = logger;
        this.statField = statField;
        this.observationsFile = observationsFile;
        readAllData(observationsFile, statField);
    }

    /** @return the genotypes present in the table (the map's key set) */
    public Set<String> genotypes() {
        return keySet();
    }

    /**
     * Reads the header, then every line of the file. Each line is split on whitespace
     * and, when TabularROD accepts it, its statField value is appended to the data of
     * the genotype named in its "genotype" column. All per-genotype data is finalized
     * once the whole file has been read.
     */
    public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException {
        final ArrayList<String> columns = TabularROD.readHeader(observationsFile);

        for ( String row : new xReadLines(observationsFile) ) {
            final TabularROD record = new TabularROD("ignoreMe", columns);
            final String[] fields = row.split("\\s+");

            // Lines the parser does not accept are skipped.
            if ( ! record.parseLine(columns, fields) )
                continue;

            if ( ! record.containsKey(statField) )
                throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, row));

            final double observed = Double.valueOf(record.get(statField));
            final String genotypeKey = record.get("genotype");
            getData(genotypeKey).add(observed);
        }

        for ( GenotypeFeatureData perGenotype : values() )
            perGenotype.finalizeData();
    }

    /** Returns the data for genotype, creating and registering an empty entry if the key is absent. */
    public GenotypeFeatureData getData(final String genotype) {
        if ( containsKey(genotype) )
            return get(genotype);

        final GenotypeFeatureData created = new GenotypeFeatureData(genotype, logger, tail);
        put(genotype, created);
        return created;
    }

    public String toString() {
        return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size());
    }
}
public abstract class RatioFilter implements VariantExclusionCriterion { public abstract class RatioFilter implements VariantExclusionCriterion {
private static final double defaultMinGenotypeConfidenceToTest = 5.0; // TODO -- must be replaced with true confidence scoore, right now assumes LOD
protected double minGenotypeConfidenceToTest = defaultMinGenotypeConfidenceToTest;
protected double pvalueLimit = -1; protected double pvalueLimit = -1;
protected File observationsFile = null;
protected String statField = null; // "AlleleRatio";
protected Logger logger = null; // Logger.getLogger(RatioFilter.class); protected Logger logger = null; // Logger.getLogger(RatioFilter.class);
protected ObservationTable dataTable = null;
protected String name = null; protected String name = null;
protected GenotypeFeatureData.Tail tail = null;
protected enum Tail { LeftTailed, RightTailed, TwoTailed }
protected Tail tail = null;
protected double lowThreshold = -1;
protected double highThreshold = -1;
protected boolean exclude = false; protected boolean exclude = false;
/** public RatioFilter(final String name, Class myClass, Tail tail ) {
* A short-term hack to stop the system from rejecting poorly covered sites, under the assumption
* that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST. To be
* replaced by the more robust sampling approach outlined below
*/
final private static int MIN_COUNTS_TO_APPLY_TEST = 20;
final static boolean integrateOverSamplingProbabilities = true;
public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) {
this.name = name; this.name = name;
this.statField = statField;
this.tail = tail; this.tail = tail;
logger = Logger.getLogger(myClass); logger = Logger.getLogger(myClass);
} }
public void initialize(String arguments) { protected void setLowThreshold(double threshold) {
if (arguments != null && !arguments.isEmpty()) { lowThreshold = threshold;
String[] argPieces = arguments.split(","); }
try {
pvalueLimit = Double.valueOf(argPieces[0]); protected void setHighThreshold(double threshold) {
observationsFile = new File(argPieces[1]); highThreshold = threshold;
dataTable = new ObservationTable(observationsFile, statField, logger, tail);
logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable));
for ( String genotype : dataTable.genotypes() ) {
GenotypeFeatureData gfd = dataTable.get(genotype);
gfd.determineThresholds(pvalueLimit);
logger.info(gfd.toString());
}
} catch ( NumberFormatException e ) {
throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e);
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e);
} catch ( NoSuchFieldException e ) {
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e);
} catch ( IOException e ) {
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e );
}
}
} }
protected abstract boolean applyToVariant(rodVariants variant);
protected abstract Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant); protected abstract Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant);
protected abstract boolean excludeHetsOnly();
public boolean useZeroQualityReads() { return false; } public boolean useZeroQualityReads() { return false; }
public void compute(VariantContextWindow contextWindow) { public void compute(VariantContextWindow contextWindow) {
boolean exclude = false;
VariantContext context = contextWindow.getContext(); VariantContext context = contextWindow.getContext();
rodVariants variant = context.getVariant(); rodVariants variant = context.getVariant();
char ref = context.getReferenceContext().getBase(); char ref = context.getReferenceContext().getBase();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads())); ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads()));
if ( applyToVariant(variant) ) {
Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant); Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant);
GenotypeFeatureData gfd = dataTable.get(orderedBases(variant.getBestGenotype()));
if (integrateOverSamplingProbabilities)
exclude = integralExclude(gfd, counts);
else
exclude = pointEstimateExclude(gfd, counts);
boolean highGenotypeConfidence = variant.getConsensusConfidence() > minGenotypeConfidenceToTest;
boolean excludable = !excludeHetsOnly() || GenotypeUtils.isHet(variant);
exclude = excludable && highGenotypeConfidence && integralExclude(counts);
// //
// for printing only // for printing only
// //
int n = counts.first + counts.second; int n = counts.first + counts.second;
double value = counts.first / (1.0 * counts.first + counts.second); double value = counts.first / (1.0 * counts.first + counts.second);
logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s", logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, exclude=%b, bases=%s",
name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n, name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n,
value, gfd, exclude, pileup.getBases())); value, exclude, pileup.getBases()));
}
}
public boolean isExcludable() {
return exclude;
}
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
} }
private final static double SEARCH_INCREMENT = 0.01; private final static double SEARCH_INCREMENT = 0.01;
private final static double integralPValueThreshold = 0.05; private final static double integralPValueThreshold = 0.05;
private boolean integralExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) { private boolean integralExclude(Pair<Integer, Integer> counts ) {
double sumExclude = 0.0, sumP = 0.0; double sumExclude = 0.0, sumP = 0.0;
int n = counts.first + counts.second; int n = counts.first + counts.second;
for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) { for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) {
double p = MathUtils.binomialProbability(counts.first, n, r); double p = MathUtils.binomialProbability(counts.first, n, r);
sumP += p; sumP += p;
boolean exclude = ! gfd.passesThreshold(r); boolean exclude = ! passesThreshold(r);
sumExclude += p * (exclude ? 1.0 : 0.0); sumExclude += p * (exclude ? 1.0 : 0.0);
//System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n", System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n",
// counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP); counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP);
} }
double percentExcluded = sumExclude / sumP; double percentExcluded = sumExclude / sumP;
return 1 - percentExcluded <= integralPValueThreshold ; return 1 - percentExcluded <= integralPValueThreshold ;
} }
private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) { private boolean passesThreshold(double value) {
int n = counts.first + counts.second; switch ( tail ) {
if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false; case LeftTailed:
double value = counts.first / (1.0 * counts.first + counts.second); return value >= lowThreshold;
//double value = counts.first / (1.0 * Math.max(counts.second, 1.0)); case RightTailed:
return ! gfd.passesThreshold(value); return value <= highThreshold;
case TwoTailed:
return value >= lowThreshold && value < highThreshold;
default:
return true;
} }
private String orderedBases(String s) {
char[] charArray = s.toCharArray();
java.util.Arrays.sort(charArray);
return new String(charArray);
} }
} }

View File

@ -1,20 +1,19 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants; package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.refdata.rodVariants; import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.GenotypeUtils;
public class VECAlleleBalance implements VariantExclusionCriterion { //extends RatioFilter { public class VECAlleleBalance extends RatioFilter {
private static final double defaultMinGenotypeConfidenceToTest = 5.0; // TODO -- must be replaced with true confidence scoore, right now assumes LOD
private boolean exclude; private double lowThreshold = 0.10, highThreshold = 0.85;
private double lowThreshold = 0.1, highThreshold = 0.85;
private double minGenotypeConfidenceToTest = defaultMinGenotypeConfidenceToTest;
private double ratio; private double ratio;
public VECAlleleBalance() {
super("Allele Balance Ratio", VECAlleleBalance.class, Tail.TwoTailed);
}
public void initialize(String arguments) { public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) { if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(","); String[] argPieces = arguments.split(",");
@ -23,15 +22,13 @@ public class VECAlleleBalance implements VariantExclusionCriterion { //extends
if ( argPieces.length > 2 ) if ( argPieces.length > 2 )
minGenotypeConfidenceToTest = Double.valueOf(argPieces[2]); minGenotypeConfidenceToTest = Double.valueOf(argPieces[2]);
} }
setLowThreshold(lowThreshold);
setHighThreshold(highThreshold);
} }
/** /**
* Return the count of bases matching the major (first) and minor (second) alleles as a pair. * Return the count of bases matching the major (first) and minor (second) alleles as a pair.
* *
* @param ref
* @param pileup
* @param variant
* @return
*/ */
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) { protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
final String genotype = variant.getBestGenotype(); final String genotype = variant.getBestGenotype();
@ -49,30 +46,11 @@ public class VECAlleleBalance implements VariantExclusionCriterion { //extends
int refCount = a == ref ? aCount : bCount; int refCount = a == ref ? aCount : bCount;
int altCount = a == ref ? bCount : aCount; int altCount = a == ref ? bCount : aCount;
return new Pair(refCount, altCount); ratio = (double)refCount / (double)altCount;
return new Pair<Integer, Integer>(refCount, altCount);
} }
public void compute(VariantContextWindow contextWindow) { protected boolean excludeHetsOnly() { return true; }
VariantContext context = contextWindow.getContext();
char ref = context.getReferenceContext().getBase();
rodVariants variant = context.getVariant();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads()));
Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant);
int n = counts.first + counts.second;
ratio = counts.first.doubleValue() / (double)n;
boolean highGenotypeConfidence = variant.getConsensusConfidence() > minGenotypeConfidenceToTest;
boolean failsHetExpectation = GenotypeUtils.isHet(variant) && (ratio < lowThreshold || ratio > highThreshold);
exclude = failsHetExpectation && highGenotypeConfidence;
// if ( failsHetExpectation ) {
// String header = highGenotypeConfidence ? "FILTER-HIGH-CONFIDENCE" : "PASS-LOW-CONFIDENCE";
// System.out.printf("[%s] %s getConsensusConfidence() = %f > minGenotypeConfidenceToTest = %f exclude=%b %s%n",
// header, variant.getBestGenotype(), variant.getConsensusConfidence(), minGenotypeConfidenceToTest, exclude,
// ! highGenotypeConfidence ? variant : "" );
// }
}
public boolean useZeroQualityReads() { return false; } public boolean useZeroQualityReads() { return false; }

View File

@ -1,29 +1,30 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants; package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.refdata.rodVariants; import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
public class VECOnOffGenotypeRatio implements VariantExclusionCriterion { // extends RatioFilter { public class VECOnOffGenotypeRatio extends RatioFilter {
//final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed;
private boolean exclude;
private double threshold = 0.0; private double threshold = 0.0;
private double ratio; private double ratio;
public VECOnOffGenotypeRatio() {
super("On/Off Genotype Ratio", VECOnOffGenotypeRatio.class, Tail.LeftTailed);
}
public void initialize(String arguments) { public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) { if (arguments != null && !arguments.isEmpty()) {
threshold = Double.valueOf(arguments); String[] argPieces = arguments.split(",");
threshold = Double.valueOf(argPieces[0]);
if ( argPieces.length > 1 )
minGenotypeConfidenceToTest = Double.valueOf(argPieces[1]);
} }
setLowThreshold(threshold);
} }
/** /**
* Return the counts of bases that are on (matching the bestGenotype) and off (not matching the * Return the counts of bases that are on (matching the bestGenotype) and off (not matching the
* best genotype). On are in the first field, off in the second. * best genotype). On are in the first field, off in the second.
* *
* @param ref
* @param pileup
* @param variant
* @return
*/ */
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) { protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
final String genotype = variant.getBestGenotype().toUpperCase(); final String genotype = variant.getBestGenotype().toUpperCase();
@ -43,19 +44,11 @@ public class VECOnOffGenotypeRatio implements VariantExclusionCriterion { // ext
//System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype); //System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype);
} }
ratio = (double)on / (double)off;
return new Pair<Integer, Integer>(on, off); return new Pair<Integer, Integer>(on, off);
} }
public void compute(VariantContextWindow contextWindow) { protected boolean excludeHetsOnly() { return false; }
VariantContext context = contextWindow.getContext();
char ref = context.getReferenceContext().getBase();
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads()));
Pair<Integer, Integer> counts = scoreVariant(ref, pileup, context.getVariant());
int n = counts.first + counts.second;
ratio = counts.first.doubleValue() / (double)n;
exclude = ratio < threshold;
}
public double inclusionProbability() { public double inclusionProbability() {
return exclude ? 0.0 : 1.0; return exclude ? 0.0 : 1.0;

View File

@ -294,6 +294,11 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
*/ */
public void onTraversalDone(Integer result) { public void onTraversalDone(Integer result) {
// move the window over so that we can filter the last few variants // move the window over so that we can filter the last few variants
if ( windowInitializer != null ) {
while ( windowInitializer.size() < windowSize )
windowInitializer.add(null);
variantContextWindow = new VariantContextWindow(windowInitializer);
}
for (int i=0; i < windowSize; i++) { for (int i=0; i < windowSize; i++) {
variantContextWindow.moveWindow(null); variantContextWindow.moveWindow(null);
compute(); compute();