From 91ccb0f8c50da9032f03ec22f3244979dfe9582b Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 2 Sep 2009 01:40:22 +0000 Subject: [PATCH] Revert to having these filters use integration over binomial probs git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1499 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/variants/RatioFilter.java | 320 +++--------------- .../walkers/variants/VECAlleleBalance.java | 44 +-- .../variants/VECOnOffGenotypeRatio.java | 33 +- .../variants/VariantFiltrationWalker.java | 5 + 4 files changed, 72 insertions(+), 330 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java index b0f7d66d1..7ed081343 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java @@ -2,329 +2,95 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants; import org.broadinstitute.sting.gatk.contexts.VariantContext; import org.broadinstitute.sting.gatk.refdata.rodVariants; -import org.broadinstitute.sting.gatk.refdata.TabularROD; import org.broadinstitute.sting.utils.*; import org.apache.log4j.Logger; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Set; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; - -class GenotypeFeatureData extends ArrayList { - public enum Tail { LeftTailed, RightTailed, TwoTailed } - - private String genotype = null; - private Integer[] permutation = null; - private Logger logger = null; - - private Tail tail = null; - - double lowThreshold = -1; - double highThreshold = -1; - - public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) { - super(); - this.genotype = genotype; - this.logger = logger; - this.tail = tail; - } - - public Tail getTail() { - return tail; - } - - public double getLowThreshold() { - return lowThreshold; - } - - public void setLowThreshold(double lowThreshold) { - this.lowThreshold = lowThreshold; - } - - public double getHighThreshold() { - return highThreshold; - } - - public void setHighThreshold(double highThreshold) { - this.highThreshold = highThreshold; - } - - public void finalizeData() { - permutation = Utils.SortPermutation(this); - } - - public void determineThresholds(final double pvalueLimit) { - if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 ) - throw new RuntimeException(String.format("Invalid pValue limit = %f", pvalueLimit)); - - switch ( tail ) { - case LeftTailed: - lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit); - break; - case RightTailed: - lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit); - break; - case TwoTailed: - lowThreshold = determineOneTailThreshold(pvalueLimit/2); - highThreshold = determineOneTailThreshold(1-pvalueLimit/2); - break; - } - - //logger.info(String.format(" %d of %d elements are >= threshold = %f (%f percent)", - // nElementsAboveThreshold(), nElements(), getHighThreshold(), nElementsAboveThreshold() / ( 1.0 * nElements()))); - //logger.info(String.format(" %d of %d elements are <= threshold = %f (%f percent)", - // nElementsBelowThreshold(), nElements(), getLowThreshold(), nElementsBelowThreshold() / ( 1.0 * nElements()))); - } - - private double determineOneTailThreshold(final double pvalueLimit) { - double trueSortedIndexLimit = pvalueLimit * nElements(); - int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit)); - double threshold = get(itemIndex(sortedIndexLimit)); - - logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold)); - - return threshold; - } - - public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); } - public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); } - - public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) { - int count = 0; - - for ( double stat : this ) { - if ( passesThreshold(stat, keepBelow, keepAbove) ) count++; - } - - return count; - } - - public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) { - //System.out.printf("passThreshold %s, %b, %b = %b%n", value, keepBelow, keepAbove, value < threshold && keepBelow || value >= threshold && keepAbove); - return value <= getHighThreshold() && keepBelow || value >= getLowThreshold() && keepAbove; - } - - public boolean passesThreshold(double value) { - switch ( tail ) { - case LeftTailed: - return value >= getLowThreshold(); - case RightTailed: - return value <= getHighThreshold(); - case TwoTailed: - return value >= getLowThreshold() && value < getHighThreshold(); - default: - return true; - } - } - - public double pValue(double value) { - return 1.0; - } - - public int itemIndex(int sortedIndex) { - return permutation[sortedIndex]; - } - - public int nElements() { return size(); } - - public String toString() { - return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold); - } -} - - -class ObservationTable extends HashMap { - String statField = null; - File observationsFile = null; - Logger logger = null; - GenotypeFeatureData.Tail tail; - - public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, IOException { - this.observationsFile = observationsFile; - this.statField = statField; - this.logger = logger; - this.tail = tail; - readAllData(observationsFile, statField); - } - - public Set genotypes() { - return keySet(); - } - - public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException { - ArrayList header = TabularROD.readHeader(observationsFile); - - //logger.info(String.format("Starting to read table %s", observationsFile)); - - for ( String line : new xReadLines(observationsFile) ) { - TabularROD d = new TabularROD("ignoreMe", header); - String[] parts = line.split("\\s+"); - if ( d.parseLine(header, parts) ) { - if (! d.containsKey(statField)) { - throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, line)); - } - double stat = Double.valueOf(d.get(statField)); - - final String genotype = d.get("genotype"); - GenotypeFeatureData gfd = getData(genotype); - gfd.add(stat); - } - } - - for ( GenotypeFeatureData gfd : this.values() ) { - gfd.finalizeData(); - } - - //logger.info(String.format("Read table %s, found %d genotypes/data pairs in %s field", - // observationsFile, size(), statField)); - } - - public GenotypeFeatureData getData(final String genotype) { - if ( ! containsKey(genotype) ) { - GenotypeFeatureData gfd = new GenotypeFeatureData(genotype, logger, tail); - put(genotype, gfd); - } - - return get(genotype); - } - - public String toString() { - return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size()); - } -} public abstract class RatioFilter implements VariantExclusionCriterion { + private static final double defaultMinGenotypeConfidenceToTest = 5.0; // TODO -- must be replaced with true confidence scoore, right now assumes LOD + protected double minGenotypeConfidenceToTest = defaultMinGenotypeConfidenceToTest; + protected double pvalueLimit = -1; - protected File observationsFile = null; - protected String statField = null; // "AlleleRatio"; protected Logger logger = null; // Logger.getLogger(RatioFilter.class); - protected ObservationTable dataTable = null; protected String name = null; - protected GenotypeFeatureData.Tail tail = null; + + protected enum Tail { LeftTailed, RightTailed, TwoTailed } + protected Tail tail = null; + protected double lowThreshold = -1; + protected double highThreshold = -1; protected boolean exclude = false; - /** - * A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption - * that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST. To be - * replaced by the more robust sampling approach outlined below - */ - final private static int MIN_COUNTS_TO_APPLY_TEST = 20; - - final static boolean integrateOverSamplingProbabilities = true; - - public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) { + public RatioFilter(final String name, Class myClass, Tail tail ) { this.name = name; - this.statField = statField; this.tail = tail; logger = Logger.getLogger(myClass); } - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - String[] argPieces = arguments.split(","); - try { - pvalueLimit = Double.valueOf(argPieces[0]); - observationsFile = new File(argPieces[1]); - - dataTable = new ObservationTable(observationsFile, statField, logger, tail); - - logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable)); - for ( String genotype : dataTable.genotypes() ) { - GenotypeFeatureData gfd = dataTable.get(genotype); - gfd.determineThresholds(pvalueLimit); - logger.info(gfd.toString()); - } - } catch ( NumberFormatException e ) { - throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e); - } catch ( FileNotFoundException e ) { - throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e); - } catch ( NoSuchFieldException e ) { - throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e); - } catch ( IOException e ) { - throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e ); - } - } + protected void setLowThreshold(double threshold) { + lowThreshold = threshold; + } + + protected void setHighThreshold(double threshold) { + highThreshold = threshold; } - protected abstract boolean applyToVariant(rodVariants variant); protected abstract Pair scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant); + protected abstract boolean excludeHetsOnly(); public boolean useZeroQualityReads() { return false; } public void compute(VariantContextWindow contextWindow) { - boolean exclude = false; VariantContext context = contextWindow.getContext(); rodVariants variant = context.getVariant(); char ref = context.getReferenceContext().getBase(); ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads())); - if ( applyToVariant(variant) ) { - Pair counts = scoreVariant(ref, pileup, variant); - GenotypeFeatureData gfd = dataTable.get(orderedBases(variant.getBestGenotype())); + Pair counts = scoreVariant(ref, pileup, variant); - if (integrateOverSamplingProbabilities) - exclude = integralExclude(gfd, counts); - else - exclude = pointEstimateExclude(gfd, counts); - - // - // for printing only - // - int n = counts.first + counts.second; - double value = counts.first / (1.0 * counts.first + counts.second); - logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s", - name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n, - value, gfd, exclude, pileup.getBases())); - } - } - - public boolean isExcludable() { - return exclude; - } - - public String getStudyHeader() { - return ""; - } - - public String getStudyInfo() { - return ""; + boolean highGenotypeConfidence = variant.getConsensusConfidence() > minGenotypeConfidenceToTest; + boolean excludable = !excludeHetsOnly() || GenotypeUtils.isHet(variant); + exclude = excludable && highGenotypeConfidence && integralExclude(counts); + // + // for printing only + // + int n = counts.first + counts.second; + double value = counts.first / (1.0 * counts.first + counts.second); + logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, exclude=%b, bases=%s", + name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n, + value, exclude, pileup.getBases())); } private final static double SEARCH_INCREMENT = 0.01; private final static double integralPValueThreshold = 0.05; - private boolean integralExclude(GenotypeFeatureData gfd, Pair counts ) { + private boolean integralExclude(Pair counts ) { double sumExclude = 0.0, sumP = 0.0; int n = counts.first + counts.second; for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) { double p = MathUtils.binomialProbability(counts.first, n, r); sumP += p; - boolean exclude = ! gfd.passesThreshold(r); + boolean exclude = ! passesThreshold(r); sumExclude += p * (exclude ? 1.0 : 0.0); - //System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n", - // counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP); + System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n", + counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP); } double percentExcluded = sumExclude / sumP; return 1 - percentExcluded <= integralPValueThreshold ; } - private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair counts ) { - int n = counts.first + counts.second; - if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false; - double value = counts.first / (1.0 * counts.first + counts.second); - //double value = counts.first / (1.0 * Math.max(counts.second, 1.0)); - return ! gfd.passesThreshold(value); - } - - private String orderedBases(String s) { - char[] charArray = s.toCharArray(); - java.util.Arrays.sort(charArray); - return new String(charArray); + private boolean passesThreshold(double value) { + switch ( tail ) { + case LeftTailed: + return value >= lowThreshold; + case RightTailed: + return value <= highThreshold; + case TwoTailed: + return value >= lowThreshold && value < highThreshold; + default: + return true; + } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java index 95c43dfac..5ddc92562 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java @@ -1,20 +1,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants; import org.broadinstitute.sting.gatk.refdata.rodVariants; -import org.broadinstitute.sting.gatk.contexts.VariantContext; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.GenotypeUtils; -public class VECAlleleBalance implements VariantExclusionCriterion { //extends RatioFilter { - private static final double defaultMinGenotypeConfidenceToTest = 5.0; // TODO -- must be replaced with true confidence scoore, right now assumes LOD +public class VECAlleleBalance extends RatioFilter { - private boolean exclude; - private double lowThreshold = 0.1, highThreshold = 0.85; - private double minGenotypeConfidenceToTest = defaultMinGenotypeConfidenceToTest; + private double lowThreshold = 0.10, highThreshold = 0.85; private double ratio; + public VECAlleleBalance() { + super("Allele Balance Ratio", VECAlleleBalance.class, Tail.TwoTailed); + } + public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { String[] argPieces = arguments.split(","); @@ -23,15 +22,13 @@ public class VECAlleleBalance implements VariantExclusionCriterion { //extends if ( argPieces.length > 2 ) minGenotypeConfidenceToTest = Double.valueOf(argPieces[2]); } + setLowThreshold(lowThreshold); + setHighThreshold(highThreshold); } /** * Return the count of bases matching the major (first) and minor (second) alleles as a pair. * - * @param ref - * @param pileup - * @param variant - * @return */ protected Pair scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) { final String genotype = variant.getBestGenotype(); @@ -49,30 +46,11 @@ public class VECAlleleBalance implements VariantExclusionCriterion { //extends int refCount = a == ref ? aCount : bCount; int altCount = a == ref ? bCount : aCount; - return new Pair(refCount, altCount); + ratio = (double)refCount / (double)altCount; + return new Pair(refCount, altCount); } - public void compute(VariantContextWindow contextWindow) { - VariantContext context = contextWindow.getContext(); - char ref = context.getReferenceContext().getBase(); - rodVariants variant = context.getVariant(); - - ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads())); - Pair counts = scoreVariant(ref, pileup, variant); - int n = counts.first + counts.second; - ratio = counts.first.doubleValue() / (double)n; - - boolean highGenotypeConfidence = variant.getConsensusConfidence() > minGenotypeConfidenceToTest; - boolean failsHetExpectation = GenotypeUtils.isHet(variant) && (ratio < lowThreshold || ratio > highThreshold); - exclude = failsHetExpectation && highGenotypeConfidence; - -// if ( failsHetExpectation ) { -// String header = highGenotypeConfidence ? "FILTER-HIGH-CONFIDENCE" : "PASS-LOW-CONFIDENCE"; -// System.out.printf("[%s] %s getConsensusConfidence() = %f > minGenotypeConfidenceToTest = %f exclude=%b %s%n", -// header, variant.getBestGenotype(), variant.getConsensusConfidence(), minGenotypeConfidenceToTest, exclude, -// ! highGenotypeConfidence ? variant : "" ); -// } - } + protected boolean excludeHetsOnly() { return true; } public boolean useZeroQualityReads() { return false; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java index 1683bb088..44a8accab 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java @@ -1,30 +1,31 @@ package org.broadinstitute.sting.playground.gatk.walkers.variants; import org.broadinstitute.sting.gatk.refdata.rodVariants; -import org.broadinstitute.sting.gatk.contexts.VariantContext; import org.broadinstitute.sting.utils.*; -public class VECOnOffGenotypeRatio implements VariantExclusionCriterion { // extends RatioFilter { - //final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed; - private boolean exclude; +public class VECOnOffGenotypeRatio extends RatioFilter { private double threshold = 0.0; private double ratio; + public VECOnOffGenotypeRatio() { + super("On/Off Genotype Ratio", VECOnOffGenotypeRatio.class, Tail.LeftTailed); + } + public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { - threshold = Double.valueOf(arguments); + String[] argPieces = arguments.split(","); + threshold = Double.valueOf(argPieces[0]); + if ( argPieces.length > 1 ) + minGenotypeConfidenceToTest = Double.valueOf(argPieces[1]); } + setLowThreshold(threshold); } /** * Return the counts of bases that are on (matching the bestGenotype) and off (not matching the * best genotype). On are in the first field, off in the second. * - * @param ref - * @param pileup - * @param variant - * @return - */ + */ protected Pair scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) { final String genotype = variant.getBestGenotype().toUpperCase(); final String bases = pileup.getBases(); @@ -43,19 +44,11 @@ public class VECOnOffGenotypeRatio implements VariantExclusionCriterion { // ext //System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype); } + ratio = (double)on / (double)off; return new Pair(on, off); } - public void compute(VariantContextWindow contextWindow) { - VariantContext context = contextWindow.getContext(); - char ref = context.getReferenceContext().getBase(); - - ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getAlignmentContext(useZeroQualityReads())); - Pair counts = scoreVariant(ref, pileup, context.getVariant()); - int n = counts.first + counts.second; - ratio = counts.first.doubleValue() / (double)n; - exclude = ratio < threshold; - } + protected boolean excludeHetsOnly() { return false; } public double inclusionProbability() { return exclude ? 0.0 : 1.0; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java index d14d7b800..7645bc840 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VariantFiltrationWalker.java @@ -294,6 +294,11 @@ public class VariantFiltrationWalker extends LocusWalker { */ public void onTraversalDone(Integer result) { // move the window over so that we can filter the last few variants + if ( windowInitializer != null ) { + while ( windowInitializer.size() < windowSize ) + windowInitializer.add(null); + variantContextWindow = new VariantContextWindow(windowInitializer); + } for (int i=0; i < windowSize; i++) { variantContextWindow.moveWindow(null); compute();