Re-revert to point estimation for now. We need to do this right, just not yet.

Also, it's safer to let colt do the log factorial calculations for us. 


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1503 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-02 15:33:18 +00:00
parent eb664ae287
commit 55013eff78
4 changed files with 28 additions and 41 deletions

View File

@@ -4,10 +4,12 @@ import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger;
import cern.jet.math.Arithmetic;
public abstract class RatioFilter implements VariantExclusionCriterion {
private static final double defaultMinGenotypeConfidenceToTest = 5.0; // TODO -- must be replaced with true confidence score, right now assumes LOD
private static final int minDepthOfCoverage = 25; // TODO -- must be replaced with a proper probability calculation
protected double minGenotypeConfidenceToTest = defaultMinGenotypeConfidenceToTest;
protected double pvalueLimit = -1;
@@ -50,21 +52,31 @@ public abstract class RatioFilter implements VariantExclusionCriterion {
boolean highGenotypeConfidence = variant.getConsensusConfidence() > minGenotypeConfidenceToTest;
boolean excludable = !excludeHetsOnly() || GenotypeUtils.isHet(variant);
exclude = excludable && highGenotypeConfidence && integralExclude(counts);
exclude = excludable && highGenotypeConfidence && pointEstimateExclude(counts);
//
// for printing only
//
int n = counts.first + counts.second;
double value = counts.first / (1.0 * counts.first + counts.second);
logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, exclude=%b, bases=%s",
logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, exclude=%b, location=%s, bases=%s",
name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n,
value, exclude, pileup.getBases()));
value, exclude, variant.getLocation(), pileup.getBases()));
}
private final static double SEARCH_INCREMENT = 0.01;
private final static double integralPValueThreshold = 0.05;
private boolean integralExclude(Pair<Integer, Integer> counts ) {
// TODO - this whole calculation needs to be redone correctly
private boolean pointEstimateExclude(Pair<Integer, Integer> counts) {
if ( counts.first + counts.second < minDepthOfCoverage )
return false;
int n = counts.first + counts.second;
double ratio = counts.first.doubleValue() / (double)n;
return !passesThreshold(ratio);
}
private boolean integralExclude(Pair<Integer, Integer> counts) {
double sumExclude = 0.0, sumP = 0.0;
int n = counts.first + counts.second;
for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) {

View File

@@ -7,7 +7,7 @@ import org.broadinstitute.sting.utils.Pair;
public class VECAlleleBalance extends RatioFilter {
private double lowThreshold = 0.10, highThreshold = 0.85;
private double lowThreshold = 0.25, highThreshold = 0.75;
private double ratio;
public VECAlleleBalance() {

View File

@@ -5,8 +5,8 @@ import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.BaseUtils;
import net.sf.samtools.SAMRecord;
import cern.jet.math.Arithmetic;
import java.util.ArrayList;
import java.util.List;
@@ -14,13 +14,11 @@ public class VECFisherStrand implements VariantExclusionCriterion {
private double pvalueLimit = 0.00001;
private double pValue;
private boolean exclude;
private ArrayList<Double> factorials = new ArrayList<Double>();
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
pvalueLimit = Double.valueOf(arguments);
}
factorials.add(0.0); // add fact(0)
}
public void compute(VariantContextWindow contextWindow) {
@@ -56,8 +54,6 @@ public class VECFisherStrand implements VariantExclusionCriterion {
if ( !variantIsHet(table) )
return false;
updateFactorials(table);
double pCutoff = computePValue(table);
//printTable(table, pCutoff);
@@ -135,30 +131,16 @@ public class VECFisherStrand implements VariantExclusionCriterion {
int N = rowSums[0] + rowSums[1];
// calculate in log space so we don't die with high numbers
double pCutoff = factorials.get(rowSums[0])
+ factorials.get(rowSums[1])
+ factorials.get(colSums[0])
+ factorials.get(colSums[1])
- factorials.get(table[0][0])
- factorials.get(table[0][1])
- factorials.get(table[1][0])
- factorials.get(table[1][1])
- factorials.get(N);
double pCutoff = Arithmetic.logFactorial(rowSums[0])
+ Arithmetic.logFactorial(rowSums[1])
+ Arithmetic.logFactorial(colSums[0])
+ Arithmetic.logFactorial(colSums[1])
- Arithmetic.logFactorial(table[0][0])
- Arithmetic.logFactorial(table[0][1])
- Arithmetic.logFactorial(table[1][0])
- Arithmetic.logFactorial(table[1][1])
- Arithmetic.logFactorial(N);
return Math.exp(pCutoff);
//pCutoff *= (double) Arithmetic.factorial(rowSums[0]);
//pCutoff *= (double) Arithmetic.factorial(rowSums[1]);
//pCutoff *= (double) Arithmetic.factorial(colSums[0]);
//pCutoff *= (double) Arithmetic.factorial(colSums[1]);
//pCutoff /= (double) Arithmetic.factorial(N);
//
//for (int i = 0; i < 2; i++) {
// for (int j = 0; j < 2; j++) {
// pCutoff /= (double) Arithmetic.factorial(table[i][j]);
// }
//}
//
//return pCutoff;
}
private static int sumRow(int[][] table, int column) {
@@ -213,11 +195,4 @@ public class VECFisherStrand implements VariantExclusionCriterion {
return table;
}
private void updateFactorials(int[][] table) {
// calculate in log space so we don't die with high numbers
int max = table[0][0] + table[0][1] + table[1][0] + table[1][1];
for (int i = factorials.size(); i <= max; i++)
factorials.add(factorials.get(i - 1) + Math.log(i));
}
}

View File

@@ -4,7 +4,7 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.*;
public class VECOnOffGenotypeRatio extends RatioFilter {
private double threshold = 0.0;
private double threshold = 0.8;
private double ratio;
public VECOnOffGenotypeRatio() {