Complete reworking of Fisher's exact test for strand bias:

- fixed math bug (pValue needs to be initialized to pCutoff, not 0)
- perform factorial calculations in log space so that huge numbers don't explode
- cache factorial calculations so that each value needs to be computed just once for any given instance of the filter

I've tested it against R and it has held up so far...



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1341 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-07-30 18:52:13 +00:00
parent 4033c718d2
commit b282635b05
2 changed files with 40 additions and 152 deletions

View File

@ -1,135 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.*;
@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData={@RMD(name="hapmap",type=HapMapGenotypeROD.class),@RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="variant",type=rodVariants.class)})
public class StrandTestPrinter extends LocusWalker<Integer, Integer> {

    @Argument(fullName="concordance_out", shortName="CO", doc="File to which concordant variant stats should be written", required=true)
    File CONCORDANCE_OUT = null;

    @Argument(fullName="sample_name", shortName="sample", doc="Sample name (e.g. NA12878)", required=true)
    String SAMPLE_NAME = null;

    // Sink for the tab-delimited per-site rows; opened in initialize(),
    // closed in onTraversalDone().
    private PrintWriter writer = null;

    /**
     * Open the output file and write the tab-delimited column header.
     *
     * @throws StingException if the output file cannot be opened for writing
     */
    public void initialize() {
        try {
            writer = new PrintWriter(CONCORDANCE_OUT);
        } catch (FileNotFoundException e) {
            throw new StingException(String.format("Could not open file for writing"));
        }
        writer.println("location\tvariant\tlodBestVsRef\tStrandScore\tForwardCount\tReverseCount\tInHapmap\tIndbSNP\tOurCall\tHapMapCall");
    }

    /**
     * Initialize the reduce accumulator (number of loci processed) to zero.
     *
     * @return 0
     */
    public Integer reduceInit() { return 0; }

    /**
     * For each locus, decide whether it is a het site according to either the
     * HapMap truth genotype or our own variant call; if so, emit one
     * tab-delimited row of strand-bias and concordance statistics.
     *
     * NOTE(review): both exit paths return 0, so the count accumulated by
     * reduce() and printed as "Processed %d hets." in onTraversalDone() will
     * always be 0.
     *
     * @param tracker the meta-data tracker
     * @param ref the reference base
     * @param context the context for the given locus
     * @return always 0 in this implementation
     */
    public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
        HapMapGenotypeROD hapmap = (HapMapGenotypeROD)tracker.lookup("hapmap", null);
        rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbsnp", null);
        rodVariants variant = (rodVariants)tracker.lookup("variant", null);

        StringBuffer sb = new StringBuffer();
        char refUpper = Character.toUpperCase(ref);
        boolean realHetVariant = false, realVariant = false, calledHetVariant = false;

        // Truth side: a "real" variant if the HapMap genotype for this sample
        // carries any non-reference allele; a real het if its two alleles differ.
        // The row is started here (location + genotype) only for real hets.
        if ( hapmap != null ) {
            String genotype = hapmap.get(SAMPLE_NAME).toUpperCase();
            if ( genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper ) {
                realVariant = true;
                if ( genotype.charAt(0) != genotype.charAt(1) ) {
                    realHetVariant = true;
                    sb.append(context.getLocation() + "\t" + genotype + "\t");
                }
            }
        }

        // Call side: the same het test applied to our best-genotype call; the
        // row is started here only if the truth side did not already start it.
        if ( variant != null ) {
            String genotype = variant.getBestGenotype().toUpperCase();
            if ( (genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper) &&
                 (genotype.charAt(0) != genotype.charAt(1)) ) {
                calledHetVariant = true;
                if ( !realHetVariant )
                    sb.append(context.getLocation() + "\t" + genotype + "\t");
            }
        }

        // Only sites that are het by truth or by call are reported.
        if ( !realHetVariant && !calledHetVariant )
            return 0;

        // LOD of best genotype vs. reference; -1 sentinel when no call was made.
        if ( variant != null )
            sb.append(variant.getLodBtr() + "\t");
        else
            sb.append("-1\t");

        // Allele indices for the strand test: prefer the HapMap genotype and
        // fall back to our own call. If hapmap is null here, calledHetVariant
        // must be true (guard above), so variant is non-null in the else branch.
        int allele1, allele2;
        if ( hapmap != null ) {
            allele1 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(0));
            allele2 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(1));
        } else {
            allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0));
            allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1));
        }

        // Presumably appends the StrandScore/ForwardCount/ReverseCount columns
        // to sb (matches the header written in initialize()) — TODO confirm
        // against VECFisherStrand.strandTest.
        VECFisherStrand.strandTest(ref, context, allele1, allele2, -1, sb);

        // Membership flags for the HapMap truth and dbSNP tracks.
        sb.append((hapmap != null ? "1" : "0") + "\t" + (dbsnp != null ? "1" : "0") + "\t");

        // Our call classification: no record => REF, het call => HET,
        // otherwise hom non-ref.
        if ( variant == null)
            sb.append("REF\t");
        else if ( calledHetVariant )
            sb.append("HET\t");
        else
            sb.append("HOMNONREF\t");

        // HapMap (truth) classification, same three-way split.
        if ( realHetVariant )
            sb.append("HET\n");
        else if ( realVariant )
            sb.append("HOMNONREF\n");
        else
            sb.append("REF\n");

        writer.print(sb.toString());
        return 0;
    }

    /**
     * Increment the number of loci processed.
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return the new number of loci processed.
     */
    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    /**
     * Tell the user the number of hets processed and close out the files.
     *
     * @param result the number of variants seen.
     */
    public void onTraversalDone(Integer result) {
        out.printf("Processed %d hets.\n", result);
        writer.close();
    }
}

View File

@ -5,18 +5,20 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.BaseUtils;
import net.sf.samtools.SAMRecord;
import java.util.ArrayList;
import java.util.List;
import cern.jet.math.Arithmetic;
public class VECFisherStrand implements VariantExclusionCriterion {
private double pvalueLimit = 0.0001;
private double pvalueLimit = 0.00001;
private boolean exclude;
private ArrayList<Double> factorials = new ArrayList<Double>();
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
pvalueLimit = Double.valueOf(arguments);
}
factorials.add(0.0); // add fact(0)
}
public void compute(char ref, LocusContext context, rodVariants variant) {
@ -44,15 +46,17 @@ public class VECFisherStrand implements VariantExclusionCriterion {
public boolean useZeroQualityReads() { return false; }
public static boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) {
public boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) {
int[][] table = getContingencyTable(context, allele1, allele2);
if ( !variantIsHet(table) )
return false;
updateFactorials(table);
double pCutoff = computePValue(table);
//printTable(table, pCutoff);
double pValue = 0.0;
double pValue = pCutoff;
while (rotateTable(table)) {
double pValuePiece = computePValue(table);
@ -119,25 +123,37 @@ public class VECFisherStrand implements VariantExclusionCriterion {
return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false;
}
private static double computePValue(int[][] table) {
double pCutoff = 1.0;
private double computePValue(int[][] table) {
int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
int N = rowSums[0] + rowSums[1];
pCutoff *= (double) Arithmetic.factorial(rowSums[0]);
pCutoff *= (double) Arithmetic.factorial(rowSums[1]);
pCutoff *= (double) Arithmetic.factorial(colSums[0]);
pCutoff *= (double) Arithmetic.factorial(colSums[1]);
pCutoff /= (double) Arithmetic.factorial(N);
// calculate in log space so we don't die with high numbers
double pCutoff = factorials.get(rowSums[0])
+ factorials.get(rowSums[1])
+ factorials.get(colSums[0])
+ factorials.get(colSums[1])
- factorials.get(table[0][0])
- factorials.get(table[0][1])
- factorials.get(table[1][0])
- factorials.get(table[1][1])
- factorials.get(N);
return Math.exp(pCutoff);
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
pCutoff /= (double) Arithmetic.factorial(table[i][j]);
}
}
return pCutoff;
//pCutoff *= (double) Arithmetic.factorial(rowSums[0]);
//pCutoff *= (double) Arithmetic.factorial(rowSums[1]);
//pCutoff *= (double) Arithmetic.factorial(colSums[0]);
//pCutoff *= (double) Arithmetic.factorial(colSums[1]);
//pCutoff /= (double) Arithmetic.factorial(N);
//
//for (int i = 0; i < 2; i++) {
// for (int j = 0; j < 2; j++) {
// pCutoff /= (double) Arithmetic.factorial(table[i][j]);
// }
//}
//
//return pCutoff;
}
private static int sumRow(int[][] table, int column) {
@ -192,4 +208,11 @@ public class VECFisherStrand implements VariantExclusionCriterion {
return table;
}
/**
 * Grow the memoized log-factorial cache so that log(n!) is available for
 * every n up to this 2x2 table's grand total (the Fisher test below needs
 * log-factorials of all cell values, row/column sums, and N).
 *
 * Works in log space via the recurrence log(n!) = log((n-1)!) + log(n),
 * so large counts never overflow a double.
 *
 * @param table the 2x2 contingency table whose grand total bounds the cache
 */
private void updateFactorials(int[][] table) {
    int grandTotal = table[0][0] + table[0][1] + table[1][0] + table[1][1];
    int next = factorials.size();
    while (next <= grandTotal) {
        factorials.add(factorials.get(next - 1) + Math.log(next));
        next++;
    }
}
}