From b282635b05569763173220bf74805d0b20e6d085 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 30 Jul 2009 18:52:13 +0000 Subject: [PATCH] Complete reworking of Fisher's exact test for strand bias: - fixed math bug (pValue needs to be initialized to pCutoff, not 0) - perform factorial calculations in log space so that huge numbers don't explode - cache factorial calculations so that each value needs to be computed just once for any given instance of the filter I've tested it against R and it has held up so far... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1341 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variants/StrandTestPrinter.java | 135 ------------------ .../walkers/variants/VECFisherStrand.java | 57 +++++--- 2 files changed, 40 insertions(+), 152 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java deleted file mode 100755 index be3a15698..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/StrandTestPrinter.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.variants; - -import org.broadinstitute.sting.gatk.LocusContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.io.*; - -@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData={@RMD(name="hapmap",type=HapMapGenotypeROD.class),@RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="variant",type=rodVariants.class)}) -public class StrandTestPrinter extends LocusWalker { - @Argument(fullName="concordance_out", shortName="CO", doc="File to which concordant variant stats should be written", required=true) - File CONCORDANCE_OUT = null; - @Argument(fullName="sample_name", shortName="sample", doc="Sample name (e.g. NA12878)", required=true) - String SAMPLE_NAME = null; - - private PrintWriter writer = null; - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - try { - writer = new PrintWriter(CONCORDANCE_OUT); - } catch (FileNotFoundException e) { - throw new StingException(String.format("Could not open file for writing")); - } - writer.println("location\tvariant\tlodBestVsRef\tStrandScore\tForwardCount\tReverseCount\tInHapmap\tIndbSNP\tOurCall\tHapMapCall"); - } - - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** - * For each site of interest, rescore the genotype likelihoods by applying the specified feature set. - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus is a het site, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { - HapMapGenotypeROD hapmap = (HapMapGenotypeROD)tracker.lookup("hapmap", null); - rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbsnp", null); - rodVariants variant = (rodVariants)tracker.lookup("variant", null); - - StringBuffer sb = new StringBuffer(); - char refUpper = Character.toUpperCase(ref); - - boolean realHetVariant = false, realVariant = false, calledHetVariant = false; - if ( hapmap != null ) { - String genotype = hapmap.get(SAMPLE_NAME).toUpperCase(); - if ( genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper ) { - realVariant = true; - if ( genotype.charAt(0) != genotype.charAt(1) ) { - realHetVariant = true; - sb.append(context.getLocation() + "\t" + genotype + "\t"); - } - } - } - if ( variant != null ) { - String genotype = variant.getBestGenotype().toUpperCase(); - if ( (genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper) && - (genotype.charAt(0) != genotype.charAt(1)) ) { - calledHetVariant = true; - if ( !realHetVariant ) - sb.append(context.getLocation() + "\t" + genotype + "\t"); - } - } - - if ( !realHetVariant && !calledHetVariant ) - return 0; - - if ( variant != null ) - sb.append(variant.getLodBtr() + "\t"); - else - sb.append("-1\t"); - - int allele1, allele2; - if ( hapmap != null ) { - allele1 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(0)); - allele2 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(1)); - } else { - allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0)); - allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1)); - } - VECFisherStrand.strandTest(ref, context, allele1, allele2, -1, sb); - - sb.append((hapmap != null ? "1" : "0") + "\t" + (dbsnp != null ? "1" : "0") + "\t"); - if ( variant == null) - sb.append("REF\t"); - else if ( calledHetVariant ) - sb.append("HET\t"); - else - sb.append("HOMNONREF\t"); - - if ( realHetVariant ) - sb.append("HET\n"); - else if ( realVariant ) - sb.append("HOMNONREF\n"); - else - sb.append("REF\n"); - - writer.print(sb.toString()); - - return 0; - } - - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - /** - * Tell the user the number of hets processed and close out the files. - * - * @param result the number of variants seen. - */ - public void onTraversalDone(Integer result) { - out.printf("Processed %d hets.\n", result); - - writer.close(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java index 91415685f..0eebfbc80 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECFisherStrand.java @@ -5,18 +5,20 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants; import org.broadinstitute.sting.utils.BaseUtils; import net.sf.samtools.SAMRecord; +import java.util.ArrayList; import java.util.List; -import cern.jet.math.Arithmetic; public class VECFisherStrand implements VariantExclusionCriterion { - private double pvalueLimit = 0.0001; + private double pvalueLimit = 0.00001; private boolean exclude; + private ArrayList factorials = new ArrayList(); public void initialize(String arguments) { if (arguments != null && !arguments.isEmpty()) { pvalueLimit = Double.valueOf(arguments); } + factorials.add(0.0); // add fact(0) } public void compute(char ref, LocusContext context, rodVariants variant) { @@ -44,15 +46,17 @@ public class VECFisherStrand implements VariantExclusionCriterion { public boolean useZeroQualityReads() { return false; } - public static boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) { + public boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) { int[][] table = getContingencyTable(context, allele1, allele2); if ( !variantIsHet(table) ) return false; + updateFactorials(table); + double pCutoff = computePValue(table); //printTable(table, pCutoff); - double pValue = 0.0; + double pValue = pCutoff; while (rotateTable(table)) { double pValuePiece = computePValue(table); @@ -119,25 +123,37 @@ public class VECFisherStrand implements VariantExclusionCriterion { return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false; } - private static double computePValue(int[][] table) { - double pCutoff = 1.0; + private double computePValue(int[][] table) { int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; int N = rowSums[0] + rowSums[1]; - pCutoff *= (double) Arithmetic.factorial(rowSums[0]); - pCutoff *= (double) Arithmetic.factorial(rowSums[1]); - pCutoff *= (double) Arithmetic.factorial(colSums[0]); - pCutoff *= (double) Arithmetic.factorial(colSums[1]); - pCutoff /= (double) Arithmetic.factorial(N); + // calculate in log space so we don't die with high numbers + double pCutoff = factorials.get(rowSums[0]) + + factorials.get(rowSums[1]) + + factorials.get(colSums[0]) + + factorials.get(colSums[1]) + - factorials.get(table[0][0]) + - factorials.get(table[0][1]) + - factorials.get(table[1][0]) + - factorials.get(table[1][1]) + - factorials.get(N); + return Math.exp(pCutoff); - for (int i = 0; i < 2; i++) { - for (int j = 0; j < 2; j++) { - pCutoff /= (double) Arithmetic.factorial(table[i][j]); - } - } - return pCutoff; + //pCutoff *= (double) Arithmetic.factorial(rowSums[0]); + //pCutoff *= (double) Arithmetic.factorial(rowSums[1]); + //pCutoff *= (double) Arithmetic.factorial(colSums[0]); + //pCutoff *= (double) Arithmetic.factorial(colSums[1]); + //pCutoff /= (double) Arithmetic.factorial(N); + // + //for (int i = 0; i < 2; i++) { + // for (int j = 0; j < 2; j++) { + // pCutoff /= (double) Arithmetic.factorial(table[i][j]); + // } + //} + // + //return pCutoff; } private static int sumRow(int[][] table, int column) { @@ -192,4 +208,11 @@ public class VECFisherStrand implements VariantExclusionCriterion { return table; } + + private void updateFactorials(int[][] table) { + // calculate in log space so we don't die with high numbers + int max = table[0][0] + table[0][1] + table[1][0] + table[1][1]; + for (int i = factorials.size(); i <= max; i++) + factorials.add(factorials.get(i - 1) + Math.log(i)); + } }