Complete reworking of Fisher's exact test for strand bias:

- fixed math bug (pValue needs to be initialized to pCutoff, not 0)
- perform factorial calculations in log space so that huge numbers don't explode
- cache factorial calculations so that each value needs to be computed just once for any given instance of the filter

I've tested it against R and it has held up so far...

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1341 348d0f76-0448-11de-a6fe-93d51630548a
parent 4033c718d2
commit b282635b05
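To make the reworked test concrete, here is a minimal standalone sketch of the same technique: log-space factorials, cached so each value is computed once, and a two-sided p-value that includes the observed table. This is an illustration only; the class LogSpaceFisherDemo and its method names are invented here and are not part of the GATK codebase.

import java.util.ArrayList;

// Illustration only: a self-contained two-sided Fisher's exact test for a
// 2x2 table, using the log-space factorial cache described in the commit.
public class LogSpaceFisherDemo {
    // logFactorials.get(n) == log(n!); grown lazily so each value is computed once
    private static final ArrayList<Double> logFactorials = new ArrayList<Double>();
    static { logFactorials.add(0.0); } // log(0!) = 0

    private static double logFactorial(int n) {
        for (int i = logFactorials.size(); i <= n; i++)
            logFactorials.add(logFactorials.get(i - 1) + Math.log(i)); // log(i!) = log((i-1)!) + log(i)
        return logFactorials.get(n);
    }

    // Hypergeometric probability of one 2x2 table, computed in log space
    private static double tableProbability(int a, int b, int c, int d) {
        double logP = logFactorial(a + b) + logFactorial(c + d)  // row sums
                    + logFactorial(a + c) + logFactorial(b + d)  // column sums
                    - logFactorial(a) - logFactorial(b)
                    - logFactorial(c) - logFactorial(d)
                    - logFactorial(a + b + c + d);
        return Math.exp(logP);
    }

    // Two-sided test: sum the probabilities of every table with the same margins
    // that is no more probable than the observed one. The observed table itself
    // is in that set, which is exactly the "pValue starts at pCutoff" fix.
    public static double fisherExact(int a, int b, int c, int d) {
        double pObserved = tableProbability(a, b, c, d);
        int row1 = a + b, row2 = c + d, col1 = a + c;
        double pValue = 0.0;
        for (int x = Math.max(0, col1 - row2); x <= Math.min(row1, col1); x++) {
            double p = tableProbability(x, row1 - x, col1 - x, row2 - (col1 - x));
            if (p <= pObserved + 1e-12) // tolerance for floating-point ties
                pValue += p;
        }
        return pValue;
    }

    public static void main(String[] args) {
        System.out.printf("p = %.4f%n", fisherExact(3, 1, 1, 3)); // prints p = 0.4857
    }
}

Run on the table {{3, 1}, {1, 3}} this prints p = 0.4857, which matches R's fisher.test(matrix(c(3, 1, 1, 3), nrow = 2)), the same kind of R cross-check the commit message mentions.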
StrandTestPrinter.java (deleted):
@@ -1,135 +0,0 @@
-package org.broadinstitute.sting.playground.gatk.walkers.variants;
-
-import org.broadinstitute.sting.gatk.LocusContext;
-import org.broadinstitute.sting.gatk.refdata.*;
-import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.cmdLine.Argument;
-
-import java.io.*;
-
-@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData={@RMD(name="hapmap",type=HapMapGenotypeROD.class),@RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="variant",type=rodVariants.class)})
-public class StrandTestPrinter extends LocusWalker<Integer, Integer> {
-    @Argument(fullName="concordance_out", shortName="CO", doc="File to which concordant variant stats should be written", required=true)
-    File CONCORDANCE_OUT = null;
-
-    @Argument(fullName="sample_name", shortName="sample", doc="Sample name (e.g. NA12878)", required=true)
-    String SAMPLE_NAME = null;
-
-    private PrintWriter writer = null;
-
-    /**
-     * Prepare the output file and the list of available features.
-     */
-    public void initialize() {
-        try {
-            writer = new PrintWriter(CONCORDANCE_OUT);
-        } catch (FileNotFoundException e) {
-            throw new StingException(String.format("Could not open file for writing"));
-        }
-        writer.println("location\tvariant\tlodBestVsRef\tStrandScore\tForwardCount\tReverseCount\tInHapmap\tIndbSNP\tOurCall\tHapMapCall");
-    }
-
-    /**
-     * Initialize the number of loci processed to zero.
-     *
-     * @return 0
-     */
-    public Integer reduceInit() { return 0; }
-
-    /**
-     * For each site of interest, rescore the genotype likelihoods by applying the specified feature set.
-     *
-     * @param tracker the meta-data tracker
-     * @param ref     the reference base
-     * @param context the context for the given locus
-     * @return 1 if the locus is a het site, 0 if otherwise
-     */
-    public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
-        HapMapGenotypeROD hapmap = (HapMapGenotypeROD)tracker.lookup("hapmap", null);
-        rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbsnp", null);
-        rodVariants variant = (rodVariants)tracker.lookup("variant", null);
-
-        StringBuffer sb = new StringBuffer();
-        char refUpper = Character.toUpperCase(ref);
-
-        boolean realHetVariant = false, realVariant = false, calledHetVariant = false;
-        if ( hapmap != null ) {
-            String genotype = hapmap.get(SAMPLE_NAME).toUpperCase();
-            if ( genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper ) {
-                realVariant = true;
-                if ( genotype.charAt(0) != genotype.charAt(1) ) {
-                    realHetVariant = true;
-                    sb.append(context.getLocation() + "\t" + genotype + "\t");
-                }
-            }
-        }
-        if ( variant != null ) {
-            String genotype = variant.getBestGenotype().toUpperCase();
-            if ( (genotype.charAt(0) != refUpper || genotype.charAt(1) != refUpper) &&
-                 (genotype.charAt(0) != genotype.charAt(1)) ) {
-                calledHetVariant = true;
-                if ( !realHetVariant )
-                    sb.append(context.getLocation() + "\t" + genotype + "\t");
-            }
-        }
-
-        if ( !realHetVariant && !calledHetVariant )
-            return 0;
-
-        if ( variant != null )
-            sb.append(variant.getLodBtr() + "\t");
-        else
-            sb.append("-1\t");
-
-        int allele1, allele2;
-        if ( hapmap != null ) {
-            allele1 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(0));
-            allele2 = BaseUtils.simpleBaseToBaseIndex(hapmap.get(SAMPLE_NAME).charAt(1));
-        } else {
-            allele1 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(0));
-            allele2 = BaseUtils.simpleBaseToBaseIndex(variant.getBestGenotype().charAt(1));
-        }
-        VECFisherStrand.strandTest(ref, context, allele1, allele2, -1, sb);
-
-        sb.append((hapmap != null ? "1" : "0") + "\t" + (dbsnp != null ? "1" : "0") + "\t");
-        if ( variant == null )
-            sb.append("REF\t");
-        else if ( calledHetVariant )
-            sb.append("HET\t");
-        else
-            sb.append("HOMNONREF\t");
-
-        if ( realHetVariant )
-            sb.append("HET\n");
-        else if ( realVariant )
-            sb.append("HOMNONREF\n");
-        else
-            sb.append("REF\n");
-
-        writer.print(sb.toString());
-
-        return 0;
-    }
-
-    /**
-     * Increment the number of loci processed.
-     *
-     * @param value result of the map.
-     * @param sum   accumulator for the reduce.
-     * @return the new number of loci processed.
-     */
-    public Integer reduce(Integer value, Integer sum) {
-        return sum + value;
-    }
-
-    /**
-     * Tell the user the number of hets processed and close out the files.
-     *
-     * @param result the number of variants seen.
-     */
-    public void onTraversalDone(Integer result) {
-        out.printf("Processed %d hets.\n", result);
-
-        writer.close();
-    }
-}
VECFisherStrand.java:
@@ -5,18 +5,20 @@ import org.broadinstitute.sting.gatk.refdata.rodVariants;
 import org.broadinstitute.sting.utils.BaseUtils;
 import net.sf.samtools.SAMRecord;

 import java.util.ArrayList;
 import java.util.List;

 import cern.jet.math.Arithmetic;

 public class VECFisherStrand implements VariantExclusionCriterion {
-    private double pvalueLimit = 0.0001;
+    private double pvalueLimit = 0.00001;
     private boolean exclude;
+    private ArrayList<Double> factorials = new ArrayList<Double>();

     public void initialize(String arguments) {
         if (arguments != null && !arguments.isEmpty()) {
             pvalueLimit = Double.valueOf(arguments);
         }
+        factorials.add(0.0); // add fact(0)
     }

     public void compute(char ref, LocusContext context, rodVariants variant) {
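A note on the cache seeding above: factorials stores log(n!) values, and adding 0.0 for n = 0 is the base case log(0!) = log(1) = 0 of the recurrence log(n!) = log((n-1)!) + log(n) that updateFactorials (in the last hunk below) uses to extend the table.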
@@ -44,15 +46,17 @@ public class VECFisherStrand implements VariantExclusionCriterion {

     public boolean useZeroQualityReads() { return false; }

-    public static boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) {
+    public boolean strandTest(char ref, LocusContext context, int allele1, int allele2, double threshold, StringBuffer out) {
         int[][] table = getContingencyTable(context, allele1, allele2);
         if ( !variantIsHet(table) )
             return false;

+        updateFactorials(table);
+
         double pCutoff = computePValue(table);
         //printTable(table, pCutoff);

-        double pValue = 0.0;
+        double pValue = pCutoff;
         while (rotateTable(table)) {
             double pValuePiece = computePValue(table);
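Why the initialization fix matters: the two-sided p-value is the sum of the probabilities of all tables with the same row and column sums that are at least as extreme as (i.e., no more probable than) the observed one,

    pValue = sum of Pr(T) over all tables T with Pr(T) <= Pr(observed table),

and the observed table itself belongs to that set. Initializing the accumulator to 0.0 silently dropped its term; initializing it to pCutoff, the observed table's probability computed just above, includes it. This is the math bug called out in the commit message.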
@@ -119,25 +123,37 @@ public class VECFisherStrand implements VariantExclusionCriterion {
         return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false;
     }

-    private static double computePValue(int[][] table) {
-        double pCutoff = 1.0;
+    private double computePValue(int[][] table) {

         int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
         int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
         int N = rowSums[0] + rowSums[1];

-        pCutoff *= (double) Arithmetic.factorial(rowSums[0]);
-        pCutoff *= (double) Arithmetic.factorial(rowSums[1]);
-        pCutoff *= (double) Arithmetic.factorial(colSums[0]);
-        pCutoff *= (double) Arithmetic.factorial(colSums[1]);
-        pCutoff /= (double) Arithmetic.factorial(N);
+        // calculate in log space so we don't die with high numbers
+        double pCutoff = factorials.get(rowSums[0])
+                       + factorials.get(rowSums[1])
+                       + factorials.get(colSums[0])
+                       + factorials.get(colSums[1])
+                       - factorials.get(table[0][0])
+                       - factorials.get(table[0][1])
+                       - factorials.get(table[1][0])
+                       - factorials.get(table[1][1])
+                       - factorials.get(N);
+        return Math.exp(pCutoff);

-        for (int i = 0; i < 2; i++) {
-            for (int j = 0; j < 2; j++) {
-                pCutoff /= (double) Arithmetic.factorial(table[i][j]);
-            }
-        }
-        return pCutoff;
+        //pCutoff *= (double) Arithmetic.factorial(rowSums[0]);
+        //pCutoff *= (double) Arithmetic.factorial(rowSums[1]);
+        //pCutoff *= (double) Arithmetic.factorial(colSums[0]);
+        //pCutoff *= (double) Arithmetic.factorial(colSums[1]);
+        //pCutoff /= (double) Arithmetic.factorial(N);
+        //
+        //for (int i = 0; i < 2; i++) {
+        //    for (int j = 0; j < 2; j++) {
+        //        pCutoff /= (double) Arithmetic.factorial(table[i][j]);
+        //    }
+        //}
+        //
+        //return pCutoff;
     }

     private static int sumRow(int[][] table, int column) {
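For reference, the quantity the rewritten computePValue returns is the hypergeometric probability of a 2x2 table with entries a, b, c, d, row sums R1, R2, column sums C1, C2, and grand total N:

    Pr(table) = (R1! * R2! * C1! * C2!) / (N! * a! * b! * c! * d!)

Taking logs turns the products and quotients into the sums and differences of factorials.get(...) terms above. That is what keeps high-depth sites from overflowing: a double can hold 170! but not 171!, whereas log(170!) is only about 706.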
@@ -192,4 +208,11 @@ public class VECFisherStrand implements VariantExclusionCriterion {

         return table;
     }
+
+    private void updateFactorials(int[][] table) {
+        // calculate in log space so we don't die with high numbers
+        int max = table[0][0] + table[0][1] + table[1][0] + table[1][1];
+        for (int i = factorials.size(); i <= max; i++)
+            factorials.add(factorials.get(i - 1) + Math.log(i));
+    }
 }
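Because updateFactorials extends the cache up to the table's grand total before computePValue runs, every factorials.get(...) index used there is guaranteed to be in range; and since the cache only ever grows, each log-factorial is computed at most once over the lifetime of a filter instance, which is the caching win described in the commit message.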