Added - new tests (Hapmap was re-added)

Modified - Hapmap now takes a -q command to filter out variants by quality
Modified - MathUtils - cumBinomialProbLog now uses BigDecimal to handle some numerical imprecisions
Modified - PowerBelowFrequency - returns 0.0 if called with a negative number (can't be done from inside the walker itself, but since it's called elsewhere one can't be too careful)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2350 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-12-14 21:57:20 +00:00
parent 8e44bfd2ef
commit b42fc905e8
5 changed files with 131 additions and 36 deletions

View File

@ -34,9 +34,11 @@ public class HapmapPoolAllelicInfoWalker extends LocusWalker<String, PrintWriter
public int poolSize = -1;
@Argument(fullName="sampleNames", shortName="samples", doc="Sample name bindings", required=true)
public String sampleNameFile = null;
@Argument(fullName="minCallQuality", shortName="q", doc="Ignore calls with below this quality, defaults to -1")
public double minCallQ = -1;
private PrintWriter output;
private static double EPSILON = Math.pow(10,-4);
private String[] sampleNames = null;
private PowerBelowFrequencyWalker powerWalker = null;
private ConcordanceTruthTable ctt = null;
@ -83,7 +85,7 @@ public class HapmapPoolAllelicInfoWalker extends LocusWalker<String, PrintWriter
Variation call = (Variation) tracker.lookup("calls",null);
if ( call == null ) {
called = 0;
} else if ( call.isReference() ) {
} else if ( call.isReference() || call.getNegLog10PError() < minCallQ-EPSILON ) {
called = 0;
} else {
called = 1;

View File

@ -19,11 +19,15 @@ import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* User: chartl
* Date: Oct 8, 2009
* Time: 9:44:35 AM
* To change this template use File | Settings | File Templates.
*/
/**
* Given an input N, this walker calculates the power to detect a polymorphism with N, N-1, N-2, ..., 1 variant alleles in a pooled setting
*/
@By(DataSource.REFERENCE)
public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
@Argument(fullName="lodThreshold", shortName="lod", doc="Threshold for log likelihood ratio to be called a SNP. Defaults to 3.0", required = false)
@ -159,17 +163,20 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
int kaccept = (int) Math.ceil( ( lodThreshold - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) /
( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) );
System.out.println("Error="+p_error+" snpProp="+snpProp+" alleles="+alleles+" lodThreshold="+lodThreshold+" kaccept="+kaccept);
// we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs
// we can optimize this by checking to see which sum is smaller
if ( depth - kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth]
power = MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp);
} else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept]
power = 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp);
if (kaccept <= 0) {
power = 0.0;
} else {
// we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs
// we can optimize this by checking to see which sum is smaller
if ( depth - kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth]
power = MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp);
} else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept]
power = 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp);
}
}
}
return power;
}

View File

@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils;
import cern.jet.math.Arithmetic;
import java.math.BigDecimal;
/**
* MathUtils is a static class (no instantiation allowed!) with some useful math methods.
*
@ -180,11 +182,22 @@ public class MathUtils {
*/
public static double cumBinomialProbLog(int start, int end, int total, double probHit) {
double cumProb = 0.0;
double prevProb;
BigDecimal probCache = BigDecimal.ZERO;
for(int hits = start; hits < end; hits++) {
cumProb += binomialProbabilityLog(hits, total, probHit);
prevProb = cumProb;
double probability = binomialProbabilityLog(hits,total,probHit);
cumProb += probability;
if ( probability > 0 && cumProb - prevProb < probability/2 ) { // loss of precision
probCache = probCache.add(new BigDecimal(prevProb));
cumProb = 0.0;
hits--; // repeat loop
// prevProb changes at start of loop
}
}
return cumProb;
return probCache.add(new BigDecimal(cumProb)).doubleValue();
}
/**

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: chartl
@ -12,48 +14,44 @@ import org.junit.Test;
*/
public class HapmapPoolAllelicInfoIntegrationTest extends WalkerTest {
@Test
@Test
public void testFHSPool3() {
// Help! I'm broken!
/*String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
+ "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
+ "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
+ "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
+ "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
+ "-ps 40 -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list";
+ "-ps 40 -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list -q 0";
String md5ForThisTest = "da1222d7514f247eae066134d8e3cde3";
WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
executeTest("Pool 3 of FHS Pilot on testbed intervals", spec); */
executeTest("Pool 3 of FHS Pilot on testbed intervals", spec);
}
// todo -- chris must fix
// @Test
// public void testFHSPool3NoIntervals() {
// String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
// + "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
// + "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
// + "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
// + "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
// + "-ps 40";
// String md5ForThisTest = "120f3307d94d613c3559a1051fe3aaef";
// WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
// executeTest("Pool 3 of FHS Pilot without testbed intervals", spec);
// }
@Test
public void testFHSPool3NoIntervals() {
String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_samples.txt "
+ "-B /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_sample_paths.txt "
+ "-B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli "
+ "-I /humgen/gsa-scr1/GATK_Data/Validation_Data/FHSP_pool3_test.bam "
+ "-R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -of %s "
+ "-ps 40 -q 0";
String md5ForThisTest = "9ee3f390657c32c9c4fbe654bcd3cdf9";
WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
executeTest("Testing hapmap sites without reads at them (no testbed intervals)", spec);
}
@Test
public void testSmallPool() {
// Help! I'm broken!
/*String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878_NA12891_samples.txt "
String test_args = "-T HapmapPoolAllelicInfo -samples /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878_NA12891_samples.txt "
+"-B NA12878,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12878.gff "
+"-B NA12891,GFF,/humgen/gsa-scr1/andrewk/hapmap_1kg/gffs/NA12891.gff "
+"-I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.recal.annotation_subset.bam "
+"-L chr1:14000000-18000000 -B calls,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12891.a2b.ssg1b.geli.calls "
+"-of %s -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -ps 2";
+"-of %s -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -ps 2 -q 0";
String md5ForThisTest = "cc81a9d8279e7b6ad50f9dc1f19a7419";
WalkerTestSpec spec = new WalkerTestSpec(test_args, 1, Arrays.asList(md5ForThisTest));
executeTest("Test on pool of two individuals -- CEU father+daughter chips against subset of the father's calls; power at lod 3", spec);
*/
}
}

View File

@ -0,0 +1,75 @@
package org.broadinstitute.sting.playground.gatk.walkers.poolseq;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.junit.Test;
import org.junit.Assert;
import java.math.BigDecimal;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Dec 14, 2009
* Time: 10:19:53 AM
* To change this template use File | Settings | File Templates.
*/
public class PowerTest extends BaseTest {
// test for various items involved in PowerBelowFrequency
@Test
public void testBinomialProbabilityLog() {
logger.warn("Testing binomial probability in log space");
Assert.assertTrue(MathUtils.compareDoubles(MathUtils.binomialProbabilityLog(5,7,0.3),0.02500,1e-4)==0);
Assert.assertTrue(MathUtils.compareDoubles(MathUtils.binomialProbabilityLog(8000,15000,0.6),2.56e-62,1e-64) == 0);
Assert.assertTrue(MathUtils.compareDoubles(MathUtils.binomialProbabilityLog(12000,20000,0.5),7.34e-178,1e-180) == 0);
Assert.assertTrue(MathUtils.compareDoubles(MathUtils.binomialProbabilityLog(600,800,0.75),0.0326,0.0001) == 0);
Assert.assertTrue(MathUtils.compareDoubles(MathUtils.binomialProbabilityLog(50,500,0.2),6.404e-10,1e-13)==0);
Assert.assertTrue(MathUtils.compareDoubles(MathUtils.binomialProbabilityLog(5277,8279,0.60344),1.6023145e-11,1e-18) == 0);
}
@Test
public void testBinomialCDF() {
logger.warn("Testing binomial CDF");
double test1 = MathUtils.cumBinomialProbLog(0,5277,8279,0.64344);
System.out.println("Binomial CDF -- Test 1: "+test1);
Assert.assertTrue(MathUtils.compareDoubles(test1,0.127,5e-3) == 0);
double test2 = MathUtils.cumBinomialProbLog(0,1472,9886,0.172);
System.out.println("Binomial CDF -- Test 2: "+test2);
Assert.assertTrue(MathUtils.compareDoubles(test2,3.1e-10,5e-11)==0);
double test3 = MathUtils.cumBinomialProbLog(0,1472,9886,0.142);
System.out.println("Binomial CDF -- Test 3: "+test3);
Assert.assertTrue(MathUtils.compareDoubles(test3,0.975,5e-3)==0);
double test4 = MathUtils.cumBinomialProbLog(0,6781,21297,0.321742);
System.out.println("Binomial CDF -- Test 4:"+test4);
Assert.assertTrue(MathUtils.compareDoubles(test4,0.150,5e-3)==0);
double test5 = MathUtils.cumBinomialProbLog(0,10609,21297,0.5001);
System.out.println("Binomial CDF == Test 5:"+test5);
Assert.assertTrue(MathUtils.compareDoubles(test5,0.286,5e-3)==0);
}
/* @Test
public void testPowerCalculation() {
PowerBelowFrequencyWalker w = new PowerBelowFrequencyWalker();
w.initialize();
logger.warn("Testing power on pool of 100 people");
w.setPoolSize(100);
// TEST 1
double error = 0.000123;
byte q = QualityUtils.probToQual(1-error);
double pow = w.theoreticalPower(1000,q,1,5);
System.out.println("Qual for "+error+"="+q+" and power="+pow);
Assert.assertTrue(MathUtils.compareDoubles(pow,0.7356,1e-4) == 0);
// TEST 2
error = 0.00512;
q = QualityUtils.probToQual(1-error);
double err2 = QualityUtils.qualToErrorProb(q);
pow = w.theoreticalPower(40,q,12,3);
System.out.println("Qual for "+error+"="+q+" back to error="+err2+" and power="+pow);
Assert.assertTrue(MathUtils.compareDoubles(pow,0.2173,1e-4) == 0);
}*/
}