Now works on single samples and computes metrics.

Here is an example metrics output from a very tiny region:

	Allele Frequency Metrics (LOD >= 5)
	-------------------------------------------------
	Total loci                         : 14704
	Total called with confidence       : 10920 (74.27%)
	Number of Variants                 : 16 (0.15%) (1/682)
    Fraction of variant sites in dbSNP : 100.00%

Missing:
    Microarray(hapmap) concordance, tp/fp.

Optional:
    Histograms of depth of coverage, LOD, observed allele frequency, etc.



Still to implement:
    Propagate command line argument N (number of chromosomes) into walker to enable pooled calling.
    Take allele frequency priors as input.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@133 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-03-22 15:45:12 +00:00
parent f7ad17016d
commit 4e0cd6ab84
2 changed files with 100 additions and 78 deletions

View File

@ -17,72 +17,96 @@ import java.util.List;
* To change this template use File | Settings | File Templates. * To change this template use File | Settings | File Templates.
*/ */
public class AlleleFrequencyMetricsWalker extends BasicLociWalker<Integer, Integer> { public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequencyEstimate, String>
{
long dbsnp_tp=0; long dbsnp_hits=0;
long dbsnp_fp=0; long num_variants=0;
long num_snps=0; long num_loci_total=0;
long num_loci=0; long num_loci_confident=0;
double LOD_cutoff = 5; double LOD_cutoff = 5;
//public void calculateMetrics(List<ReferenceOrderedDatum> rodData, AlleleFrequencyWalker.AlleleFrequencyEstimate alleleFreq) { AlleleFrequencyWalker caller;
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//AlleleFrequencyWalker = AlleleFrequencyWalker();
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyWalker().map(rodData, ref, context);
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context)
{
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context);
boolean is_dbSNP_SNP = false; boolean is_dbSNP_SNP = false;
for ( ReferenceOrderedDatum datum : rodData ) { for ( ReferenceOrderedDatum datum : rodData )
if ( datum != null && datum instanceof rodDbSNP) { {
if ( datum != null && datum instanceof rodDbSNP)
{
rodDbSNP dbsnp = (rodDbSNP)datum; rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true; if (dbsnp.isSNP()) is_dbSNP_SNP = true;
} }
} }
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff) { // we confidently called it a SNP! num_loci_total += 1;
if (is_dbSNP_SNP) {
dbsnp_tp += 1; if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
}else{
dbsnp_fp += 1; if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff)
{
// Confident variant.
num_variants += 1;
if (is_dbSNP_SNP)
{
dbsnp_hits += 1;
} }
} }
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff) { return alleleFreq;
//System.out.println(alleleFreq.getLogOddsVarRef()); }
num_snps++;
public void printMetrics()
{
if (num_loci_total == 0) { return; }
System.out.printf("\n");
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
if (num_variants != 0)
{
System.out.printf("METRICS Number of Variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
} }
num_loci++; System.out.println();
return 1;
} }
public void printMetrics() { public void onTraversalDone()
System.out.println("\nAllele Frequency Metrics:\n"); {
System.out.printf("Precision of LOD >= %.0f SNPs w.r.t dbSNP: %.2f\n", LOD_cutoff, (float)dbsnp_tp / (dbsnp_fp + dbsnp_tp) * 100);
System.out.printf("\\--TP: %d\n", dbsnp_tp);
System.out.printf("\\--FP: %d\n", dbsnp_fp);
System.out.println();
System.out.printf("SNPs (LOD > %.0f): %d\n", LOD_cutoff, num_snps);
System.out.printf("Total loci: %d\n", num_loci);
System.out.printf("SNPs / loci: 1/%.0f\n", (float)num_loci/num_snps);
System.out.println();
}
public void onTraversalDone() {
printMetrics(); printMetrics();
} }
public String reduceInit()
{
caller = new AlleleFrequencyWalker();
return "";
}
public Integer reduceInit() { return 0; } public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5))
{
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
public Integer reduce(Integer alleleFreq, Integer sum) { if (this.num_loci_total % 10000 == 0) { printMetrics(); }
//System.out.printf("%s %.2f\n", alleleFreq.asString(), alleleFreq.logOddsVarRef); return "null";
return 0;//value + sum;
} }

View File

@ -49,7 +49,6 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
for (byte b : bases) for (byte b : bases)
base_counts[nuc2num[b]]++; base_counts[nuc2num[b]]++;
// Find alternate allele - 2nd most frequent non-ref allele // Find alternate allele - 2nd most frequent non-ref allele
// (maybe we should check for ties and eval both or check most common including quality scores) // (maybe we should check for ties and eval both or check most common including quality scores)
int altnum = -1; // start with first base numerical identity (0,1,2,3) int altnum = -1; // start with first base numerical identity (0,1,2,3)
@ -62,7 +61,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
} }
assert(altnum != -1); assert(altnum != -1);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(N, bases, quals, refnum, altnum); AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length());
// Print dbSNP data if its there // Print dbSNP data if its there
if (false) { if (false) {
@ -74,20 +73,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
} }
} }
} }
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
context.getLocation(),
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
base_string.length()));
return alleleFreq; return alleleFreq;
} }
public AlleleFrequencyEstimate AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum) public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
{ {
// q = hypothetical %nonref // q = hypothetical %nonref
@ -99,7 +88,6 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
// b = number of bases at locus // b = number of bases at locus
double epsilon = 0; // 1e-2; double epsilon = 0; // 1e-2;
double qstar; double qstar;
int qstar_N; int qstar_N;
@ -111,36 +99,45 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
double best_qhat = Math.log10(0); double best_qhat = Math.log10(0);
double best_posterior = Math.log10(0); double best_posterior = Math.log10(0);
double best_pDq = Math.log10(0);
double best_pqG = Math.log10(0);
double best_pG = Math.log10(0);
for (double q=0.0; q <= qend; q += qstep) // hypothetic allele balance that we sample over for (double q=0.0; q <= qend; q += qstep) // hypothetic allele balance that we sample over
{ {
long q_R = Math.round(q*bases.length); long q_R = Math.round(q*bases.length);
for (qstar = epsilon, qstar_N = 0; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) // qstar - true allele balances for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) // qstar - true allele balances
{ {
// for N=2: these are 0.0 + epsilon, 0.5, 1.0 - epsilon corresponding to reference, het-SNP, homo-SNP // for N=2: these are 0.0 + epsilon, 0.5, 1.0 - epsilon corresponding to reference, het-SNP, homo-SNP
double pDq = P_D_q(bases, quals, q, refnum, altnum); double pDq = P_D_q(bases, quals, q, refnum, altnum);
double pqG = P_q_G(bases, N, q, qstar, q_R); double pqG = P_q_G(bases, N, q, qstar, q_R);
double pG = P_G(N, qstar_N); //= P_G(N, qstar); double pG = P_G(N, qstar_N);
double posterior = pDq + pqG; // + pG; double posterior = pDq + pqG + pG;
if (posterior > best_posterior) if (posterior > best_posterior)
{ {
best_qstar = qstar; best_qstar = qstar;
best_qhat = q; best_qhat = q;
best_posterior = posterior; best_posterior = posterior;
best_pDq = pDq;
best_pqG = pqG;
best_pG = pG;
} }
} }
} }
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0.0, 0) + P_G(N, 0); double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, epsilon, 0) + P_G(N, 0);
double LOD = best_posterior - posterior_null_hyp; double LOD = best_posterior - posterior_null_hyp;
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(num2nuc[refnum], AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
num2nuc[refnum],
num2nuc[altnum], num2nuc[altnum],
N, N,
best_qhat, best_qhat,
best_qstar, best_qstar,
LOD); LOD,
depth);
return alleleFreq; return alleleFreq;
} }
@ -182,14 +179,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
static double P_G(int N, int qstar_N) static double P_G(int N, int qstar_N)
{ {
if (N==2) // badly hard coded right now.
{ if (qstar_N == 0) { return Math.log10(0.999); }
return Math.log10(p_G_N_2[ qstar_N ]); else if (qstar_N == N) { return Math.log10(1e-5); }
} else { return Math.log10(1e-3); }
else
{
return Math.log10(1.0);
}
} }
static String genotypeTypeString(double q, int N){ static String genotypeTypeString(double q, int N){
@ -237,13 +230,23 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum) public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
{ {
if (alleleFreq.LOD >= 5)
{
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
return 0; return 0;
} }
static int nuc2num[]; static int nuc2num[];
static char num2nuc[]; static char num2nuc[];
static double p_G_N_2[]; // pop. gen. priors for N=2
public AlleleFrequencyWalker() { public AlleleFrequencyWalker() {
nuc2num = new int[128]; nuc2num = new int[128];
nuc2num['A'] = 0; nuc2num['A'] = 0;
@ -260,11 +263,6 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
num2nuc[1] = 'C'; num2nuc[1] = 'C';
num2nuc[2] = 'T'; num2nuc[2] = 'T';
num2nuc[3] = 'G'; num2nuc[3] = 'G';
p_G_N_2 = new double[3];
p_G_N_2[0] = 0.999;
p_G_N_2[1] = 1e-3;
p_G_N_2[2] = 1e-5;
} }
@ -309,7 +307,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}}; {0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}};
AlleleFrequencyWalker w = new AlleleFrequencyWalker(); AlleleFrequencyWalker w = new AlleleFrequencyWalker();
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(N, het_bases, het_quals, 0, 1); AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator("null", N, het_bases, het_quals, 0, 1, 20);
System.out.print(String.format("50/50 Het : %s %c %c %f %f %f %d %s\n", System.out.print(String.format("50/50 Het : %s %c %c %f %f %f %d %s\n",
"null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.LOD, 20, "null")); "null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.LOD, 20, "null"));