Now works on single samples and computes metrics.

Here is an example metrics output from a very tiny region:

	Allele Frequency Metrics (LOD >= 5)
	-------------------------------------------------
	Total loci                         : 14704
	Total called with confidence       : 10920 (74.27%)
	Number of Variants                 : 16 (0.15%) (1/682)
    Fraction of variant sites in dbSNP : 100.00%

Missing:
    Microarray(hapmap) concordance, tp/fp.

Optional:
    Histograms of depth of coverage, LOD, observed allele frequency, etc.



Still to implement:
    Propagate command line argument N (number of chromosomes) into walker to enable pooled calling.
    Take allele frequency priors as input.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@133 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-03-22 15:45:12 +00:00
parent f7ad17016d
commit 4e0cd6ab84
2 changed files with 100 additions and 78 deletions

View File

@ -17,72 +17,96 @@ import java.util.List;
* To change this template use File | Settings | File Templates.
*/
public class AlleleFrequencyMetricsWalker extends BasicLociWalker<Integer, Integer> {
public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequencyEstimate, String>
{
long dbsnp_tp=0;
long dbsnp_fp=0;
long num_snps=0;
long num_loci=0;
long dbsnp_hits=0;
long num_variants=0;
long num_loci_total=0;
long num_loci_confident=0;
double LOD_cutoff = 5;
//public void calculateMetrics(List<ReferenceOrderedDatum> rodData, AlleleFrequencyWalker.AlleleFrequencyEstimate alleleFreq) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//AlleleFrequencyWalker = AlleleFrequencyWalker();
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyWalker().map(rodData, ref, context);
AlleleFrequencyWalker caller;
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context)
{
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context);
boolean is_dbSNP_SNP = false;
for ( ReferenceOrderedDatum datum : rodData ) {
if ( datum != null && datum instanceof rodDbSNP) {
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null && datum instanceof rodDbSNP)
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
}
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff) { // we confidently called it a SNP!
if (is_dbSNP_SNP) {
dbsnp_tp += 1;
}else{
dbsnp_fp += 1;
num_loci_total += 1;
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff)
{
// Confident variant.
num_variants += 1;
if (is_dbSNP_SNP)
{
dbsnp_hits += 1;
}
}
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff) {
//System.out.println(alleleFreq.getLogOddsVarRef());
num_snps++;
return alleleFreq;
}
public void printMetrics()
{
if (num_loci_total == 0) { return; }
System.out.printf("\n");
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
if (num_variants != 0)
{
System.out.printf("METRICS Number of Variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
}
num_loci++;
return 1;
System.out.println();
}
public void printMetrics() {
System.out.println("\nAllele Frequency Metrics:\n");
System.out.printf("Precision of LOD >= %.0f SNPs w.r.t dbSNP: %.2f\n", LOD_cutoff, (float)dbsnp_tp / (dbsnp_fp + dbsnp_tp) * 100);
System.out.printf("\\--TP: %d\n", dbsnp_tp);
System.out.printf("\\--FP: %d\n", dbsnp_fp);
System.out.println();
System.out.printf("SNPs (LOD > %.0f): %d\n", LOD_cutoff, num_snps);
System.out.printf("Total loci: %d\n", num_loci);
System.out.printf("SNPs / loci: 1/%.0f\n", (float)num_loci/num_snps);
System.out.println();
}
public void onTraversalDone() {
public void onTraversalDone()
{
printMetrics();
}
public String reduceInit()
{
caller = new AlleleFrequencyWalker();
return "";
}
public Integer reduceInit() { return 0; }
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5))
{
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
public Integer reduce(Integer alleleFreq, Integer sum) {
if (this.num_loci_total % 10000 == 0) { printMetrics(); }
//System.out.printf("%s %.2f\n", alleleFreq.asString(), alleleFreq.logOddsVarRef);
return 0;//value + sum;
return "null";
}

View File

@ -48,7 +48,6 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
int[] base_counts = new int[4];
for (byte b : bases)
base_counts[nuc2num[b]]++;
// Find alternate allele - 2nd most frequent non-ref allele
// (maybe we should check for ties and eval both or check most common including quality scores)
@ -62,7 +61,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
}
assert(altnum != -1);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(N, bases, quals, refnum, altnum);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length());
// Print dbSNP data if its there
if (false) {
@ -74,20 +73,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
}
}
}
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
context.getLocation(),
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
base_string.length()));
return alleleFreq;
}
public AlleleFrequencyEstimate AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum)
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
{
// q = hypothetical %nonref
@ -99,7 +88,6 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
// b = number of bases at locus
double epsilon = 0; // 1e-2;
double qstar;
int qstar_N;
@ -111,36 +99,45 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
double best_qhat = Math.log10(0);
double best_posterior = Math.log10(0);
double best_pDq = Math.log10(0);
double best_pqG = Math.log10(0);
double best_pG = Math.log10(0);
for (double q=0.0; q <= qend; q += qstep) // hypothetic allele balance that we sample over
{
long q_R = Math.round(q*bases.length);
for (qstar = epsilon, qstar_N = 0; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) // qstar - true allele balances
for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) // qstar - true allele balances
{
// for N=2: these are 0.0 + epsilon, 0.5, 1.0 - epsilon corresponding to reference, het-SNP, homo-SNP
double pDq = P_D_q(bases, quals, q, refnum, altnum);
double pqG = P_q_G(bases, N, q, qstar, q_R);
double pG = P_G(N, qstar_N); //= P_G(N, qstar);
double posterior = pDq + pqG; // + pG;
double pG = P_G(N, qstar_N);
double posterior = pDq + pqG + pG;
if (posterior > best_posterior)
{
best_qstar = qstar;
best_qhat = q;
best_posterior = posterior;
best_pDq = pDq;
best_pqG = pqG;
best_pG = pG;
}
}
}
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0.0, 0) + P_G(N, 0);
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, epsilon, 0) + P_G(N, 0);
double LOD = best_posterior - posterior_null_hyp;
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(num2nuc[refnum],
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
num2nuc[refnum],
num2nuc[altnum],
N,
best_qhat,
best_qstar,
LOD);
LOD,
depth);
return alleleFreq;
}
@ -182,14 +179,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
static double P_G(int N, int qstar_N)
{
if (N==2)
{
return Math.log10(p_G_N_2[ qstar_N ]);
}
else
{
return Math.log10(1.0);
}
// badly hard coded right now.
if (qstar_N == 0) { return Math.log10(0.999); }
else if (qstar_N == N) { return Math.log10(1e-5); }
else { return Math.log10(1e-3); }
}
static String genotypeTypeString(double q, int N){
@ -237,13 +230,23 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
{
if (alleleFreq.LOD >= 5)
{
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
alleleFreq.location,
alleleFreq.ref,
alleleFreq.alt,
alleleFreq.qhat,
alleleFreq.qstar,
alleleFreq.LOD,
alleleFreq.depth));
}
return 0;
}
static int nuc2num[];
static char num2nuc[];
static double p_G_N_2[]; // pop. gen. priors for N=2
public AlleleFrequencyWalker() {
nuc2num = new int[128];
nuc2num['A'] = 0;
@ -260,11 +263,6 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
num2nuc[1] = 'C';
num2nuc[2] = 'T';
num2nuc[3] = 'G';
p_G_N_2 = new double[3];
p_G_N_2[0] = 0.999;
p_G_N_2[1] = 1e-3;
p_G_N_2[2] = 1e-5;
}
@ -309,7 +307,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
{0.001/3.0, 0.999, 0.001/3.0, 0.001/3.0}};
AlleleFrequencyWalker w = new AlleleFrequencyWalker();
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator(N, het_bases, het_quals, 0, 1);
AlleleFrequencyEstimate estimate = w.AlleleFrequencyEstimator("null", N, het_bases, het_quals, 0, 1, 20);
System.out.print(String.format("50/50 Het : %s %c %c %f %f %f %d %s\n",
"null", estimate.ref, estimate.alt, estimate.qhat, estimate.qstar, estimate.LOD, 20, "null"));