One line format is useable and two levels of debug output are available (debug = 1: one line format, debug = 2: table of sampled probs for each locus). Class AlleleFrequencyMetrics computes %dbSNP and frequency of SNPs.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@99 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-03-19 15:05:05 +00:00
parent f1034f3dfd
commit 5fa99f430e
2 changed files with 247 additions and 53 deletions

View File

@ -3,26 +3,27 @@ package org.broadinstitute.sting.gatk.walkers;
//import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.utils.AlleleFrequencyMetrics;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.Arrays;
public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyWalker.AlleleFrequencyEstimate, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
// Convert context data into bases and 4-base quals
// Set number of chromosomes, N, to 2 for now
int N = 2;
boolean debug = false;
int debug = 0;
// Convert bases to CharArray
int numReads = context.getReads().size(); //numReads();
byte[] bases = new byte[numReads];
String base_string = "";
double[][] quals = new double[numReads][4];
int refnum = nuc2num[ref];
@ -33,6 +34,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
int offset = offsets.get(i);
bases[i] = read.getReadBases()[offset];
base_string += read.getReadString().charAt(offset);
int callednum = nuc2num[bases[i]];
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
@ -60,21 +62,33 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
}
assert(altnum > 0);
if (bases.length > 0) { //altcount > 2) {
System.out.format("Pos: %s | ref: %c | alt: %c | %2dA | %2dC | %2dT | %2dG | %2d total | ", context.getLocation(), num2nuc[refnum],
num2nuc[altnum], base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length);
if (debug >= 1) System.out.format("Pos: %s ref: %c alt: %c ", context.getLocation(), num2nuc[refnum], num2nuc[altnum]);
//System.out.format("%2dA %2dC %2dT %2dG %2d total", base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length);
if (debug) print_base_qual_matrix(quals, numReads);
if (debug >= 2) print_base_qual_matrix(quals, numReads);
// Check if we really want to do this one
AlleleFrequencyEstimator(N, bases, quals, refnum, altnum, debug);
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(N, bases, quals, refnum, altnum, debug);
// Print dbSNP data if its there
if (debug >= 1) {
for ( ReferenceOrderedDatum datum : rodData ) {
if ( datum != null && datum instanceof rodDbSNP) {
rodDbSNP dbsnp = (rodDbSNP)datum;
//System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD()));
System.out.printf("ROD: %s ",dbsnp.toMediumString());
}
}
}
if (debug) System.out.println();
if (debug >= 1) System.out.format(" %s\n", base_string);
if (debug >= 2) System.out.println();
return 1;
alleleMetrics.calculateMetrics(rodData, alleleFreq);
return alleleFreq;
}
static void AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum, boolean debug) {
AlleleFrequencyEstimate AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum, int debug) {
// q = hypothetical %nonref
// qstar = true %nonref
@ -83,47 +97,138 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
// alt = alternate hypothesis base (most frequent nonref base)
// ref = reference base
// b = number of bases at locus
double epsilon = 0; // 1e-2;
double qstar;
if (debug) {
System.out.format("%4s | ", "q ");
int qstar_N;
if (debug >= 2) {
System.out.format("%4s ", "q ");
for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N)
System.out.format("%5s%12s %12s %12s | ", "qstar", "p(D|q) ", "p(q|G) ", "posterior ");
if (debug) System.out.println();
System.out.format("%5s%10s %10s %10s ", "qstar", "p(D|q) ", "p(q|G) ", "posterior ");
System.out.println();
}
double highest_posterior = -1.0;
double highest_qstar = -1.0;
double highest_q = -1.0;
MixtureLikelihood[] bestMixtures = { new MixtureLikelihood(-1,-1,-1), new MixtureLikelihood(-1,-1,-1), new MixtureLikelihood(-1,-1,-1) };
/*double[] highest_posterior = new double[] {-1.0, -1.0, -1.0};
double[] highest_qstar = new double[] {-1.0, -1.0, -1.0};
double[] highest_q = new double[] {-1.0, -1.0, -1.0};*/
double qstep = 0.01;
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
for (double q=0.0; q <= qend; q += qstep) {
if (debug) System.out.format("%.2f | ", q);
for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) {
double pDq = P_D_q(bases, quals, q, refnum, altnum);
double pqG = P_q_G(bases, N, q, qstar);
double pG = P_G(N, qstar, altnum);
double posterior = pDq * pqG * pG;
if (debug) System.out.format("%.2f %.10f %.10f %.10f | ", qstar, pDq, pqG, posterior);
if (posterior > highest_posterior) {
highest_posterior = posterior;
highest_qstar = qstar;
highest_q = q;
}
}
if (debug) System.out.println();
}
/*System.out.printf("Maximal likelihood posterior: %.6f\n", highest_posterior);
System.out.printf("q that maximimizes posterior: %.6f\n", highest_q);
System.out.printf("qstar used when calculating max q: %.6f\n", highest_qstar);*/
System.out.printf("qhat %.2f | qstar %.2f | ", highest_q, highest_qstar);
for (double q=0.0; q <= qend; q += qstep) { // hypothetic allele balance that we sample over
if (debug >= 2) System.out.format("%.2f ", q);
// Print out the called bases
long numNonrefBases = Math.round(highest_q * N);
long numRefBases = N - numNonrefBases;
String genotype = repeat(num2nuc[refnum], numRefBases) + repeat(num2nuc[altnum], numNonrefBases);
System.out.printf("gen: %s\n", genotype);
long q_R = Math.round(q*bases.length);
for (qstar = epsilon, qstar_N = 0; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) { // qstar - true allele balances
// for N=2: these are 0.0 + epsilon, 0.5, 1.0 - epsilon corresponding to reference, het-SNP, homo-SNP
//int qstar_N = Math.round((float)qstar*N);
double pDq = P_D_q(bases, quals, q, refnum, altnum);
double pqG = P_q_G(bases, N, q, qstar, q_R);
double pG = P_G(N, qstar_N); //= P_G(N, qstar);
double posterior = pDq * pqG * pG;
if (debug >= 2) System.out.format("%.2f %10.7f %10.7f %10.7f ", qstar, Math.log10(pDq), Math.log10(pqG), Math.log10(posterior));
if (posterior > bestMixtures[qstar_N].posterior)
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
}
if (debug >= 2) System.out.println();
}
// Calculate log-odds difference - must happen before mixtures are sorted by posterior prob.
double logOddsVarRef = logOddsNonrefRef(bestMixtures);
if (debug >= 1) System.out.printf("LOD(Var,Ref): %6.2f ", logOddsVarRef);
// Print all mixture likelihoods in order of qstar
for (MixtureLikelihood mix : bestMixtures) // outputs: "het 3.26 "
if (debug >= 1) System.out.printf("%3s: %6.2f ", genotypeTypeString(mix.qstar, N), Math.log10(mix.posterior / (1-mix.posterior)));
// Sort by posterior probability
Arrays.sort(bestMixtures); // We only used best entry, so this could be a generic max function (optimization)
if (debug >= 1) System.out.printf("qhat: %.2f qstar: %.2f ", bestMixtures[0].qhat, bestMixtures[0].qstar); //highest_q, highest_qstar);
if (debug >= 1) System.out.printf("prob: %.2e ", bestMixtures[0].posterior);
if (debug >= 1) System.out.printf("type: %3s ", genotypeTypeString(bestMixtures[0].qstar, N));
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) * P_q_G(bases, N, 0.0, 0.0, 0) * P_G(N, 0);
if (debug >= 1) System.out.printf("P(null): %.2e ", posterior_null_hyp);
//double LOD = Math.log10(bestMixtures[0].posterior) - Math.log10(posterior_null_hyp);
double LOD = logOdds(bestMixtures[0].posterior, posterior_null_hyp);
if (debug >= 1) System.out.printf("LOD: %6.2f ", LOD);
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(num2nuc[refnum], num2nuc[refnum],
N, bestMixtures[0].qhat, bestMixtures[0].qstar, logOddsVarRef);
if (debug >= 1) System.out.printf("gen: %s", genotypeTypeString(bestMixtures[0].qstar, N));
return alleleFreq;
}
public class MixtureLikelihood implements Comparable<MixtureLikelihood> {
public double posterior;
public double qstar;
public double qhat;
MixtureLikelihood(double posterior, double qstar, double qhat) {
this.posterior = posterior;
this.qstar = qstar;
this.qhat = qhat;
}
public int compareTo (MixtureLikelihood other) {
// default sorting is REVERSE sorting on posterior prob; gives array with top post. prob. as first element
if (posterior < other.posterior) return 1;
else if (posterior > other.posterior) return -1;
else return 0;
}
}
public class AlleleFrequencyEstimate {
//AlleleFrequencyEstimate();
char ref;
char alt;
int N;
double qhat;
double qstar;
double logOddsVarRef;
public double getQstar() {return qstar;}
public double getLogOddsVarRef() {return logOddsVarRef;}
public AlleleFrequencyEstimate (char ref, char alt, int N, double qhat, double qstar, double logOddsVarRef) {
this.ref = ref;
this.alt = alt;
this.N = N;
this.qhat = qhat;
this.qstar = qstar;
this.logOddsVarRef = logOddsVarRef;
}
public String asString () {
// Print out the called bases
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
// base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according
// to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
long numNonrefBases = Math.round(qstar * N);
long numRefBases = N - numNonrefBases;
return repeat(ref, numRefBases) + repeat(alt, numNonrefBases);
}
}
static double logOdds(double prob1, double prob2) {
return Math.log10(prob1 / (1-prob1)) - Math.log10(prob2 / (1-prob2));
}
static double logOddsNonrefRef(MixtureLikelihood[] mixtures) {
// takes list of mixtureLikelihoods SORTED BY QSTAR (default ordering)
// Works for N == 2 right now!
assert (mixtures.length == 2);
MixtureLikelihood bestNonref = ((mixtures[1].posterior > mixtures[2].posterior) ? mixtures[1] : mixtures[2]);
return logOdds(bestNonref.posterior, mixtures[0].posterior);
// Code to calculate LOD using qstar=0 qhat=0
//double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) * P_q_G(bases, N, 0.0, 0.0) * P_G(N, 0.0);
//double LOD = Math.log10(highest_posterior) - Math.log10(posterior_null_hyp);
}
static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum) {
@ -136,18 +241,30 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
return p;
}
static double P_q_G(byte [] bases, int N, double q, double qstar) {
return binomialProb(Math.round(q*bases.length), bases.length, qstar);
static double P_q_G(byte [] bases, int N, double q, double qstar, long q_R) {
return binomialProb(q_R, bases.length, qstar);
}
static double P_G(int N, double qstar, int altnum) {
static double P_G(int N, int qstar_N) {
if (N==2) {
return p_G_N_2[ Math.round((float)(qstar * N)) ];
return p_G_N_2[ qstar_N ];
}else{
return 1.0;
}
}
static String genotypeTypeString(double q, int N){
switch (Math.round((float)q*N)) {
case (0):
return "ref";
case (1):
return "het";
case (2):
return "hom";
}
return "ERROR";
}
static double binomialProb(long k, long n, double p) {
// k - numebr of successes
// n - number of Bernoulli trials
@ -168,6 +285,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
}
public static String repeat(char letter, long count) {
// Repeat a character count times
String result = "";
for (int i=0; i<count; i++) {
result = result + letter;
@ -178,8 +296,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return value + sum;
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum) {
//System.out.printf("%s %.2f\n", alleleFreq.asString(), alleleFreq.logOddsVarRef);
return 0;//value + sum;
}
@ -207,8 +327,17 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
p_G_N_2[0] = 0.999;
p_G_N_2[1] = 1e-3;
p_G_N_2[2] = 1e-5;
alleleMetrics = new AlleleFrequencyMetrics();
}
AlleleFrequencyMetrics alleleMetrics;
public void onTraversalDone() {
alleleMetrics.printMetrics();
}
void print_base_qual_matrix(double [][]quals, int numReads) {
// Print quals for debugging
System.out.println("Base quality matrix");

View File

@ -0,0 +1,65 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.AlleleFrequencyWalker;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: andrewk
* Date: Mar 18, 2009
* Time: 5:28:58 PM
* To change this template use File | Settings | File Templates.
*/
public class AlleleFrequencyMetrics {
long dbsnp_tp=0;
long dbsnp_fp=0;
long num_snps=0;
long num_loci=0;
public void calculateMetrics(List<ReferenceOrderedDatum> rodData, AlleleFrequencyWalker.AlleleFrequencyEstimate alleleFreq) {
boolean is_dbSNP_SNP = false;
for ( ReferenceOrderedDatum datum : rodData ) {
if ( datum != null && datum instanceof rodDbSNP) {
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
}
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLogOddsVarRef() >= 5.0) { // we confidently called it a SNP!
if (is_dbSNP_SNP) {
dbsnp_tp += 1;
}else{
dbsnp_fp += 1;
}
}
if (alleleFreq.getQstar() > 0.0) num_snps++;
num_loci++;
}
public void printMetrics() {
System.out.println("\nAllele Frequency Metrics:\n");
System.out.printf("Precision of LOD >= 5 SNPs w.r.t dbSNP: %.2f\n", (float)dbsnp_tp / (dbsnp_fp + dbsnp_tp) * 100);
System.out.printf("\\--TP: %d\n", dbsnp_tp);
System.out.printf("\\--FP: %d\n", dbsnp_fp);
System.out.println();
System.out.printf("SNPs (LOD > 0): %d\n", num_snps);
System.out.printf("Total loci: %d\n", num_loci);
System.out.printf("SNPs / loci: 1/%.0f\n", (float)num_loci/num_snps);
System.out.println();
}
}