One line format is useable and two levels of debug output are available (debug = 1: one line format, debug = 2: table of sampled probs for each locus). Class AlleleFrequencyMetrics computes %dbSNP and frequency of SNPs.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@99 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f1034f3dfd
commit
5fa99f430e
|
|
@ -3,26 +3,27 @@ package org.broadinstitute.sting.gatk.walkers;
|
|||
//import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.utils.AlleleFrequencyMetrics;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
||||
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyWalker.AlleleFrequencyEstimate, Integer> {
|
||||
|
||||
|
||||
|
||||
|
||||
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
|
||||
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
|
||||
|
||||
// Convert context data into bases and 4-base quals
|
||||
|
||||
// Set number of chromosomes, N, to 2 for now
|
||||
int N = 2;
|
||||
boolean debug = false;
|
||||
int debug = 0;
|
||||
|
||||
// Convert bases to CharArray
|
||||
int numReads = context.getReads().size(); //numReads();
|
||||
byte[] bases = new byte[numReads];
|
||||
String base_string = "";
|
||||
double[][] quals = new double[numReads][4];
|
||||
int refnum = nuc2num[ref];
|
||||
|
||||
|
|
@ -33,6 +34,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
int offset = offsets.get(i);
|
||||
|
||||
bases[i] = read.getReadBases()[offset];
|
||||
base_string += read.getReadString().charAt(offset);
|
||||
int callednum = nuc2num[bases[i]];
|
||||
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
||||
|
||||
|
|
@ -60,21 +62,33 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
}
|
||||
assert(altnum > 0);
|
||||
|
||||
if (bases.length > 0) { //altcount > 2) {
|
||||
System.out.format("Pos: %s | ref: %c | alt: %c | %2dA | %2dC | %2dT | %2dG | %2d total | ", context.getLocation(), num2nuc[refnum],
|
||||
num2nuc[altnum], base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length);
|
||||
if (debug >= 1) System.out.format("Pos: %s ref: %c alt: %c ", context.getLocation(), num2nuc[refnum], num2nuc[altnum]);
|
||||
//System.out.format("%2dA %2dC %2dT %2dG %2d total", base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length);
|
||||
|
||||
if (debug) print_base_qual_matrix(quals, numReads);
|
||||
if (debug >= 2) print_base_qual_matrix(quals, numReads);
|
||||
|
||||
// Check if we really want to do this one
|
||||
AlleleFrequencyEstimator(N, bases, quals, refnum, altnum, debug);
|
||||
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(N, bases, quals, refnum, altnum, debug);
|
||||
|
||||
// Print dbSNP data if its there
|
||||
if (debug >= 1) {
|
||||
for ( ReferenceOrderedDatum datum : rodData ) {
|
||||
if ( datum != null && datum instanceof rodDbSNP) {
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
//System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD()));
|
||||
System.out.printf("ROD: %s ",dbsnp.toMediumString());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
if (debug) System.out.println();
|
||||
if (debug >= 1) System.out.format(" %s\n", base_string);
|
||||
if (debug >= 2) System.out.println();
|
||||
|
||||
return 1;
|
||||
alleleMetrics.calculateMetrics(rodData, alleleFreq);
|
||||
|
||||
return alleleFreq;
|
||||
}
|
||||
|
||||
static void AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum, boolean debug) {
|
||||
AlleleFrequencyEstimate AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum, int debug) {
|
||||
|
||||
// q = hypothetical %nonref
|
||||
// qstar = true %nonref
|
||||
|
|
@ -83,47 +97,138 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
// alt = alternate hypothesis base (most frequent nonref base)
|
||||
// ref = reference base
|
||||
|
||||
// b = number of bases at locus
|
||||
|
||||
|
||||
double epsilon = 0; // 1e-2;
|
||||
double qstar;
|
||||
if (debug) {
|
||||
System.out.format("%4s | ", "q ");
|
||||
int qstar_N;
|
||||
if (debug >= 2) {
|
||||
System.out.format("%4s ", "q ");
|
||||
for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N)
|
||||
System.out.format("%5s%12s %12s %12s | ", "qstar", "p(D|q) ", "p(q|G) ", "posterior ");
|
||||
if (debug) System.out.println();
|
||||
System.out.format("%5s%10s %10s %10s ", "qstar", "p(D|q) ", "p(q|G) ", "posterior ");
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
double highest_posterior = -1.0;
|
||||
double highest_qstar = -1.0;
|
||||
double highest_q = -1.0;
|
||||
MixtureLikelihood[] bestMixtures = { new MixtureLikelihood(-1,-1,-1), new MixtureLikelihood(-1,-1,-1), new MixtureLikelihood(-1,-1,-1) };
|
||||
/*double[] highest_posterior = new double[] {-1.0, -1.0, -1.0};
|
||||
double[] highest_qstar = new double[] {-1.0, -1.0, -1.0};
|
||||
double[] highest_q = new double[] {-1.0, -1.0, -1.0};*/
|
||||
|
||||
double qstep = 0.01;
|
||||
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
|
||||
for (double q=0.0; q <= qend; q += qstep) {
|
||||
if (debug) System.out.format("%.2f | ", q);
|
||||
for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) {
|
||||
double pDq = P_D_q(bases, quals, q, refnum, altnum);
|
||||
double pqG = P_q_G(bases, N, q, qstar);
|
||||
double pG = P_G(N, qstar, altnum);
|
||||
double posterior = pDq * pqG * pG;
|
||||
if (debug) System.out.format("%.2f %.10f %.10f %.10f | ", qstar, pDq, pqG, posterior);
|
||||
if (posterior > highest_posterior) {
|
||||
highest_posterior = posterior;
|
||||
highest_qstar = qstar;
|
||||
highest_q = q;
|
||||
}
|
||||
}
|
||||
if (debug) System.out.println();
|
||||
}
|
||||
/*System.out.printf("Maximal likelihood posterior: %.6f\n", highest_posterior);
|
||||
System.out.printf("q that maximimizes posterior: %.6f\n", highest_q);
|
||||
System.out.printf("qstar used when calculating max q: %.6f\n", highest_qstar);*/
|
||||
System.out.printf("qhat %.2f | qstar %.2f | ", highest_q, highest_qstar);
|
||||
for (double q=0.0; q <= qend; q += qstep) { // hypothetic allele balance that we sample over
|
||||
if (debug >= 2) System.out.format("%.2f ", q);
|
||||
|
||||
// Print out the called bases
|
||||
long numNonrefBases = Math.round(highest_q * N);
|
||||
long numRefBases = N - numNonrefBases;
|
||||
String genotype = repeat(num2nuc[refnum], numRefBases) + repeat(num2nuc[altnum], numNonrefBases);
|
||||
System.out.printf("gen: %s\n", genotype);
|
||||
long q_R = Math.round(q*bases.length);
|
||||
for (qstar = epsilon, qstar_N = 0; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) { // qstar - true allele balances
|
||||
// for N=2: these are 0.0 + epsilon, 0.5, 1.0 - epsilon corresponding to reference, het-SNP, homo-SNP
|
||||
//int qstar_N = Math.round((float)qstar*N);
|
||||
double pDq = P_D_q(bases, quals, q, refnum, altnum);
|
||||
double pqG = P_q_G(bases, N, q, qstar, q_R);
|
||||
double pG = P_G(N, qstar_N); //= P_G(N, qstar);
|
||||
double posterior = pDq * pqG * pG;
|
||||
if (debug >= 2) System.out.format("%.2f %10.7f %10.7f %10.7f ", qstar, Math.log10(pDq), Math.log10(pqG), Math.log10(posterior));
|
||||
if (posterior > bestMixtures[qstar_N].posterior)
|
||||
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
|
||||
}
|
||||
if (debug >= 2) System.out.println();
|
||||
}
|
||||
|
||||
// Calculate log-odds difference - must happen before mixtures are sorted by posterior prob.
|
||||
double logOddsVarRef = logOddsNonrefRef(bestMixtures);
|
||||
if (debug >= 1) System.out.printf("LOD(Var,Ref): %6.2f ", logOddsVarRef);
|
||||
|
||||
// Print all mixture likelihoods in order of qstar
|
||||
for (MixtureLikelihood mix : bestMixtures) // outputs: "het 3.26 "
|
||||
if (debug >= 1) System.out.printf("%3s: %6.2f ", genotypeTypeString(mix.qstar, N), Math.log10(mix.posterior / (1-mix.posterior)));
|
||||
|
||||
// Sort by posterior probability
|
||||
Arrays.sort(bestMixtures); // We only used best entry, so this could be a generic max function (optimization)
|
||||
if (debug >= 1) System.out.printf("qhat: %.2f qstar: %.2f ", bestMixtures[0].qhat, bestMixtures[0].qstar); //highest_q, highest_qstar);
|
||||
if (debug >= 1) System.out.printf("prob: %.2e ", bestMixtures[0].posterior);
|
||||
if (debug >= 1) System.out.printf("type: %3s ", genotypeTypeString(bestMixtures[0].qstar, N));
|
||||
|
||||
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) * P_q_G(bases, N, 0.0, 0.0, 0) * P_G(N, 0);
|
||||
if (debug >= 1) System.out.printf("P(null): %.2e ", posterior_null_hyp);
|
||||
//double LOD = Math.log10(bestMixtures[0].posterior) - Math.log10(posterior_null_hyp);
|
||||
double LOD = logOdds(bestMixtures[0].posterior, posterior_null_hyp);
|
||||
if (debug >= 1) System.out.printf("LOD: %6.2f ", LOD);
|
||||
|
||||
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(num2nuc[refnum], num2nuc[refnum],
|
||||
N, bestMixtures[0].qhat, bestMixtures[0].qstar, logOddsVarRef);
|
||||
if (debug >= 1) System.out.printf("gen: %s", genotypeTypeString(bestMixtures[0].qstar, N));
|
||||
|
||||
return alleleFreq;
|
||||
}
|
||||
|
||||
public class MixtureLikelihood implements Comparable<MixtureLikelihood> {
|
||||
public double posterior;
|
||||
public double qstar;
|
||||
public double qhat;
|
||||
|
||||
MixtureLikelihood(double posterior, double qstar, double qhat) {
|
||||
this.posterior = posterior;
|
||||
this.qstar = qstar;
|
||||
this.qhat = qhat;
|
||||
}
|
||||
|
||||
public int compareTo (MixtureLikelihood other) {
|
||||
// default sorting is REVERSE sorting on posterior prob; gives array with top post. prob. as first element
|
||||
if (posterior < other.posterior) return 1;
|
||||
else if (posterior > other.posterior) return -1;
|
||||
else return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public class AlleleFrequencyEstimate {
|
||||
//AlleleFrequencyEstimate();
|
||||
char ref;
|
||||
char alt;
|
||||
int N;
|
||||
double qhat;
|
||||
double qstar;
|
||||
double logOddsVarRef;
|
||||
|
||||
public double getQstar() {return qstar;}
|
||||
public double getLogOddsVarRef() {return logOddsVarRef;}
|
||||
|
||||
|
||||
public AlleleFrequencyEstimate (char ref, char alt, int N, double qhat, double qstar, double logOddsVarRef) {
|
||||
this.ref = ref;
|
||||
this.alt = alt;
|
||||
this.N = N;
|
||||
this.qhat = qhat;
|
||||
this.qstar = qstar;
|
||||
this.logOddsVarRef = logOddsVarRef;
|
||||
}
|
||||
|
||||
public String asString () {
|
||||
// Print out the called bases
|
||||
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
|
||||
// base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according
|
||||
// to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want
|
||||
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
|
||||
long numNonrefBases = Math.round(qstar * N);
|
||||
long numRefBases = N - numNonrefBases;
|
||||
return repeat(ref, numRefBases) + repeat(alt, numNonrefBases);
|
||||
}
|
||||
}
|
||||
|
||||
static double logOdds(double prob1, double prob2) {
|
||||
return Math.log10(prob1 / (1-prob1)) - Math.log10(prob2 / (1-prob2));
|
||||
}
|
||||
|
||||
static double logOddsNonrefRef(MixtureLikelihood[] mixtures) {
|
||||
// takes list of mixtureLikelihoods SORTED BY QSTAR (default ordering)
|
||||
// Works for N == 2 right now!
|
||||
assert (mixtures.length == 2);
|
||||
MixtureLikelihood bestNonref = ((mixtures[1].posterior > mixtures[2].posterior) ? mixtures[1] : mixtures[2]);
|
||||
return logOdds(bestNonref.posterior, mixtures[0].posterior);
|
||||
|
||||
// Code to calculate LOD using qstar=0 qhat=0
|
||||
//double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) * P_q_G(bases, N, 0.0, 0.0) * P_G(N, 0.0);
|
||||
//double LOD = Math.log10(highest_posterior) - Math.log10(posterior_null_hyp);
|
||||
}
|
||||
|
||||
static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum) {
|
||||
|
|
@ -136,18 +241,30 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
return p;
|
||||
}
|
||||
|
||||
static double P_q_G(byte [] bases, int N, double q, double qstar) {
|
||||
return binomialProb(Math.round(q*bases.length), bases.length, qstar);
|
||||
static double P_q_G(byte [] bases, int N, double q, double qstar, long q_R) {
|
||||
return binomialProb(q_R, bases.length, qstar);
|
||||
}
|
||||
|
||||
static double P_G(int N, double qstar, int altnum) {
|
||||
static double P_G(int N, int qstar_N) {
|
||||
if (N==2) {
|
||||
return p_G_N_2[ Math.round((float)(qstar * N)) ];
|
||||
return p_G_N_2[ qstar_N ];
|
||||
}else{
|
||||
return 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
static String genotypeTypeString(double q, int N){
|
||||
switch (Math.round((float)q*N)) {
|
||||
case (0):
|
||||
return "ref";
|
||||
case (1):
|
||||
return "het";
|
||||
case (2):
|
||||
return "hom";
|
||||
}
|
||||
return "ERROR";
|
||||
}
|
||||
|
||||
static double binomialProb(long k, long n, double p) {
|
||||
// k - numebr of successes
|
||||
// n - number of Bernoulli trials
|
||||
|
|
@ -168,6 +285,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public static String repeat(char letter, long count) {
|
||||
// Repeat a character count times
|
||||
String result = "";
|
||||
for (int i=0; i<count; i++) {
|
||||
result = result + letter;
|
||||
|
|
@ -178,8 +296,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum) {
|
||||
|
||||
//System.out.printf("%s %.2f\n", alleleFreq.asString(), alleleFreq.logOddsVarRef);
|
||||
return 0;//value + sum;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -207,8 +327,17 @@ public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
|
|||
p_G_N_2[0] = 0.999;
|
||||
p_G_N_2[1] = 1e-3;
|
||||
p_G_N_2[2] = 1e-5;
|
||||
|
||||
alleleMetrics = new AlleleFrequencyMetrics();
|
||||
}
|
||||
|
||||
AlleleFrequencyMetrics alleleMetrics;
|
||||
|
||||
public void onTraversalDone() {
|
||||
alleleMetrics.printMetrics();
|
||||
}
|
||||
|
||||
|
||||
void print_base_qual_matrix(double [][]quals, int numReads) {
|
||||
// Print quals for debugging
|
||||
System.out.println("Base quality matrix");
|
||||
|
|
|
|||
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.AlleleFrequencyWalker;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: andrewk
|
||||
* Date: Mar 18, 2009
|
||||
* Time: 5:28:58 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
|
||||
public class AlleleFrequencyMetrics {
|
||||
|
||||
long dbsnp_tp=0;
|
||||
long dbsnp_fp=0;
|
||||
long num_snps=0;
|
||||
long num_loci=0;
|
||||
|
||||
public void calculateMetrics(List<ReferenceOrderedDatum> rodData, AlleleFrequencyWalker.AlleleFrequencyEstimate alleleFreq) {
|
||||
|
||||
boolean is_dbSNP_SNP = false;
|
||||
|
||||
for ( ReferenceOrderedDatum datum : rodData ) {
|
||||
if ( datum != null && datum instanceof rodDbSNP) {
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLogOddsVarRef() >= 5.0) { // we confidently called it a SNP!
|
||||
if (is_dbSNP_SNP) {
|
||||
dbsnp_tp += 1;
|
||||
}else{
|
||||
dbsnp_fp += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (alleleFreq.getQstar() > 0.0) num_snps++;
|
||||
num_loci++;
|
||||
|
||||
}
|
||||
|
||||
public void printMetrics() {
|
||||
System.out.println("\nAllele Frequency Metrics:\n");
|
||||
System.out.printf("Precision of LOD >= 5 SNPs w.r.t dbSNP: %.2f\n", (float)dbsnp_tp / (dbsnp_fp + dbsnp_tp) * 100);
|
||||
System.out.printf("\\--TP: %d\n", dbsnp_tp);
|
||||
System.out.printf("\\--FP: %d\n", dbsnp_fp);
|
||||
System.out.println();
|
||||
System.out.printf("SNPs (LOD > 0): %d\n", num_snps);
|
||||
System.out.printf("Total loci: %d\n", num_loci);
|
||||
System.out.printf("SNPs / loci: 1/%.0f\n", (float)num_loci/num_snps);
|
||||
System.out.println();
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue