diff --git a/playground/java/src/org/broadinstitute/sting/gatk/walkers/AlleleFrequencyWalker.java b/playground/java/src/org/broadinstitute/sting/gatk/walkers/AlleleFrequencyWalker.java index a32deb856..6767adf47 100755 --- a/playground/java/src/org/broadinstitute/sting/gatk/walkers/AlleleFrequencyWalker.java +++ b/playground/java/src/org/broadinstitute/sting/gatk/walkers/AlleleFrequencyWalker.java @@ -3,26 +3,27 @@ package org.broadinstitute.sting.gatk.walkers; //import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.utils.AlleleFrequencyMetrics; import net.sf.samtools.SAMRecord; import java.util.List; +import java.util.Arrays; -public class AlleleFrequencyWalker extends BasicLociWalker { +public class AlleleFrequencyWalker extends BasicLociWalker { - - - - public Integer map(List rodData, char ref, LocusContext context) { + public AlleleFrequencyEstimate map(List rodData, char ref, LocusContext context) { // Convert context data into bases and 4-base quals // Set number of chromosomes, N, to 2 for now int N = 2; - boolean debug = false; + int debug = 0; // Convert bases to CharArray int numReads = context.getReads().size(); //numReads(); byte[] bases = new byte[numReads]; + String base_string = ""; double[][] quals = new double[numReads][4]; int refnum = nuc2num[ref]; @@ -33,6 +34,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker { int offset = offsets.get(i); bases[i] = read.getReadBases()[offset]; + base_string += read.getReadString().charAt(offset); int callednum = nuc2num[bases[i]]; quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); @@ -60,21 +62,33 @@ public class AlleleFrequencyWalker extends BasicLociWalker { } assert(altnum > 0); - if (bases.length > 0) { //altcount > 2) { - System.out.format("Pos: %s | ref: %c | alt: %c | %2dA | %2dC | %2dT | %2dG | %2d total | ", context.getLocation(), num2nuc[refnum], - num2nuc[altnum], base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length); + if (debug >= 1) System.out.format("Pos: %s ref: %c alt: %c ", context.getLocation(), num2nuc[refnum], num2nuc[altnum]); + //System.out.format("%2dA %2dC %2dT %2dG %2d total", base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length); - if (debug) print_base_qual_matrix(quals, numReads); + if (debug >= 2) print_base_qual_matrix(quals, numReads); - // Check if we really want to do this one - AlleleFrequencyEstimator(N, bases, quals, refnum, altnum, debug); + AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(N, bases, quals, refnum, altnum, debug); + + // Print dbSNP data if its there + if (debug >= 1) { + for ( ReferenceOrderedDatum datum : rodData ) { + if ( datum != null && datum instanceof rodDbSNP) { + rodDbSNP dbsnp = (rodDbSNP)datum; + //System.out.printf(" DBSNP %s on %s => %s%n", dbsnp.toSimpleString(), dbsnp.strand, Utils.join("/", dbsnp.getAllelesFWD())); + System.out.printf("ROD: %s ",dbsnp.toMediumString()); + } + + } } - if (debug) System.out.println(); + if (debug >= 1) System.out.format(" %s\n", base_string); + if (debug >= 2) System.out.println(); - return 1; + alleleMetrics.calculateMetrics(rodData, alleleFreq); + + return alleleFreq; } - static void AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum, boolean debug) { + AlleleFrequencyEstimate AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum, int debug) { // q = hypothetical %nonref // qstar = true %nonref @@ -83,47 +97,138 @@ public class AlleleFrequencyWalker extends BasicLociWalker { // alt = alternate hypothesis base (most frequent nonref base) // ref = reference base + // b = number of bases at locus + + double epsilon = 0; // 1e-2; double qstar; - if (debug) { - System.out.format("%4s | ", "q "); + int qstar_N; + if (debug >= 2) { + System.out.format("%4s ", "q "); for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) - System.out.format("%5s%12s %12s %12s | ", "qstar", "p(D|q) ", "p(q|G) ", "posterior "); - if (debug) System.out.println(); + System.out.format("%5s%10s %10s %10s ", "qstar", "p(D|q) ", "p(q|G) ", "posterior "); + System.out.println(); } - double highest_posterior = -1.0; - double highest_qstar = -1.0; - double highest_q = -1.0; + MixtureLikelihood[] bestMixtures = { new MixtureLikelihood(-1,-1,-1), new MixtureLikelihood(-1,-1,-1), new MixtureLikelihood(-1,-1,-1) }; + /*double[] highest_posterior = new double[] {-1.0, -1.0, -1.0}; + double[] highest_qstar = new double[] {-1.0, -1.0, -1.0}; + double[] highest_q = new double[] {-1.0, -1.0, -1.0};*/ double qstep = 0.01; double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating - for (double q=0.0; q <= qend; q += qstep) { - if (debug) System.out.format("%.2f | ", q); - for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) { - double pDq = P_D_q(bases, quals, q, refnum, altnum); - double pqG = P_q_G(bases, N, q, qstar); - double pG = P_G(N, qstar, altnum); - double posterior = pDq * pqG * pG; - if (debug) System.out.format("%.2f %.10f %.10f %.10f | ", qstar, pDq, pqG, posterior); - if (posterior > highest_posterior) { - highest_posterior = posterior; - highest_qstar = qstar; - highest_q = q; - } - } - if (debug) System.out.println(); - } - /*System.out.printf("Maximal likelihood posterior: %.6f\n", highest_posterior); - System.out.printf("q that maximimizes posterior: %.6f\n", highest_q); - System.out.printf("qstar used when calculating max q: %.6f\n", highest_qstar);*/ - System.out.printf("qhat %.2f | qstar %.2f | ", highest_q, highest_qstar); + for (double q=0.0; q <= qend; q += qstep) { // hypothetic allele balance that we sample over + if (debug >= 2) System.out.format("%.2f ", q); - // Print out the called bases - long numNonrefBases = Math.round(highest_q * N); - long numRefBases = N - numNonrefBases; - String genotype = repeat(num2nuc[refnum], numRefBases) + repeat(num2nuc[altnum], numNonrefBases); - System.out.printf("gen: %s\n", genotype); + long q_R = Math.round(q*bases.length); + for (qstar = epsilon, qstar_N = 0; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++) { // qstar - true allele balances + // for N=2: these are 0.0 + epsilon, 0.5, 1.0 - epsilon corresponding to reference, het-SNP, homo-SNP + //int qstar_N = Math.round((float)qstar*N); + double pDq = P_D_q(bases, quals, q, refnum, altnum); + double pqG = P_q_G(bases, N, q, qstar, q_R); + double pG = P_G(N, qstar_N); //= P_G(N, qstar); + double posterior = pDq * pqG * pG; + if (debug >= 2) System.out.format("%.2f %10.7f %10.7f %10.7f ", qstar, Math.log10(pDq), Math.log10(pqG), Math.log10(posterior)); + if (posterior > bestMixtures[qstar_N].posterior) + bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q); + } + if (debug >= 2) System.out.println(); + } + + // Calculate log-odds difference - must happen before mixtures are sorted by posterior prob. + double logOddsVarRef = logOddsNonrefRef(bestMixtures); + if (debug >= 1) System.out.printf("LOD(Var,Ref): %6.2f ", logOddsVarRef); + + // Print all mixture likelihoods in order of qstar + for (MixtureLikelihood mix : bestMixtures) // outputs: "het 3.26 " + if (debug >= 1) System.out.printf("%3s: %6.2f ", genotypeTypeString(mix.qstar, N), Math.log10(mix.posterior / (1-mix.posterior))); + + // Sort by posterior probability + Arrays.sort(bestMixtures); // We only used best entry, so this could be a generic max function (optimization) + if (debug >= 1) System.out.printf("qhat: %.2f qstar: %.2f ", bestMixtures[0].qhat, bestMixtures[0].qstar); //highest_q, highest_qstar); + if (debug >= 1) System.out.printf("prob: %.2e ", bestMixtures[0].posterior); + if (debug >= 1) System.out.printf("type: %3s ", genotypeTypeString(bestMixtures[0].qstar, N)); + + double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) * P_q_G(bases, N, 0.0, 0.0, 0) * P_G(N, 0); + if (debug >= 1) System.out.printf("P(null): %.2e ", posterior_null_hyp); + //double LOD = Math.log10(bestMixtures[0].posterior) - Math.log10(posterior_null_hyp); + double LOD = logOdds(bestMixtures[0].posterior, posterior_null_hyp); + if (debug >= 1) System.out.printf("LOD: %6.2f ", LOD); + + AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(num2nuc[refnum], num2nuc[refnum], + N, bestMixtures[0].qhat, bestMixtures[0].qstar, logOddsVarRef); + if (debug >= 1) System.out.printf("gen: %s", genotypeTypeString(bestMixtures[0].qstar, N)); + + return alleleFreq; + } + + public class MixtureLikelihood implements Comparable { + public double posterior; + public double qstar; + public double qhat; + + MixtureLikelihood(double posterior, double qstar, double qhat) { + this.posterior = posterior; + this.qstar = qstar; + this.qhat = qhat; + } + + public int compareTo (MixtureLikelihood other) { + // default sorting is REVERSE sorting on posterior prob; gives array with top post. prob. as first element + if (posterior < other.posterior) return 1; + else if (posterior > other.posterior) return -1; + else return 0; + } + } + + public class AlleleFrequencyEstimate { + //AlleleFrequencyEstimate(); + char ref; + char alt; + int N; + double qhat; + double qstar; + double logOddsVarRef; + + public double getQstar() {return qstar;} + public double getLogOddsVarRef() {return logOddsVarRef;} + + + public AlleleFrequencyEstimate (char ref, char alt, int N, double qhat, double qstar, double logOddsVarRef) { + this.ref = ref; + this.alt = alt; + this.N = N; + this.qhat = qhat; + this.qstar = qstar; + this.logOddsVarRef = logOddsVarRef; + } + + public String asString () { + // Print out the called bases + // Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref + // base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according + // to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want + // to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1 + long numNonrefBases = Math.round(qstar * N); + long numRefBases = N - numNonrefBases; + return repeat(ref, numRefBases) + repeat(alt, numNonrefBases); + } + } + + static double logOdds(double prob1, double prob2) { + return Math.log10(prob1 / (1-prob1)) - Math.log10(prob2 / (1-prob2)); + } + + static double logOddsNonrefRef(MixtureLikelihood[] mixtures) { + // takes list of mixtureLikelihoods SORTED BY QSTAR (default ordering) + // Works for N == 2 right now! + assert (mixtures.length == 2); + MixtureLikelihood bestNonref = ((mixtures[1].posterior > mixtures[2].posterior) ? mixtures[1] : mixtures[2]); + return logOdds(bestNonref.posterior, mixtures[0].posterior); + + // Code to calculate LOD using qstar=0 qhat=0 + //double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) * P_q_G(bases, N, 0.0, 0.0) * P_G(N, 0.0); + //double LOD = Math.log10(highest_posterior) - Math.log10(posterior_null_hyp); } static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum) { @@ -136,18 +241,30 @@ public class AlleleFrequencyWalker extends BasicLociWalker { return p; } - static double P_q_G(byte [] bases, int N, double q, double qstar) { - return binomialProb(Math.round(q*bases.length), bases.length, qstar); + static double P_q_G(byte [] bases, int N, double q, double qstar, long q_R) { + return binomialProb(q_R, bases.length, qstar); } - static double P_G(int N, double qstar, int altnum) { + static double P_G(int N, int qstar_N) { if (N==2) { - return p_G_N_2[ Math.round((float)(qstar * N)) ]; + return p_G_N_2[ qstar_N ]; }else{ return 1.0; } } + static String genotypeTypeString(double q, int N){ + switch (Math.round((float)q*N)) { + case (0): + return "ref"; + case (1): + return "het"; + case (2): + return "hom"; + } + return "ERROR"; + } + static double binomialProb(long k, long n, double p) { // k - numebr of successes // n - number of Bernoulli trials @@ -168,6 +285,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker { } public static String repeat(char letter, long count) { + // Repeat a character count times String result = ""; for (int i=0; i { public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return value + sum; + public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum) { + + //System.out.printf("%s %.2f\n", alleleFreq.asString(), alleleFreq.logOddsVarRef); + return 0;//value + sum; } @@ -207,8 +327,17 @@ public class AlleleFrequencyWalker extends BasicLociWalker { p_G_N_2[0] = 0.999; p_G_N_2[1] = 1e-3; p_G_N_2[2] = 1e-5; + + alleleMetrics = new AlleleFrequencyMetrics(); } + AlleleFrequencyMetrics alleleMetrics; + + public void onTraversalDone() { + alleleMetrics.printMetrics(); + } + + void print_base_qual_matrix(double [][]quals, int numReads) { // Print quals for debugging System.out.println("Base quality matrix"); diff --git a/playground/java/src/org/broadinstitute/sting/utils/AlleleFrequencyMetrics.java b/playground/java/src/org/broadinstitute/sting/utils/AlleleFrequencyMetrics.java new file mode 100755 index 000000000..07c1a8ced --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/utils/AlleleFrequencyMetrics.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.walkers.AlleleFrequencyWalker; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: andrewk + * Date: Mar 18, 2009 + * Time: 5:28:58 PM + * To change this template use File | Settings | File Templates. + */ + + +public class AlleleFrequencyMetrics { + + long dbsnp_tp=0; + long dbsnp_fp=0; + long num_snps=0; + long num_loci=0; + + public void calculateMetrics(List rodData, AlleleFrequencyWalker.AlleleFrequencyEstimate alleleFreq) { + + boolean is_dbSNP_SNP = false; + + for ( ReferenceOrderedDatum datum : rodData ) { + if ( datum != null && datum instanceof rodDbSNP) { + rodDbSNP dbsnp = (rodDbSNP)datum; + if (dbsnp.isSNP()) is_dbSNP_SNP = true; + } + } + + if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLogOddsVarRef() >= 5.0) { // we confidently called it a SNP! + if (is_dbSNP_SNP) { + dbsnp_tp += 1; + }else{ + dbsnp_fp += 1; + } + } + + if (alleleFreq.getQstar() > 0.0) num_snps++; + num_loci++; + + } + + public void printMetrics() { + System.out.println("\nAllele Frequency Metrics:\n"); + System.out.printf("Precision of LOD >= 5 SNPs w.r.t dbSNP: %.2f\n", (float)dbsnp_tp / (dbsnp_fp + dbsnp_tp) * 100); + System.out.printf("\\--TP: %d\n", dbsnp_tp); + System.out.printf("\\--FP: %d\n", dbsnp_fp); + System.out.println(); + System.out.printf("SNPs (LOD > 0): %d\n", num_snps); + System.out.printf("Total loci: %d\n", num_loci); + System.out.printf("SNPs / loci: 1/%.0f\n", (float)num_loci/num_snps); + System.out.println(); + + + + + } + +}