From 851254970c6fa47296eef37b248b8d9eaf4dd47d Mon Sep 17 00:00:00 2001 From: andrewk Date: Fri, 13 Mar 2009 04:10:43 +0000 Subject: [PATCH] First somewhat functional version of AlleleFrequency caller! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@44 348d0f76-0448-11de-a6fe-93d51630548a --- .../broadinstitute/sting/atk/AnalysisTK.java | 2 +- .../sting/atk/GenotypeEvidence.java | 78 ------- .../sting/atk/LocusIteratorByHanger.java | 2 +- .../atk/modules/AlleleFrequencyWalker.java | 207 ++++++++++++++++++ .../sting/atk/modules/GenotypeWalker.java | 89 -------- 5 files changed, 209 insertions(+), 169 deletions(-) delete mode 100755 playground/java/src/org/broadinstitute/sting/atk/GenotypeEvidence.java create mode 100755 playground/java/src/org/broadinstitute/sting/atk/modules/AlleleFrequencyWalker.java delete mode 100755 playground/java/src/org/broadinstitute/sting/atk/modules/GenotypeWalker.java diff --git a/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java b/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java index 6b58ed926..bf69f1069 100644 --- a/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java +++ b/playground/java/src/org/broadinstitute/sting/atk/AnalysisTK.java @@ -37,7 +37,7 @@ public class AnalysisTK extends CommandLineProgram { addModule("CountReads", new CountReadsWalker()); addModule("PrintReads", new PrintReadsWalker()); addModule("Base_Quality_Histogram", new BaseQualityHistoWalker()); - addModule("Genotype", new GenotypeWalker()); + addModule("AlleleFrequency", new AlleleFrequencyWalker()); addModule("SingleSampleGenotyper", new SingleSampleGenotyper()); addModule("Null", new NullWalker()); addModule("DepthOfCoverage", new DepthOfCoverageWalker()); diff --git a/playground/java/src/org/broadinstitute/sting/atk/GenotypeEvidence.java b/playground/java/src/org/broadinstitute/sting/atk/GenotypeEvidence.java deleted file mode 100755 index 918123acf..000000000 --- a/playground/java/src/org/broadinstitute/sting/atk/GenotypeEvidence.java +++ /dev/null @@ -1,78 +0,0 @@ -package org.broadinstitute.sting.atk; - -/** - * Created by IntelliJ IDEA. - * User: andrewk - * Date: Mar 9, 2009 - * Time: 3:34:08 PM - * To change this template use File | Settings | File Templates. - */ -public class GenotypeEvidence { - - int[] nuc2num = new int[128]; - int[] nucs = new int[4]; - int a = nucs[0]; - int c = nucs[1]; - int t = nucs[2]; - int g = nucs[3]; - float[] nuc_pcnt = new float[4]; - char ref; - public float q; // % non-reference alleles - public int refbases; - public int allbases; - - public GenotypeEvidence(String bases, char ref){ - this.ref = ref; - nuc2num['A'] = 0; - nuc2num['C'] = 1; - nuc2num['T'] = 2; - nuc2num['G'] = 3; - nuc2num['a'] = 0; - nuc2num['c'] = 1; - nuc2num['t'] = 2; - nuc2num['g'] = 3; - - for (char b : bases.toCharArray()) { - nucs[nuc2num[b]] += 1; - /*switch (b) { - case 'A': nucs[0] += 1; break; - case 'C': nucs[1] += 1; break; - case 'T': nucs[2] += 1; break; - case 'G': nucs[3] += 1; break; - } */ - } - - // Calculate q = ref. bases / nonref. bases - refbases = nucs[nuc2num[ref]]; - allbases = bases.length(); - q = 1 - ((float)refbases / allbases); - - /*for (int i=0; i<4; i++) { - nuc_pcnt[i] = (float)nucs[i] / len; - //if - }*/ - } - - - public boolean SigNonref(float cutoff_fraction) { - /* for (char nuc : nucs) { - - }*/ - - - return true; - } - - public void print() { - - System.out.format("A %2d | ", nucs[0]); - System.out.format("C %2d | ", nucs[1]); - System.out.format("T %2d | ", nucs[2]); - System.out.format("G %2d | ", nucs[3]); - System.out.format("Ref %s | ", ref); - - } - - - -} diff --git a/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java b/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java index e3d56977a..a6f35895b 100755 --- a/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java +++ b/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java @@ -47,7 +47,7 @@ public class LocusIteratorByHanger extends LocusIterator { public List getReads() { return reads; } public List getOffsets() { return offsets; } - public int numReads() { return readHanger.getLeft().data.size(); } + public int numReads() { return reads.size(); } } // ----------------------------------------------------------------------------------------------------------------- diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/AlleleFrequencyWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/AlleleFrequencyWalker.java new file mode 100755 index 000000000..53247657a --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/AlleleFrequencyWalker.java @@ -0,0 +1,207 @@ +package org.broadinstitute.sting.atk.modules; + +import org.broadinstitute.sting.atk.LocusIterator; +import org.broadinstitute.sting.atk.LocusContext; +import org.broadinstitute.sting.utils.ReferenceOrderedDatum; +import net.sf.samtools.SAMRecord; + +import java.util.List; + +public class AlleleFrequencyWalker extends BasicLociWalker { + + + + public Integer map(List rodData, char ref, LocusContext context) { + + // Convert context data into bases and 4-base quals + + // Set number of chromosomes, N, to 2 for now + int N = 2; + + // Convert bases to CharArray + int numReads = context.getReads().size(); //numReads(); + byte[] bases = new byte[numReads]; + double[][] quals = new double[numReads][4]; + int refnum = nuc2num[ref]; + + List reads = context.getReads(); + List offsets = context.getOffsets(); + for (int i =0; i < numReads; i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + bases[i] = read.getReadBases()[offset]; + int callednum = nuc2num[bases[i]]; + quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0); + + // Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual + double nonref_quals = (1.0 - quals[i][callednum]) / 3; + for (int b=0; b<4; b++) + if (b != callednum) + quals[i][b] = nonref_quals; + } + + // Print quals for debugging + System.out.println("Base quality matrix"); + for (int b=0; b<4; b++) { + System.out.print(num2nuc[b]); + for (int i=0; i < numReads; i++){ + System.out.format(" %.4f", quals[i][b]); + } + System.out.println(); + } + + // Count bases + int[] base_counts = new int[4]; + for (byte b : bases) + base_counts[nuc2num[b]]++; + + // Find alternate allele - 2nd most frequent non-ref allele + // (maybe we should check for ties and eval both or check most common including quality scores) + int altnum = -1; // start with first base numerical identity (0,1,2,3) + double altcount = -1; // start with first base count + + for (int b=0; b<4; b++) { + if ((b != refnum) && (base_counts[b] > altcount)) { + altnum = b; altcount = base_counts[b]; + } + } + assert(altnum > 0); + + if (true) { //altcount > 2) { + System.out.format("Pos: %s | ref: %c | alt: %c | %2dA | %2dC | %2dT | %2dG | %2d total\n", context.getLocation(), num2nuc[refnum], num2nuc[altnum], + base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length); + + // Check if we really want to do this one + if (bases.length == 0) { + System.out.println("No reads at position; no call being made"); + }else{ + AlleleFrequencyEstimator(N, bases, quals, refnum, altnum); + } + } + System.out.println(); + + return 1; + } + + static void AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum) { + + // q = hypothetical %nonref + // qstar = true %nonref + // G = N, qstar, alt + // N = number of chromosomes + // alt = alternate hypothesis base (most frequent nonref base) + // ref = reference base + + double epsilon = 0; // 1e-2; + double qstar; + System.out.format("%4s | ", "q "); + for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) + System.out.format("%5s%12s %12s %12s | ", "qstar", "p(D|q) ", "p(q|G) ", "posterior "); + System.out.println(); + + double highest_posterior = -1.0; + double highest_qstar = -1.0; + double highest_q = -1.0; + + double qstep = 0.01; + double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating + for (double q=0.0; q <= qend; q += qstep) { + System.out.format("%.2f | ", q); + for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) { + double pDq = P_D_q(bases, quals, q, refnum, altnum); + double pqG = P_q_G(bases, N, q, qstar); + double pG = P_G(N, qstar, altnum); + double posterior = pDq * pqG * pG; + System.out.format("%.2f %.10f %.10f %.10f | ", qstar, pDq, pqG, posterior); + if (posterior > highest_posterior) { + highest_posterior = posterior; + highest_qstar = qstar; + highest_q = q; + } + } + System.out.println(); + } + System.out.printf("Maximal likelihood posterior: %.6f\n", highest_posterior); + System.out.printf("q that maximimizes posterior: %.6f\n", highest_q); + System.out.printf("qstar used when calculating max q: %.6f\n", highest_qstar); + + // Print out the called bases + long numNonrefBases = Math.round(highest_q * N); + long numRefBases = N - numNonrefBases; + String genotype = repeat(num2nuc[refnum], numRefBases) + repeat(num2nuc[altnum], numNonrefBases); + System.out.printf("Maximal likelihood genotype: %s\n", genotype); + } + + static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum) { + double p = 1.0; + + for (int i=0; i k; i--, j++) + t = t * i / j; + return t; + } + + public static String repeat(char letter, long count) { + String result = ""; + for (int i=0; i { - public Integer map(List rodData, char ref, LocusContext context) { - //char[] = new char(26); - long start_tm = currentTimeMillis(); - List reads = context.getReads(); - List offsets = context.getOffsets(); - String bases = ""; - String quals = ""; - //String offsetString = ""; - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - //if ( offset >= read.getReadString().length() ) - // System.out.printf(" [%2d] [%s] %s%n", offset, read.format(), read.getReadString()); - - bases += read.getReadString().charAt(offset); - //quals += read.getBaseQualityString().charAt(offset); - //offsetString += i; - //System.out.printf(" [%2d] [%s] %s%n", offset, read.getReadString().charAt(offset), read.getReadString()); - } - - GenotypeEvidence all = new GenotypeEvidence(bases, ref); - - // P(q|G) - prob of nonref mixture given the genotype - float qobs = all.q; // observed percent of non-ref bases - double G; // % non-ref bases in observed - if (qobs >= 0.1) { - all.print(); - System.out.format("q %.2f | ", all.q); - System.out.format("%s | ", context.getLocation()); - System.out.format("Total %4d | ", context.numReads()); - System.out.println(); - for (int q = 0; q < all.allbases; q ++) { - for (G = 0.01; G <= 1.0; G += 0.49) { // iterate over: ref (0%), het (50%) and hom (100%) nonref bases observed - //double pqG = binomialProb(all.allbases - all.refbases, all.allbases, G); - double pqG = binomialProb(q, all.allbases, G); - //all.print(); - System.out.format("P(q|G) %.3f | ", pqG); - } - System.out.println(); - } - long stop_tm = currentTimeMillis(); - System.out.format("%.3fs\n", (float)(stop_tm - start_tm) / 1000); - } - return 1; - } - - static double binomialProb(int k, int n, double p) { - // k - numebr of successes - // n - number of Bernoulli trials - // p - probability of success - - return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k); - } - - static int nchoosek(int n, int k) { - int t = 1; - - int m = n - k; - if (k < m) { - k = m; - } - - for (int i = n, j = 1; i > k; i--, j++) { - t = t * i / j; - } - - return t; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } -}