First somewhat functional version of AlleleFrequency caller!

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@44 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-03-13 04:10:43 +00:00
parent 7c7d4b3a95
commit 851254970c
5 changed files with 209 additions and 169 deletions

View File

@ -37,7 +37,7 @@ public class AnalysisTK extends CommandLineProgram {
addModule("CountReads", new CountReadsWalker());
addModule("PrintReads", new PrintReadsWalker());
addModule("Base_Quality_Histogram", new BaseQualityHistoWalker());
addModule("Genotype", new GenotypeWalker());
addModule("AlleleFrequency", new AlleleFrequencyWalker());
addModule("SingleSampleGenotyper", new SingleSampleGenotyper());
addModule("Null", new NullWalker());
addModule("DepthOfCoverage", new DepthOfCoverageWalker());

View File

@ -1,78 +0,0 @@
package org.broadinstitute.sting.atk;
/**
* Created by IntelliJ IDEA.
* User: andrewk
* Date: Mar 9, 2009
* Time: 3:34:08 PM
* To change this template use File | Settings | File Templates.
*/
public class GenotypeEvidence {
int[] nuc2num = new int[128];
int[] nucs = new int[4];
int a = nucs[0];
int c = nucs[1];
int t = nucs[2];
int g = nucs[3];
float[] nuc_pcnt = new float[4];
char ref;
public float q; // % non-reference alleles
public int refbases;
public int allbases;
public GenotypeEvidence(String bases, char ref){
this.ref = ref;
nuc2num['A'] = 0;
nuc2num['C'] = 1;
nuc2num['T'] = 2;
nuc2num['G'] = 3;
nuc2num['a'] = 0;
nuc2num['c'] = 1;
nuc2num['t'] = 2;
nuc2num['g'] = 3;
for (char b : bases.toCharArray()) {
nucs[nuc2num[b]] += 1;
/*switch (b) {
case 'A': nucs[0] += 1; break;
case 'C': nucs[1] += 1; break;
case 'T': nucs[2] += 1; break;
case 'G': nucs[3] += 1; break;
} */
}
// Calculate q = ref. bases / nonref. bases
refbases = nucs[nuc2num[ref]];
allbases = bases.length();
q = 1 - ((float)refbases / allbases);
/*for (int i=0; i<4; i++) {
nuc_pcnt[i] = (float)nucs[i] / len;
//if
}*/
}
public boolean SigNonref(float cutoff_fraction) {
/* for (char nuc : nucs) {
}*/
return true;
}
public void print() {
System.out.format("A %2d | ", nucs[0]);
System.out.format("C %2d | ", nucs[1]);
System.out.format("T %2d | ", nucs[2]);
System.out.format("G %2d | ", nucs[3]);
System.out.format("Ref %s | ", ref);
}
}

View File

@ -47,7 +47,7 @@ public class LocusIteratorByHanger extends LocusIterator {
public List<SAMRecord> getReads() { return reads; }
public List<Integer> getOffsets() { return offsets; }
public int numReads() { return readHanger.getLeft().data.size(); }
public int numReads() { return reads.size(); }
}
// -----------------------------------------------------------------------------------------------------------------

View File

@ -0,0 +1,207 @@
package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import net.sf.samtools.SAMRecord;
import java.util.List;
public class AlleleFrequencyWalker extends BasicLociWalker<Integer, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
// Convert context data into bases and 4-base quals
// Set number of chromosomes, N, to 2 for now
int N = 2;
// Convert bases to CharArray
int numReads = context.getReads().size(); //numReads();
byte[] bases = new byte[numReads];
double[][] quals = new double[numReads][4];
int refnum = nuc2num[ref];
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < numReads; i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases[i] = read.getReadBases()[offset];
int callednum = nuc2num[bases[i]];
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
for (int b=0; b<4; b++)
if (b != callednum)
quals[i][b] = nonref_quals;
}
// Print quals for debugging
System.out.println("Base quality matrix");
for (int b=0; b<4; b++) {
System.out.print(num2nuc[b]);
for (int i=0; i < numReads; i++){
System.out.format(" %.4f", quals[i][b]);
}
System.out.println();
}
// Count bases
int[] base_counts = new int[4];
for (byte b : bases)
base_counts[nuc2num[b]]++;
// Find alternate allele - 2nd most frequent non-ref allele
// (maybe we should check for ties and eval both or check most common including quality scores)
int altnum = -1; // start with first base numerical identity (0,1,2,3)
double altcount = -1; // start with first base count
for (int b=0; b<4; b++) {
if ((b != refnum) && (base_counts[b] > altcount)) {
altnum = b; altcount = base_counts[b];
}
}
assert(altnum > 0);
if (true) { //altcount > 2) {
System.out.format("Pos: %s | ref: %c | alt: %c | %2dA | %2dC | %2dT | %2dG | %2d total\n", context.getLocation(), num2nuc[refnum], num2nuc[altnum],
base_counts[0], base_counts[1], base_counts[2], base_counts[3], bases.length);
// Check if we really want to do this one
if (bases.length == 0) {
System.out.println("No reads at position; no call being made");
}else{
AlleleFrequencyEstimator(N, bases, quals, refnum, altnum);
}
}
System.out.println();
return 1;
}
static void AlleleFrequencyEstimator(int N, byte[] bases, double[][] quals, int refnum, int altnum) {
// q = hypothetical %nonref
// qstar = true %nonref
// G = N, qstar, alt
// N = number of chromosomes
// alt = alternate hypothesis base (most frequent nonref base)
// ref = reference base
double epsilon = 0; // 1e-2;
double qstar;
System.out.format("%4s | ", "q ");
for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N)
System.out.format("%5s%12s %12s %12s | ", "qstar", "p(D|q) ", "p(q|G) ", "posterior ");
System.out.println();
double highest_posterior = -1.0;
double highest_qstar = -1.0;
double highest_q = -1.0;
double qstep = 0.01;
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
for (double q=0.0; q <= qend; q += qstep) {
System.out.format("%.2f | ", q);
for (qstar = epsilon; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N) {
double pDq = P_D_q(bases, quals, q, refnum, altnum);
double pqG = P_q_G(bases, N, q, qstar);
double pG = P_G(N, qstar, altnum);
double posterior = pDq * pqG * pG;
System.out.format("%.2f %.10f %.10f %.10f | ", qstar, pDq, pqG, posterior);
if (posterior > highest_posterior) {
highest_posterior = posterior;
highest_qstar = qstar;
highest_q = q;
}
}
System.out.println();
}
System.out.printf("Maximal likelihood posterior: %.6f\n", highest_posterior);
System.out.printf("q that maximimizes posterior: %.6f\n", highest_q);
System.out.printf("qstar used when calculating max q: %.6f\n", highest_qstar);
// Print out the called bases
long numNonrefBases = Math.round(highest_q * N);
long numRefBases = N - numNonrefBases;
String genotype = repeat(num2nuc[refnum], numRefBases) + repeat(num2nuc[altnum], numNonrefBases);
System.out.printf("Maximal likelihood genotype: %s\n", genotype);
}
static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum) {
double p = 1.0;
for (int i=0; i<bases.length; i++) {
p *= (1-q) * quals[i][refnum] + q * quals[i][altnum];
}
return p;
}
static double P_q_G(byte [] bases, int N, double q, double qstar) {
return binomialProb(Math.round(q*bases.length), bases.length, qstar);
}
static double P_G(int N, double qstar, int altnum) {
return 1.0;
}
static double binomialProb(long k, long n, double p) {
// k - numebr of successes
// n - number of Bernoulli trials
// p - probability of success
return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
}
static long nchoosek(long n, long k) {
long m = n - k;
if (k < m)
k = m;
long t = 1;
for (long i = n, j = 1; i > k; i--, j++)
t = t * i / j;
return t;
}
public static String repeat(char letter, long count) {
String result = "";
for (int i=0; i<count; i++) {
result = result + letter;
}
return result;
}
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
static int nuc2num[];
static char num2nuc[];
public AlleleFrequencyWalker() {
nuc2num = new int[128];
nuc2num['A'] = 0;
nuc2num['C'] = 1;
nuc2num['T'] = 2;
nuc2num['G'] = 3;
nuc2num['a'] = 0;
nuc2num['c'] = 1;
nuc2num['t'] = 2;
nuc2num['g'] = 3;
num2nuc = new char[4];
num2nuc[0] = 'A';
num2nuc[1] = 'C';
num2nuc[2] = 'T';
num2nuc[3] = 'G';
}
}

View File

@ -1,89 +0,0 @@
package org.broadinstitute.sting.atk.modules;
import org.broadinstitute.sting.atk.LocusIterator;
import org.broadinstitute.sting.atk.GenotypeEvidence;
import org.broadinstitute.sting.atk.LocusContext;
import org.broadinstitute.sting.utils.ReferenceOrderedDatum;
import net.sf.samtools.SAMRecord;
import java.util.List;
import static java.lang.System.currentTimeMillis;
public class GenotypeWalker extends BasicLociWalker<Integer, Integer> {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
//char[] = new char(26);
long start_tm = currentTimeMillis();
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
String bases = "";
String quals = "";
//String offsetString = "";
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
//if ( offset >= read.getReadString().length() )
// System.out.printf(" [%2d] [%s] %s%n", offset, read.format(), read.getReadString());
bases += read.getReadString().charAt(offset);
//quals += read.getBaseQualityString().charAt(offset);
//offsetString += i;
//System.out.printf(" [%2d] [%s] %s%n", offset, read.getReadString().charAt(offset), read.getReadString());
}
GenotypeEvidence all = new GenotypeEvidence(bases, ref);
// P(q|G) - prob of nonref mixture given the genotype
float qobs = all.q; // observed percent of non-ref bases
double G; // % non-ref bases in observed
if (qobs >= 0.1) {
all.print();
System.out.format("q %.2f | ", all.q);
System.out.format("%s | ", context.getLocation());
System.out.format("Total %4d | ", context.numReads());
System.out.println();
for (int q = 0; q < all.allbases; q ++) {
for (G = 0.01; G <= 1.0; G += 0.49) { // iterate over: ref (0%), het (50%) and hom (100%) nonref bases observed
//double pqG = binomialProb(all.allbases - all.refbases, all.allbases, G);
double pqG = binomialProb(q, all.allbases, G);
//all.print();
System.out.format("P(q|G) %.3f | ", pqG);
}
System.out.println();
}
long stop_tm = currentTimeMillis();
System.out.format("%.3fs\n", (float)(stop_tm - start_tm) / 1000);
}
return 1;
}
static double binomialProb(int k, int n, double p) {
// k - numebr of successes
// n - number of Bernoulli trials
// p - probability of success
return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
}
static int nchoosek(int n, int k) {
int t = 1;
int m = n - k;
if (k < m) {
k = m;
}
for (int i = n, j = 1; i > k; i--, j++) {
t = t * i / j;
}
return t;
}
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
return value + sum;
}
}