2009-03-24 07:19:54 +08:00
|
|
|
package org.broadinstitute.sting.playground.utils;
|
2009-03-20 06:06:01 +08:00
|
|
|
|
2009-03-24 07:19:54 +08:00
|
|
|
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
2009-04-13 20:29:51 +08:00
|
|
|
import java.util.Arrays;
|
|
|
|
|
import java.lang.Math;
|
2009-04-03 10:09:10 +08:00
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
2009-03-20 06:06:01 +08:00
|
|
|
|
|
|
|
|
public class AlleleFrequencyEstimate {
|
2009-04-19 23:35:07 +08:00
|
|
|
|
2009-06-15 15:20:22 +08:00
|
|
|
/*
|
2009-04-19 23:35:07 +08:00
|
|
|
static
|
|
|
|
|
{
|
|
|
|
|
boolean assertsEnabled = false;
|
|
|
|
|
assert assertsEnabled = true; // Intentional side effect!!!
|
|
|
|
|
if (!assertsEnabled)
|
|
|
|
|
{
|
|
|
|
|
System.err.printf("\n\n\nERROR: You must run with asserts enabled. \"java -ea\".\n\n\n");
|
|
|
|
|
throw new RuntimeException("Asserts must be enabled!");
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-06-15 15:20:22 +08:00
|
|
|
*/
|
2009-04-19 23:35:07 +08:00
|
|
|
|
2009-03-20 06:06:01 +08:00
|
|
|
//AlleleFrequencyEstimate();
|
2009-04-03 10:09:10 +08:00
|
|
|
public GenomeLoc location;
|
2009-03-21 00:44:17 +08:00
|
|
|
public char ref;
|
|
|
|
|
public char alt;
|
|
|
|
|
public int N;
|
|
|
|
|
public double qhat;
|
|
|
|
|
public double qstar;
|
2009-03-26 10:10:18 +08:00
|
|
|
public double lodVsRef;
|
|
|
|
|
public double lodVsNextBest;
|
2009-04-13 20:29:51 +08:00
|
|
|
public double pBest;
|
|
|
|
|
public double pRef;
|
2009-03-23 00:47:49 +08:00
|
|
|
public int depth;
|
2009-03-25 23:18:10 +08:00
|
|
|
public String notes;
|
2009-04-14 23:23:00 +08:00
|
|
|
public String bases;
|
|
|
|
|
public double[][] quals;
|
2009-04-19 23:35:07 +08:00
|
|
|
public double[] posteriors;
|
2009-05-12 23:10:17 +08:00
|
|
|
public String sample_name;
|
2009-06-02 01:13:21 +08:00
|
|
|
public int n_ref;
|
|
|
|
|
public int n_het;
|
|
|
|
|
public int n_hom;
|
2009-03-20 06:06:01 +08:00
|
|
|
|
2009-06-17 04:03:24 +08:00
|
|
|
public GenotypeLikelihoods genotypeLikelihoods = null;
|
|
|
|
|
|
2009-04-03 10:09:10 +08:00
|
|
|
GenomeLoc l;
|
|
|
|
|
|
2009-05-12 23:10:17 +08:00
|
|
|
|
|
|
|
|
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name)
|
2009-03-23 00:47:49 +08:00
|
|
|
{
|
2009-06-15 15:20:22 +08:00
|
|
|
/*
|
2009-04-24 01:21:58 +08:00
|
|
|
if( Double.isNaN(lodVsRef)) { System.out.printf("%s: lodVsRef is NaN\n", location.toString()); }
|
2009-06-02 01:13:21 +08:00
|
|
|
if( Double.isNaN(lodVsNextBest)) { System.out.printf("%s lodVsNextBest is NaN\n", location.toString()); }
|
|
|
|
|
if( Double.isNaN(qhat)) { System.out.printf("%s qhat is NaN\n", location.toString()); }
|
|
|
|
|
if( Double.isNaN(qstar)) { System.out.printf("%s qstar is NaN\n", location.toString()); }
|
|
|
|
|
if( Double.isNaN(pBest)) { System.out.printf("%s pBest is NaN\n", location.toString()); }
|
|
|
|
|
if( Double.isNaN(pRef)) { System.out.printf("%s pRef is NaN (%c %s)\n", location.toString(), ref, bases); }
|
2009-04-19 23:35:07 +08:00
|
|
|
|
2009-04-20 01:52:24 +08:00
|
|
|
if( Double.isInfinite(lodVsRef))
|
|
|
|
|
{
|
2009-04-24 01:21:58 +08:00
|
|
|
System.out.printf("lodVsRef is Infinite: %s %c %s\n", location.toString(), ref, bases);
|
2009-04-20 01:52:24 +08:00
|
|
|
for (int i = 0; i < posteriors.length; i++)
|
|
|
|
|
{
|
|
|
|
|
System.out.printf("POSTERIOR %d %f\n", i, posteriors[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-04-19 23:35:07 +08:00
|
|
|
if( Double.isInfinite(lodVsNextBest)) { System.out.printf("lodVsNextBest is Infinite\n"); }
|
|
|
|
|
if( Double.isInfinite(qhat)) { System.out.printf("qhat is Infinite\n"); }
|
|
|
|
|
if( Double.isInfinite(qstar)) { System.out.printf("qstar is Infinite\n"); }
|
|
|
|
|
if( Double.isInfinite(pBest)) { System.out.printf("pBest is Infinite\n"); }
|
|
|
|
|
if( Double.isInfinite(pRef)) { System.out.printf("pRef is Infinite\n"); }
|
|
|
|
|
|
|
|
|
|
assert(! Double.isNaN(lodVsRef));
|
|
|
|
|
assert(! Double.isNaN(lodVsNextBest));
|
|
|
|
|
assert(! Double.isNaN(qhat));
|
|
|
|
|
assert(! Double.isNaN(qstar));
|
|
|
|
|
assert(! Double.isNaN(pBest));
|
|
|
|
|
assert(! Double.isNaN(pRef));
|
|
|
|
|
|
|
|
|
|
assert(! Double.isInfinite(lodVsRef));
|
|
|
|
|
assert(! Double.isInfinite(lodVsNextBest));
|
|
|
|
|
assert(! Double.isInfinite(qhat));
|
|
|
|
|
assert(! Double.isInfinite(qstar));
|
|
|
|
|
assert(! Double.isInfinite(pBest));
|
|
|
|
|
assert(! Double.isInfinite(pRef));
|
2009-06-15 15:20:22 +08:00
|
|
|
*/
|
2009-04-19 23:35:07 +08:00
|
|
|
|
2009-03-23 00:47:49 +08:00
|
|
|
this.location = location;
|
2009-03-20 06:06:01 +08:00
|
|
|
this.ref = ref;
|
|
|
|
|
this.alt = alt;
|
|
|
|
|
this.N = N;
|
|
|
|
|
this.qhat = qhat;
|
|
|
|
|
this.qstar = qstar;
|
2009-03-26 10:10:18 +08:00
|
|
|
this.lodVsRef = lodVsRef;
|
|
|
|
|
this.lodVsNextBest = lodVsNextBest;
|
2009-05-12 23:10:17 +08:00
|
|
|
this.pBest = pBest;
|
|
|
|
|
this.pRef = pRef;
|
2009-03-23 00:47:49 +08:00
|
|
|
this.depth = depth;
|
2009-03-25 23:18:10 +08:00
|
|
|
this.notes = "";
|
2009-04-14 23:23:00 +08:00
|
|
|
this.bases = bases;
|
|
|
|
|
this.quals = quals;
|
2009-04-19 23:35:07 +08:00
|
|
|
this.posteriors = posteriors;
|
2009-05-12 23:10:17 +08:00
|
|
|
this.sample_name = sample_name;
|
2009-03-20 06:06:01 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-12 23:10:17 +08:00
|
|
|
public boolean isREF() { return (this.lodVsRef <= -5.0); }
|
|
|
|
|
public boolean isSNP() { return (this.lodVsRef >= 5.0); }
|
|
|
|
|
|
2009-04-13 20:29:51 +08:00
|
|
|
/** Return the most likely genotype. */
|
|
|
|
|
public String genotype()
|
|
|
|
|
{
|
|
|
|
|
int alt_count = (int)(qstar * N);
|
|
|
|
|
int ref_count = N-alt_count;
|
|
|
|
|
char[] alleles = new char[N];
|
|
|
|
|
int i;
|
|
|
|
|
for (i = 0; i < ref_count; i++) { alleles[i] = ref; }
|
|
|
|
|
for (; i < N; i++) { alleles[i] = alt; }
|
|
|
|
|
Arrays.sort(alleles);
|
|
|
|
|
return new String(alleles);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public double emperical_allele_frequency()
|
|
|
|
|
{
|
2009-04-19 23:35:07 +08:00
|
|
|
return (double)Math.round((double)qstar * (double)N) / (double)N;
|
2009-04-13 20:29:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public double emperical_allele_frequency(int N)
|
|
|
|
|
{
|
2009-04-19 23:35:07 +08:00
|
|
|
return (double)Math.round((double)qstar * (double)N) / (double)N;
|
2009-04-13 20:29:51 +08:00
|
|
|
}
|
|
|
|
|
|
2009-03-31 01:47:35 +08:00
|
|
|
public String asGFFString()
|
|
|
|
|
{
|
2009-04-13 23:21:23 +08:00
|
|
|
String s = "";
|
|
|
|
|
s += String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\t",
|
2009-04-03 10:09:10 +08:00
|
|
|
location.getContig(),
|
|
|
|
|
location.getStart(),
|
|
|
|
|
location.getStart(),
|
2009-04-13 23:21:23 +08:00
|
|
|
lodVsRef);
|
2009-05-12 23:10:17 +08:00
|
|
|
s += String.format("\t;\tSAMPLE %s", sample_name);
|
2009-04-14 23:23:00 +08:00
|
|
|
s += String.format("\t;\tREF %c", ref);
|
|
|
|
|
s += String.format("\t;\tALT %c", alt);
|
|
|
|
|
s += String.format("\t;\tFREQ %f", qstar);
|
2009-05-12 23:10:17 +08:00
|
|
|
s += String.format("\t;\tGENOTYPE %s", this.genotype());
|
2009-04-14 23:23:00 +08:00
|
|
|
s += String.format("\t;\tDEPTH %d", depth);
|
|
|
|
|
s += String.format("\t;\tLODvsREF %f", lodVsRef);
|
|
|
|
|
s += String.format("\t;\tLODvsNEXTBEST %f", lodVsNextBest);
|
|
|
|
|
s += String.format("\t;\tQHAT %f", qhat);
|
|
|
|
|
s += String.format("\t;\tQSTAR %f", qstar);
|
|
|
|
|
s += String.format("\t;\tBASES %s", bases);
|
2009-04-13 23:21:23 +08:00
|
|
|
|
2009-04-14 23:23:00 +08:00
|
|
|
s += ";\n";
|
2009-04-13 23:21:23 +08:00
|
|
|
|
2009-04-14 23:23:00 +08:00
|
|
|
// add quals.
|
2009-04-13 23:21:23 +08:00
|
|
|
|
|
|
|
|
return s;
|
2009-03-31 01:47:35 +08:00
|
|
|
}
|
|
|
|
|
|
2009-05-22 04:35:50 +08:00
|
|
|
public static String asTabularStringHeader()
|
2009-05-17 00:52:40 +08:00
|
|
|
{
|
|
|
|
|
return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases";
|
|
|
|
|
}
|
|
|
|
|
|
2009-06-01 22:32:12 +08:00
|
|
|
public static String geliHeaderString() {
|
|
|
|
|
return "#Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String asGeliString()
|
|
|
|
|
{
|
|
|
|
|
// #Sequence Position ReferenceBase NumberOfReads MaxMappingQuality BestGenotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT
|
|
|
|
|
// chr1 7764136 A 48 99 CC 83.650421 9.18159 -92.83638 -18.367548 -96.91729 -96.614204 -9.185958 -23.33643 -23.033337 -101.282059 -101.583092 -101.279999
|
|
|
|
|
|
|
|
|
|
// chr pos ref Nreads maxMapQ genotype BtrLod BtnbLod dbSNP AA AC AG AT CC CG CT GG GT TT
|
|
|
|
|
return String.format("%s %16d %c %8d %d %s %.6f %.6f 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0",
|
|
|
|
|
location.getContig(),
|
|
|
|
|
location.getStart(),
|
|
|
|
|
ref,
|
|
|
|
|
depth,
|
|
|
|
|
-1,
|
|
|
|
|
genotype(),
|
|
|
|
|
lodVsRef,
|
|
|
|
|
lodVsNextBest);
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-17 00:52:40 +08:00
|
|
|
public String asTabularString()
|
|
|
|
|
{
|
|
|
|
|
return String.format("%s %s %c %c %s %f %f %f %f %d %s",
|
2009-03-25 09:12:05 +08:00
|
|
|
location,
|
2009-05-17 00:52:40 +08:00
|
|
|
sample_name,
|
2009-03-25 09:12:05 +08:00
|
|
|
ref,
|
|
|
|
|
alt,
|
2009-05-17 00:52:40 +08:00
|
|
|
genotype(),
|
2009-03-25 09:12:05 +08:00
|
|
|
qhat,
|
|
|
|
|
qstar,
|
2009-03-26 10:10:18 +08:00
|
|
|
lodVsRef,
|
|
|
|
|
lodVsNextBest,
|
2009-03-25 23:18:10 +08:00
|
|
|
depth,
|
2009-05-17 00:52:40 +08:00
|
|
|
bases);
|
2009-03-25 09:12:05 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-02 22:09:14 +08:00
|
|
|
public String toString() { return asTabularString(); }
|
|
|
|
|
|
2009-03-20 06:06:01 +08:00
|
|
|
public String asString() {
|
|
|
|
|
// Print out the called bases
|
|
|
|
|
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
|
|
|
|
|
// base has you calculate qstar = 0.0 and qhat = 0.49 and that leads to a genotype predicition of AG according
|
|
|
|
|
// to qhat, but AA according to qstar. This needs to be further investigated to see whether we really want
|
|
|
|
|
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
|
|
|
|
|
long numNonrefBases = Math.round(qstar * N);
|
|
|
|
|
long numRefBases = N - numNonrefBases;
|
2009-03-24 11:58:03 +08:00
|
|
|
if (ref < alt) { // order bases alphabetically
|
|
|
|
|
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
|
|
|
|
|
}else{
|
|
|
|
|
return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases);
|
|
|
|
|
}
|
2009-03-20 06:06:01 +08:00
|
|
|
}
|
2009-04-21 13:38:04 +08:00
|
|
|
|
2009-06-02 01:13:21 +08:00
|
|
|
public String asPoolTabularString()
|
|
|
|
|
{
|
|
|
|
|
return String.format("%s %c %c %f %f %f %s %f %d %d %d %d",
|
|
|
|
|
location,
|
|
|
|
|
ref,
|
|
|
|
|
alt,
|
|
|
|
|
qstar,
|
|
|
|
|
pBest,
|
|
|
|
|
pRef,
|
|
|
|
|
"NA",
|
|
|
|
|
lodVsRef,
|
|
|
|
|
N,
|
|
|
|
|
n_ref,
|
|
|
|
|
n_het,
|
|
|
|
|
n_hom);
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-21 13:38:04 +08:00
|
|
|
|
|
|
|
|
public double posterior()
|
|
|
|
|
{
|
|
|
|
|
return this.posteriors[(int)this.qstar * this.N];
|
|
|
|
|
}
|
2009-03-21 00:44:17 +08:00
|
|
|
}
|