Unified caller is go.
AlleleFrequencyWalker and related classes work equally well for 2 or 200 chromosomes.
Single Sample Calling:
Allele Frequency Metrics (LOD >= 5)
-------------------------------------------------
Total loci : 171575
Total called with confidence : 168615 (98.27%)
Number of variants : 111 (0.07%) (1/1519)
Fraction of variant sites in dbSNP : 87.39%
-------------------------------------------------
Hapmap metrics are coming up all zero. Will fix.
Pooled Calling:
AAF r-squared after EM is 0.99.
AAF r-squared after EM for alleles < 20% (in pools of ~100-200 chromosomes) is 0.95 (0.75 before EM)
Still not using fractional genotype counts in EM. That should improve r-squared for low frequency alleles.
Chores still outstanding:
- make a real pooled caller walker (as opposed to my experiment framework).
- add fractional genotype counts to EM cycle.
- add pool metrics to the metrics class? *shrug* we don't really have truth outside of a contrived experiment...
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@380 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f39092526d
commit
6e180ed44e
|
|
@ -31,25 +31,39 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
Random random;
|
||||
PrintStream output;
|
||||
|
||||
private boolean initalized = false;
|
||||
public void initalize()
|
||||
{
|
||||
if (initalized) { return; }
|
||||
|
||||
try
|
||||
{
|
||||
this.random = new java.util.Random(0);
|
||||
if ( GFF_OUTPUT_FILE.equals("-") )
|
||||
this.output = out;
|
||||
else
|
||||
this.output = new PrintStream(GFF_OUTPUT_FILE);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
initalized = true;
|
||||
}
|
||||
|
||||
public boolean requiresReads() { return true; }
|
||||
|
||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
// Convert context data into bases and 4-base quals
|
||||
String bases = getBases(context);
|
||||
double quals[][] = getQuals(context);
|
||||
// init in the map because GATK doesn't appear to be initing me today. no problemo.
|
||||
this.initalize();
|
||||
|
||||
/*
|
||||
// DEBUG: print the data for a read
|
||||
{
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
for (int i = 0; i < reads.size(); i++)
|
||||
{
|
||||
String cigar = reads.get(i).getCigarString();
|
||||
System.out.println("DEBUG " + cigar);
|
||||
}
|
||||
}
|
||||
*/
|
||||
// Convert context data into bases and 4-base quals
|
||||
String bases = getBases(context);
|
||||
double quals[][] = getQuals(context);
|
||||
//String[] indels = getIndels(context);
|
||||
|
||||
logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE));
|
||||
|
||||
|
|
@ -63,6 +77,14 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
while (downsampled_bases.length() < DOWNSAMPLE)
|
||||
{
|
||||
int choice;
|
||||
|
||||
/*
|
||||
System.out.printf("DBG %b %b %b\n",
|
||||
random == null,
|
||||
bases == null,
|
||||
picked_bases == null);
|
||||
*/
|
||||
|
||||
for (choice = random.nextInt(bases.length()); picked_bases[choice] == 1; choice = random.nextInt(bases.length()));
|
||||
picked_bases[choice] = 1;
|
||||
downsampled_bases += bases.charAt(choice);
|
||||
|
|
@ -137,6 +159,56 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
return new String(bases);
|
||||
}
|
||||
|
||||
/*
|
||||
static public String[] getIndels(LocusContext context)
|
||||
{
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
String[] indels = new String[reads.size()];
|
||||
|
||||
for (int i = 0; i < reads.size(); i++)
|
||||
{
|
||||
SAMRecord read = reads.get(i);
|
||||
Cigar cigar = read.getCigar();
|
||||
|
||||
int k = 0;
|
||||
for (int j = 0; j < cigar.numCigarElements(); j++)
|
||||
{
|
||||
CigarOperator operator = cigar.getCigarElement(j).getOperator();
|
||||
int length = cigar.getCigarElement(j).getLength();
|
||||
if (operator == CigarOperator.M)
|
||||
{
|
||||
k += length;
|
||||
}
|
||||
else if ((k != offset) && (operator == CigarOperator.I))
|
||||
{
|
||||
k += length;
|
||||
}
|
||||
else if ((k == offset) && (operator == CigarOperator.I))
|
||||
{
|
||||
// this insertion is associated with this offset.
|
||||
|
||||
break;
|
||||
}
|
||||
else if ((k == offset) && (operator == CigarOperator.D))
|
||||
{
|
||||
|
||||
break;
|
||||
}
|
||||
else if ((k == offset) &&
|
||||
((operator == CigarOperator.I) || (operator == CigarOperator.D)))
|
||||
{
|
||||
// no indel here.
|
||||
indels[i] = "";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return indels;
|
||||
}
|
||||
*/
|
||||
|
||||
static public double[][] getQuals (LocusContext context)
|
||||
{
|
||||
int numReads = context.getReads().size(); //numReads();
|
||||
|
|
@ -220,51 +292,48 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
// alt = alternate hypothesis base (most frequent nonref base)
|
||||
// ref = reference base
|
||||
|
||||
// b = number of bases at locus
|
||||
|
||||
double epsilon = 0; // 1e-2;
|
||||
double qstar;
|
||||
int qstar_N;
|
||||
|
||||
double qstep = 0.001;
|
||||
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
|
||||
|
||||
double posterior_null_hyp;
|
||||
|
||||
// Initialize empyt MixtureLikelihood holders for each possible qstar (N+1)
|
||||
MixtureLikelihood[] bestMixtures = new MixtureLikelihood[N+1];
|
||||
// Fill 1..N ML positions with 0 valued MLs
|
||||
for (int i=1; i<=N; i++) bestMixtures[i] = new MixtureLikelihood();
|
||||
// Calculate null hypothesis for qstar = 0, qhat = 0
|
||||
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, epsilon, 0) + P_G(N, 0);
|
||||
// Put null hypothesis into ML[0] because that is our prob of that theory and we don't loop over it below
|
||||
bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0);
|
||||
|
||||
// hypothetic allele balance that we sample over
|
||||
for (double q=0.0; q <= qend; q += qstep)
|
||||
{
|
||||
long q_R = Math.round(q*bases.length);
|
||||
double q = ML_q(bases, quals, refnum, altnum);
|
||||
double pDq = P_D_q(bases, quals, q, refnum, altnum);
|
||||
long q_R = Math.round(q*bases.length);
|
||||
|
||||
posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0, 0) + P_G(N, 0);
|
||||
bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0);
|
||||
|
||||
// qstar - true allele balances
|
||||
for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++)
|
||||
//for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++)
|
||||
for (qstar_N = 1; qstar_N <= N; qstar_N += 1)
|
||||
{
|
||||
qstar = (double)qstar_N / (double)N;
|
||||
|
||||
double pqG = P_q_G(bases, N, q, qstar, q_R);
|
||||
double pG = P_G(N, qstar_N);
|
||||
double posterior = pDq + pqG + pG;
|
||||
|
||||
if (posterior > bestMixtures[qstar_N].posterior)
|
||||
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
|
||||
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
|
||||
|
||||
//System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior);
|
||||
//System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", location, q, qstar, pDq, pqG, pG, posterior, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases));
|
||||
}
|
||||
}
|
||||
|
||||
// First reverse sort NONREF mixtures according to highest posterior probabililty
|
||||
Arrays.sort(bestMixtures, 1, N+1);
|
||||
Arrays.sort(bestMixtures);
|
||||
|
||||
// Calculate Lod of any variant call versus the reference call
|
||||
// Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis
|
||||
// reference mixture - qhat = qstar = 0.0
|
||||
double lodVarVsRef = bestMixtures[1].posterior - posterior_null_hyp;
|
||||
double lodVarVsRef = bestMixtures[0].posterior - posterior_null_hyp;
|
||||
|
||||
// Now reverse sort ALL mixtures according to highest posterior probability
|
||||
Arrays.sort(bestMixtures);
|
||||
|
|
@ -273,6 +342,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
// Answers how confident are we in the best mixture versus the nextPosition best mixture
|
||||
double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior;
|
||||
|
||||
if (lodVarVsRef == 0) { lodVarVsRef = -1.0 * lodBestVsNextBest; }
|
||||
|
||||
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
|
||||
num2nuc[refnum],
|
||||
num2nuc[altnum],
|
||||
|
|
@ -281,6 +352,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
bestMixtures[0].qstar,
|
||||
lodVarVsRef,
|
||||
lodBestVsNextBest,
|
||||
bestMixtures[0].posterior,
|
||||
posterior_null_hyp,
|
||||
depth);
|
||||
return alleleFreq;
|
||||
}
|
||||
|
|
@ -310,7 +383,19 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
}
|
||||
}
|
||||
|
||||
static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum)
|
||||
double ML_q(byte[] bases, double[][]quals, int refnum, int altnum)
|
||||
{
|
||||
double ref_count = 0;
|
||||
double alt_count = 0;
|
||||
for (int i=0; i<bases.length; i++)
|
||||
{
|
||||
if (bases[i] == num2nuc[refnum]) { ref_count += 1; }
|
||||
if (bases[i] == num2nuc[altnum]) { alt_count += 1; }
|
||||
}
|
||||
return alt_count / (alt_count + ref_count);
|
||||
}
|
||||
|
||||
double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum)
|
||||
{
|
||||
double p = 0.0;
|
||||
|
||||
|
|
@ -322,15 +407,25 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
return p;
|
||||
}
|
||||
|
||||
static double P_q_G(byte [] bases, int N, double q, double qstar, long q_R)
|
||||
double P_q_G(byte [] bases, int N, double q, double qstar, long q_R)
|
||||
{
|
||||
double epsilon = 1e-3;
|
||||
if (qstar == 0) { qstar = epsilon; }
|
||||
if (qstar == 1) { qstar = 1.0 - epsilon; }
|
||||
|
||||
if (N != 2) { return 0.0; }
|
||||
else { return Math.log10(binomialProb(q_R, bases.length, qstar)); }
|
||||
}
|
||||
|
||||
static double P_G(int N, int qstar_N)
|
||||
double P_G(int N, int qstar_N)
|
||||
{
|
||||
// badly hard coded right now.
|
||||
|
||||
if ((N == 2) && (prior_alt_frequency != -1))
|
||||
{
|
||||
return ((double)qstar_N * prior_alt_frequency) + (((double)N-(double)qstar_N)*(1.0-prior_alt_frequency));
|
||||
}
|
||||
|
||||
if (qstar_N == 0) { return Math.log10(0.999); }
|
||||
else if (qstar_N == N) { return Math.log10(1e-5); }
|
||||
else { return Math.log10(1e-3); }
|
||||
|
|
@ -462,6 +557,12 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
return "";
|
||||
}
|
||||
|
||||
private double prior_alt_frequency = -1.0;
|
||||
public void setAlleleFrequencyPrior(double frequency)
|
||||
{
|
||||
this.prior_alt_frequency = frequency;
|
||||
}
|
||||
|
||||
static int nuc2num[];
|
||||
static char num2nuc[];
|
||||
public AlleleFrequencyWalker() {
|
||||
|
|
@ -481,20 +582,6 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
|||
num2nuc[2] = 'T';
|
||||
num2nuc[3] = 'G';
|
||||
|
||||
try
|
||||
{
|
||||
this.random = new java.util.Random(0);
|
||||
if ( GFF_OUTPUT_FILE.equals("-") )
|
||||
this.output = out;
|
||||
else
|
||||
this.output = new PrintStream(GFF_OUTPUT_FILE);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
//if (System.getenv("N") != null) { this.N = (new Integer(System.getenv("N"))).intValue(); }
|
||||
//else { this.N = 2; }
|
||||
//
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
|
|
@ -9,27 +12,258 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class PoolCallingExperiment extends LocusWalker<AlleleFrequencyEstimate, String>
|
||||
{
|
||||
List<AlleleFrequencyWalker> deep_callers;
|
||||
List<AlleleFrequencyWalker> shallow_callers;
|
||||
AlleleFrequencyWalker pooled_caller;
|
||||
List<AlleleFrequencyWalker> deep_callers = null;
|
||||
List<AlleleFrequencyWalker> shallow_callers = null;
|
||||
AlleleFrequencyWalker pooled_caller = null;
|
||||
List<String> sample_names = null;
|
||||
|
||||
@Argument public int DOWNSAMPLE;
|
||||
//@Argument public int DOWNSAMPLE;
|
||||
public int DOWNSAMPLE = 4;
|
||||
public int DOWNSAMPLE_NOISE = 3;
|
||||
public boolean LOG_METRICS = true;
|
||||
|
||||
private Random random;
|
||||
|
||||
public void initialize()
|
||||
{
|
||||
GenomeAnalysisTK toolkit = this.getToolkit();
|
||||
SAMFileHeader header = toolkit.getSamReader().getFileHeader();
|
||||
List<SAMReadGroupRecord> read_groups = header.getReadGroups();
|
||||
|
||||
sample_names = new ArrayList<String>();
|
||||
deep_callers = new ArrayList<AlleleFrequencyWalker>();
|
||||
shallow_callers = new ArrayList<AlleleFrequencyWalker>();
|
||||
|
||||
random = new Random(666);
|
||||
|
||||
for (int i = 0; i < read_groups.size(); i++)
|
||||
{
|
||||
String sample_name = read_groups.get(i).getSample();
|
||||
//System.out.println("SAMPLE: " + sample_name);
|
||||
|
||||
AlleleFrequencyWalker deep = new AlleleFrequencyWalker();
|
||||
AlleleFrequencyWalker shallow = new AlleleFrequencyWalker();
|
||||
|
||||
deep.N = 2;
|
||||
deep.DOWNSAMPLE = 0;
|
||||
deep.GFF_OUTPUT_FILE = "/dev/null";
|
||||
deep.initalize();
|
||||
|
||||
shallow.N = 2;
|
||||
shallow.DOWNSAMPLE = 0;
|
||||
shallow.GFF_OUTPUT_FILE = "/dev/null";
|
||||
shallow.initalize();
|
||||
|
||||
sample_names.add(sample_name);
|
||||
deep_callers.add(deep);
|
||||
shallow_callers.add(shallow);
|
||||
}
|
||||
|
||||
pooled_caller = new AlleleFrequencyWalker();
|
||||
pooled_caller.N = sample_names.size() * 2;
|
||||
pooled_caller.DOWNSAMPLE = 0;
|
||||
pooled_caller.GFF_OUTPUT_FILE = "/dev/null";
|
||||
//pooled_caller.LOG_METRICS = LOG_METRICS;
|
||||
//pooled_caller.METRICS_OUTPUT_FILE = "metrics.out";
|
||||
//pooled_caller.METRICS_INTERVAL = 1;
|
||||
pooled_caller.initalize();
|
||||
pooled_caller.reduceInit();
|
||||
}
|
||||
|
||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
// 1. seperate each context.
|
||||
LocusContext[] deep_contexts = new LocusContext[sample_names.size()];
|
||||
for (int i = 0; i < sample_names.size(); i++)
|
||||
{
|
||||
deep_contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
|
||||
}
|
||||
|
||||
// 2. Pool the deep contexts. (for the hell of it)
|
||||
LocusContext deep_pooled_context = poolLocusContext(deep_contexts);
|
||||
|
||||
// 3. Call individuals from deep coverage.
|
||||
AlleleFrequencyEstimate[] deep_calls = new AlleleFrequencyEstimate[sample_names.size()];
|
||||
for (int i = 0; i < sample_names.size(); i++)
|
||||
{
|
||||
deep_calls[i] = deep_callers.get(i).map(tracker, ref, deep_contexts[i]);
|
||||
}
|
||||
|
||||
// 4. Count "true" allele frequency from deep calls, and compare to the estimated frequency from the pool.
|
||||
// 4.1. Count "true" allele frequency from deep calls, and downsample the individuals with solid truth.
|
||||
double true_alt_freq = 0.0;
|
||||
double true_N = 0.0;
|
||||
LocusContext[] downsampled_contexts = new LocusContext[sample_names.size()];
|
||||
for (int i = 0; i < deep_calls.length; i++)
|
||||
{
|
||||
downsampled_contexts[i] = null;
|
||||
if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
|
||||
{
|
||||
true_alt_freq += deep_calls[i].emperical_allele_frequency() * deep_calls[i].N;
|
||||
true_N += 2;
|
||||
downsampled_contexts[i] = filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian()*DOWNSAMPLE_NOISE));
|
||||
}
|
||||
}
|
||||
true_alt_freq /= true_N;
|
||||
|
||||
if (true_N == 0.0) { return null; } // just bail if true_N == 0.
|
||||
|
||||
// 4.2. Pool just the contexts that have truth data.
|
||||
LocusContext pooled_context = poolLocusContext(downsampled_contexts);
|
||||
|
||||
|
||||
// EM Loop:
|
||||
AlleleFrequencyEstimate pooled_call = null;
|
||||
|
||||
double correct_shallow_calls = 0;
|
||||
double total_shallow_calls = 0;
|
||||
AlleleFrequencyEstimate[] shallow_calls = null;
|
||||
double EM_alt_freq = 0;
|
||||
double EM_N = 0;
|
||||
double shallow_calls_fraction_correct = 0;
|
||||
|
||||
// 5. Call the pool. (this step is the EM init)
|
||||
pooled_caller.N = (int)true_N;
|
||||
pooled_call = pooled_caller.map(tracker, ref, pooled_context);
|
||||
System.out.print("POOLED_CALL " + pooled_call.asTabularString());
|
||||
|
||||
// (this loop is the EM cycle)
|
||||
EM_alt_freq = pooled_call.qhat;
|
||||
int num_iterations = 10;
|
||||
double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq;
|
||||
double[] likelihood_trajectory = new double[num_iterations + 1];
|
||||
for (int iterations = 0; iterations < num_iterations; iterations++)
|
||||
{
|
||||
// 6. Re-call from shallow coverage using the estimated frequency as a prior,
|
||||
// and compare to true deep calls,
|
||||
// and compute new MAF estimate.
|
||||
correct_shallow_calls = 0;
|
||||
total_shallow_calls = 0;
|
||||
shallow_calls = new AlleleFrequencyEstimate[sample_names.size()];
|
||||
EM_N = 0.0;
|
||||
double EM_sum = 0.0;
|
||||
double likelihood = 0.0;
|
||||
|
||||
for (int i = 0; i < deep_calls.length; i++)
|
||||
{
|
||||
// Only shallow-call things where we know the truth!
|
||||
if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
|
||||
{
|
||||
shallow_callers.get(i).setAlleleFrequencyPrior(EM_alt_freq);
|
||||
shallow_calls[i] = shallow_callers.get(i).map(tracker, ref, filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian() * DOWNSAMPLE_NOISE)));
|
||||
String deep_genotype = deep_calls[i].genotype();
|
||||
String shallow_genotype = shallow_calls[i].genotype();
|
||||
|
||||
/*
|
||||
System.out.printf("DBG: %f %f %f %f\n",
|
||||
deep_calls[i].lodVsNextBest,
|
||||
deep_calls[i].lodVsRef,
|
||||
shallow_calls[i].lodVsNextBest,
|
||||
shallow_calls[i].lodVsRef);
|
||||
*/
|
||||
|
||||
if (deep_genotype.equals(shallow_genotype)) { correct_shallow_calls += 1; }
|
||||
total_shallow_calls += 1;
|
||||
|
||||
EM_sum += shallow_calls[i].emperical_allele_frequency() * shallow_calls[i].N;
|
||||
EM_N += 2;
|
||||
}
|
||||
}
|
||||
EM_alt_freq = EM_sum / EM_N;
|
||||
shallow_calls_fraction_correct = correct_shallow_calls / total_shallow_calls;
|
||||
trajectory[iterations+1] = EM_alt_freq;
|
||||
}
|
||||
|
||||
// 7. Compare to estimation from the pool.
|
||||
System.out.printf("EVAL %s %f %f %f %f %f %f %d %d %f %f %f %f\n",
|
||||
pooled_call.location,
|
||||
pooled_call.lodVsRef,
|
||||
true_alt_freq,
|
||||
pooled_call.qhat,
|
||||
pooled_call.qstar,
|
||||
true_alt_freq * true_N,
|
||||
pooled_call.emperical_allele_frequency() * true_N,
|
||||
pooled_call.N,
|
||||
pooled_call.depth,
|
||||
total_shallow_calls,
|
||||
correct_shallow_calls,
|
||||
shallow_calls_fraction_correct,
|
||||
EM_alt_freq);
|
||||
System.out.print("TRAJECTORY ");
|
||||
for (int i = 0; i < trajectory.length; i++)
|
||||
{
|
||||
System.out.printf("%f ", trajectory[i]);
|
||||
}
|
||||
System.out.print("\n");
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private LocusContext poolLocusContext(LocusContext[] contexts)
|
||||
{
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
|
||||
GenomeLoc location = null;
|
||||
|
||||
for (int i = 0; i < contexts.length; i++)
|
||||
{
|
||||
if (contexts[i] != null)
|
||||
{
|
||||
location = contexts[i].getLocation();
|
||||
reads.addAll(contexts[i].getReads());
|
||||
offsets.addAll(contexts[i].getOffsets());
|
||||
}
|
||||
}
|
||||
|
||||
return new LocusContext(location, reads, offsets);
|
||||
}
|
||||
|
||||
private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
|
||||
{
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
|
||||
for (int i = 0; i < context.getReads().size(); i++)
|
||||
{
|
||||
String read_group = (String)(context.getReads().get(i).getAttribute("RG"));
|
||||
String sample = context.getReads().get(i).getHeader().getReadGroup(read_group).READ_GROUP_SAMPLE_TAG;
|
||||
System.out.println("RG: " + read_group + " SAMPLE: " + sample);
|
||||
}
|
||||
return null;
|
||||
SAMRecord read = context.getReads().get(i);
|
||||
Integer offset = context.getOffsets().get(i);
|
||||
String RG = (String)(read.getAttribute("RG"));
|
||||
String sample = read.getHeader().getReadGroup(RG).getSample();
|
||||
if (sample == sample_name)
|
||||
{
|
||||
reads.add(read);
|
||||
offsets.add(offset);
|
||||
}
|
||||
}
|
||||
|
||||
if (downsample != 0)
|
||||
{
|
||||
List<Integer> perm = new ArrayList<Integer>();
|
||||
for (int i = 0; i < reads.size(); i++) { perm.add(i); }
|
||||
perm = Utils.RandomSubset(perm, downsample);
|
||||
|
||||
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
|
||||
|
||||
for (int i = 0; i < perm.size(); i++)
|
||||
{
|
||||
downsampled_reads.add(reads.get(perm.get(i)));
|
||||
downsampled_offsets.add(offsets.get(perm.get(i)));
|
||||
}
|
||||
|
||||
reads = downsampled_reads;
|
||||
offsets = downsampled_offsets;
|
||||
}
|
||||
|
||||
return new LocusContext(context.getLocation(), reads, offsets);
|
||||
}
|
||||
|
||||
public void onTraversalDone()
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.playground.utils;
|
||||
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
import java.util.Arrays;
|
||||
import java.lang.Math;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
public class AlleleFrequencyEstimate {
|
||||
|
|
@ -13,12 +15,14 @@ public class AlleleFrequencyEstimate {
|
|||
public double qstar;
|
||||
public double lodVsRef;
|
||||
public double lodVsNextBest;
|
||||
public double pBest;
|
||||
public double pRef;
|
||||
public int depth;
|
||||
public String notes;
|
||||
|
||||
GenomeLoc l;
|
||||
|
||||
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, int depth)
|
||||
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth)
|
||||
{
|
||||
this.location = location;
|
||||
this.ref = ref;
|
||||
|
|
@ -32,6 +36,29 @@ public class AlleleFrequencyEstimate {
|
|||
this.notes = "";
|
||||
}
|
||||
|
||||
/** Return the most likely genotype. */
|
||||
public String genotype()
|
||||
{
|
||||
int alt_count = (int)(qstar * N);
|
||||
int ref_count = N-alt_count;
|
||||
char[] alleles = new char[N];
|
||||
int i;
|
||||
for (i = 0; i < ref_count; i++) { alleles[i] = ref; }
|
||||
for (; i < N; i++) { alleles[i] = alt; }
|
||||
Arrays.sort(alleles);
|
||||
return new String(alleles);
|
||||
}
|
||||
|
||||
public double emperical_allele_frequency()
|
||||
{
|
||||
return (double)Math.round((double)qhat * (double)N) / (double)N;
|
||||
}
|
||||
|
||||
public double emperical_allele_frequency(int N)
|
||||
{
|
||||
return (double)Math.round((double)qhat * (double)N) / (double)N;
|
||||
}
|
||||
|
||||
public String asGFFString()
|
||||
{
|
||||
return String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\tREF %c\t;\tALT %c\t;\tFREQ %f\n",
|
||||
|
|
|
|||
|
|
@ -143,26 +143,26 @@ public class AlleleMetrics {
|
|||
if (num_loci_total == 0) { return; }
|
||||
|
||||
out.printf("\n");
|
||||
out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
||||
out.printf("METRICS -------------------------------------------------\n");
|
||||
out.printf("METRICS Total loci : %d\n", num_loci_total);
|
||||
out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
out.printf("Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
||||
out.printf("-------------------------------------------------\n");
|
||||
out.printf("Total loci : %d\n", num_loci_total);
|
||||
out.printf("Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
if (num_variants != 0)
|
||||
{
|
||||
out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
out.printf("METRICS -------------------------------------------------\n");
|
||||
out.printf("METRICS -- Hapmap Genotyping performance --\n");
|
||||
out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
||||
out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
||||
out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
||||
out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
||||
out.printf("METRICS -------------------------------------------------\n");
|
||||
out.printf("METRICS -- Hapmap Reference/Variant performance --\n");
|
||||
out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
||||
out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
||||
out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
||||
out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
||||
out.printf("Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
out.printf("Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
out.printf("-------------------------------------------------\n");
|
||||
out.printf("-- Hapmap Genotyping performance --\n");
|
||||
out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
||||
out.printf("Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
||||
out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
||||
out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
||||
out.printf("-------------------------------------------------\n");
|
||||
out.printf("-- Hapmap Reference/Variant performance --\n");
|
||||
out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
||||
out.printf("Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
||||
out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
||||
out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue