Unified caller is go.
AlleleFrequencyWalker and related classes work equally well for 2 or 200 chromosomes.
Single Sample Calling:
Allele Frequency Metrics (LOD >= 5)
-------------------------------------------------
Total loci : 171575
Total called with confidence : 168615 (98.27%)
Number of variants : 111 (0.07%) (1/1519)
Fraction of variant sites in dbSNP : 87.39%
-------------------------------------------------
Hapmap metrics are coming up all zero. Will fix.
Pooled Calling:
AAF r-squared after EM is 0.99.
AAF r-squared after EM for alleles < 20% (in pools of ~100-200 chromosomes) is 0.95 (0.75 before EM)
Still not using fractional genotype counts in EM. That should improve r-squared for low frequency alleles.
Chores still outstanding:
- make a real pooled caller walker (as opposed to my experiment framework).
- add fractional genotype counts to EM cycle.
- add pool metrics to the metrics class? *shrug* we don't really have truth outside of a contrived experiment...
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@380 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f39092526d
commit
6e180ed44e
|
|
@ -31,25 +31,39 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
Random random;
|
Random random;
|
||||||
PrintStream output;
|
PrintStream output;
|
||||||
|
|
||||||
|
private boolean initalized = false;
|
||||||
|
public void initalize()
|
||||||
|
{
|
||||||
|
if (initalized) { return; }
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
this.random = new java.util.Random(0);
|
||||||
|
if ( GFF_OUTPUT_FILE.equals("-") )
|
||||||
|
this.output = out;
|
||||||
|
else
|
||||||
|
this.output = new PrintStream(GFF_OUTPUT_FILE);
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
e.printStackTrace();
|
||||||
|
System.exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
initalized = true;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean requiresReads() { return true; }
|
public boolean requiresReads() { return true; }
|
||||||
|
|
||||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||||
{
|
{
|
||||||
// Convert context data into bases and 4-base quals
|
// init in the map because GATK doesn't appear to be initing me today. no problemo.
|
||||||
String bases = getBases(context);
|
this.initalize();
|
||||||
double quals[][] = getQuals(context);
|
|
||||||
|
|
||||||
/*
|
// Convert context data into bases and 4-base quals
|
||||||
// DEBUG: print the data for a read
|
String bases = getBases(context);
|
||||||
{
|
double quals[][] = getQuals(context);
|
||||||
List<SAMRecord> reads = context.getReads();
|
//String[] indels = getIndels(context);
|
||||||
for (int i = 0; i < reads.size(); i++)
|
|
||||||
{
|
|
||||||
String cigar = reads.get(i).getCigarString();
|
|
||||||
System.out.println("DEBUG " + cigar);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE));
|
logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE));
|
||||||
|
|
||||||
|
|
@ -63,6 +77,14 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
while (downsampled_bases.length() < DOWNSAMPLE)
|
while (downsampled_bases.length() < DOWNSAMPLE)
|
||||||
{
|
{
|
||||||
int choice;
|
int choice;
|
||||||
|
|
||||||
|
/*
|
||||||
|
System.out.printf("DBG %b %b %b\n",
|
||||||
|
random == null,
|
||||||
|
bases == null,
|
||||||
|
picked_bases == null);
|
||||||
|
*/
|
||||||
|
|
||||||
for (choice = random.nextInt(bases.length()); picked_bases[choice] == 1; choice = random.nextInt(bases.length()));
|
for (choice = random.nextInt(bases.length()); picked_bases[choice] == 1; choice = random.nextInt(bases.length()));
|
||||||
picked_bases[choice] = 1;
|
picked_bases[choice] = 1;
|
||||||
downsampled_bases += bases.charAt(choice);
|
downsampled_bases += bases.charAt(choice);
|
||||||
|
|
@ -137,6 +159,56 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
return new String(bases);
|
return new String(bases);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
static public String[] getIndels(LocusContext context)
|
||||||
|
{
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
String[] indels = new String[reads.size()];
|
||||||
|
|
||||||
|
for (int i = 0; i < reads.size(); i++)
|
||||||
|
{
|
||||||
|
SAMRecord read = reads.get(i);
|
||||||
|
Cigar cigar = read.getCigar();
|
||||||
|
|
||||||
|
int k = 0;
|
||||||
|
for (int j = 0; j < cigar.numCigarElements(); j++)
|
||||||
|
{
|
||||||
|
CigarOperator operator = cigar.getCigarElement(j).getOperator();
|
||||||
|
int length = cigar.getCigarElement(j).getLength();
|
||||||
|
if (operator == CigarOperator.M)
|
||||||
|
{
|
||||||
|
k += length;
|
||||||
|
}
|
||||||
|
else if ((k != offset) && (operator == CigarOperator.I))
|
||||||
|
{
|
||||||
|
k += length;
|
||||||
|
}
|
||||||
|
else if ((k == offset) && (operator == CigarOperator.I))
|
||||||
|
{
|
||||||
|
// this insertion is associated with this offset.
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if ((k == offset) && (operator == CigarOperator.D))
|
||||||
|
{
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if ((k == offset) &&
|
||||||
|
((operator == CigarOperator.I) || (operator == CigarOperator.D)))
|
||||||
|
{
|
||||||
|
// no indel here.
|
||||||
|
indels[i] = "";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return indels;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
static public double[][] getQuals (LocusContext context)
|
static public double[][] getQuals (LocusContext context)
|
||||||
{
|
{
|
||||||
int numReads = context.getReads().size(); //numReads();
|
int numReads = context.getReads().size(); //numReads();
|
||||||
|
|
@ -220,51 +292,48 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
// alt = alternate hypothesis base (most frequent nonref base)
|
// alt = alternate hypothesis base (most frequent nonref base)
|
||||||
// ref = reference base
|
// ref = reference base
|
||||||
|
|
||||||
// b = number of bases at locus
|
|
||||||
|
|
||||||
double epsilon = 0; // 1e-2;
|
|
||||||
double qstar;
|
double qstar;
|
||||||
int qstar_N;
|
int qstar_N;
|
||||||
|
|
||||||
double qstep = 0.001;
|
double qstep = 0.001;
|
||||||
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
|
double qend = 1.0 + qstep / 10; // makes sure we get to 1.0 even with rounding error of qsteps accumulating
|
||||||
|
|
||||||
|
double posterior_null_hyp;
|
||||||
|
|
||||||
// Initialize empyt MixtureLikelihood holders for each possible qstar (N+1)
|
// Initialize empyt MixtureLikelihood holders for each possible qstar (N+1)
|
||||||
MixtureLikelihood[] bestMixtures = new MixtureLikelihood[N+1];
|
MixtureLikelihood[] bestMixtures = new MixtureLikelihood[N+1];
|
||||||
// Fill 1..N ML positions with 0 valued MLs
|
|
||||||
for (int i=1; i<=N; i++) bestMixtures[i] = new MixtureLikelihood();
|
|
||||||
// Calculate null hypothesis for qstar = 0, qhat = 0
|
|
||||||
double posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, epsilon, 0) + P_G(N, 0);
|
|
||||||
// Put null hypothesis into ML[0] because that is our prob of that theory and we don't loop over it below
|
|
||||||
bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0);
|
|
||||||
|
|
||||||
// hypothetic allele balance that we sample over
|
|
||||||
for (double q=0.0; q <= qend; q += qstep)
|
|
||||||
{
|
{
|
||||||
long q_R = Math.round(q*bases.length);
|
double q = ML_q(bases, quals, refnum, altnum);
|
||||||
double pDq = P_D_q(bases, quals, q, refnum, altnum);
|
double pDq = P_D_q(bases, quals, q, refnum, altnum);
|
||||||
|
long q_R = Math.round(q*bases.length);
|
||||||
|
|
||||||
|
posterior_null_hyp = P_D_q(bases, quals, 0.0, refnum, altnum) + P_q_G(bases, N, 0.0, 0, 0) + P_G(N, 0);
|
||||||
|
bestMixtures[0] = new MixtureLikelihood(posterior_null_hyp, 0.0, 0.0);
|
||||||
|
|
||||||
// qstar - true allele balances
|
// qstar - true allele balances
|
||||||
for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++)
|
//for (qstar = epsilon + ((1.0 - 2*epsilon)/N), qstar_N = 1; qstar <= 1.0; qstar += (1.0 - 2*epsilon)/N, qstar_N++)
|
||||||
|
for (qstar_N = 1; qstar_N <= N; qstar_N += 1)
|
||||||
{
|
{
|
||||||
|
qstar = (double)qstar_N / (double)N;
|
||||||
|
|
||||||
double pqG = P_q_G(bases, N, q, qstar, q_R);
|
double pqG = P_q_G(bases, N, q, qstar, q_R);
|
||||||
double pG = P_G(N, qstar_N);
|
double pG = P_G(N, qstar_N);
|
||||||
double posterior = pDq + pqG + pG;
|
double posterior = pDq + pqG + pG;
|
||||||
|
|
||||||
if (posterior > bestMixtures[qstar_N].posterior)
|
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
|
||||||
bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q);
|
|
||||||
|
|
||||||
//System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior);
|
//System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", location, q, qstar, pDq, pqG, pG, posterior, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// First reverse sort NONREF mixtures according to highest posterior probabililty
|
// First reverse sort NONREF mixtures according to highest posterior probabililty
|
||||||
Arrays.sort(bestMixtures, 1, N+1);
|
Arrays.sort(bestMixtures);
|
||||||
|
|
||||||
// Calculate Lod of any variant call versus the reference call
|
// Calculate Lod of any variant call versus the reference call
|
||||||
// Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis
|
// Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis
|
||||||
// reference mixture - qhat = qstar = 0.0
|
// reference mixture - qhat = qstar = 0.0
|
||||||
double lodVarVsRef = bestMixtures[1].posterior - posterior_null_hyp;
|
double lodVarVsRef = bestMixtures[0].posterior - posterior_null_hyp;
|
||||||
|
|
||||||
// Now reverse sort ALL mixtures according to highest posterior probability
|
// Now reverse sort ALL mixtures according to highest posterior probability
|
||||||
Arrays.sort(bestMixtures);
|
Arrays.sort(bestMixtures);
|
||||||
|
|
@ -273,6 +342,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
// Answers how confident are we in the best mixture versus the nextPosition best mixture
|
// Answers how confident are we in the best mixture versus the nextPosition best mixture
|
||||||
double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior;
|
double lodBestVsNextBest = bestMixtures[0].posterior - bestMixtures[1].posterior;
|
||||||
|
|
||||||
|
if (lodVarVsRef == 0) { lodVarVsRef = -1.0 * lodBestVsNextBest; }
|
||||||
|
|
||||||
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
|
AlleleFrequencyEstimate alleleFreq = new AlleleFrequencyEstimate(location,
|
||||||
num2nuc[refnum],
|
num2nuc[refnum],
|
||||||
num2nuc[altnum],
|
num2nuc[altnum],
|
||||||
|
|
@ -281,6 +352,8 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
bestMixtures[0].qstar,
|
bestMixtures[0].qstar,
|
||||||
lodVarVsRef,
|
lodVarVsRef,
|
||||||
lodBestVsNextBest,
|
lodBestVsNextBest,
|
||||||
|
bestMixtures[0].posterior,
|
||||||
|
posterior_null_hyp,
|
||||||
depth);
|
depth);
|
||||||
return alleleFreq;
|
return alleleFreq;
|
||||||
}
|
}
|
||||||
|
|
@ -310,7 +383,19 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum)
|
double ML_q(byte[] bases, double[][]quals, int refnum, int altnum)
|
||||||
|
{
|
||||||
|
double ref_count = 0;
|
||||||
|
double alt_count = 0;
|
||||||
|
for (int i=0; i<bases.length; i++)
|
||||||
|
{
|
||||||
|
if (bases[i] == num2nuc[refnum]) { ref_count += 1; }
|
||||||
|
if (bases[i] == num2nuc[altnum]) { alt_count += 1; }
|
||||||
|
}
|
||||||
|
return alt_count / (alt_count + ref_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
double P_D_q(byte[] bases, double[][]quals, double q, int refnum, int altnum)
|
||||||
{
|
{
|
||||||
double p = 0.0;
|
double p = 0.0;
|
||||||
|
|
||||||
|
|
@ -322,15 +407,25 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
static double P_q_G(byte [] bases, int N, double q, double qstar, long q_R)
|
double P_q_G(byte [] bases, int N, double q, double qstar, long q_R)
|
||||||
{
|
{
|
||||||
|
double epsilon = 1e-3;
|
||||||
|
if (qstar == 0) { qstar = epsilon; }
|
||||||
|
if (qstar == 1) { qstar = 1.0 - epsilon; }
|
||||||
|
|
||||||
if (N != 2) { return 0.0; }
|
if (N != 2) { return 0.0; }
|
||||||
else { return Math.log10(binomialProb(q_R, bases.length, qstar)); }
|
else { return Math.log10(binomialProb(q_R, bases.length, qstar)); }
|
||||||
}
|
}
|
||||||
|
|
||||||
static double P_G(int N, int qstar_N)
|
double P_G(int N, int qstar_N)
|
||||||
{
|
{
|
||||||
// badly hard coded right now.
|
// badly hard coded right now.
|
||||||
|
|
||||||
|
if ((N == 2) && (prior_alt_frequency != -1))
|
||||||
|
{
|
||||||
|
return ((double)qstar_N * prior_alt_frequency) + (((double)N-(double)qstar_N)*(1.0-prior_alt_frequency));
|
||||||
|
}
|
||||||
|
|
||||||
if (qstar_N == 0) { return Math.log10(0.999); }
|
if (qstar_N == 0) { return Math.log10(0.999); }
|
||||||
else if (qstar_N == N) { return Math.log10(1e-5); }
|
else if (qstar_N == N) { return Math.log10(1e-5); }
|
||||||
else { return Math.log10(1e-3); }
|
else { return Math.log10(1e-3); }
|
||||||
|
|
@ -462,6 +557,12 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private double prior_alt_frequency = -1.0;
|
||||||
|
public void setAlleleFrequencyPrior(double frequency)
|
||||||
|
{
|
||||||
|
this.prior_alt_frequency = frequency;
|
||||||
|
}
|
||||||
|
|
||||||
static int nuc2num[];
|
static int nuc2num[];
|
||||||
static char num2nuc[];
|
static char num2nuc[];
|
||||||
public AlleleFrequencyWalker() {
|
public AlleleFrequencyWalker() {
|
||||||
|
|
@ -481,20 +582,6 @@ public class AlleleFrequencyWalker extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
num2nuc[2] = 'T';
|
num2nuc[2] = 'T';
|
||||||
num2nuc[3] = 'G';
|
num2nuc[3] = 'G';
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
this.random = new java.util.Random(0);
|
|
||||||
if ( GFF_OUTPUT_FILE.equals("-") )
|
|
||||||
this.output = out;
|
|
||||||
else
|
|
||||||
this.output = new PrintStream(GFF_OUTPUT_FILE);
|
|
||||||
}
|
|
||||||
catch (Exception e)
|
|
||||||
{
|
|
||||||
e.printStackTrace();
|
|
||||||
System.exit(-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
//if (System.getenv("N") != null) { this.N = (new Integer(System.getenv("N"))).intValue(); }
|
//if (System.getenv("N") != null) { this.N = (new Integer(System.getenv("N"))).intValue(); }
|
||||||
//else { this.N = 2; }
|
//else { this.N = 2; }
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.gatk.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||||
|
|
@ -9,27 +12,258 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||||
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
|
|
||||||
public class PoolCallingExperiment extends LocusWalker<AlleleFrequencyEstimate, String>
|
public class PoolCallingExperiment extends LocusWalker<AlleleFrequencyEstimate, String>
|
||||||
{
|
{
|
||||||
List<AlleleFrequencyWalker> deep_callers;
|
List<AlleleFrequencyWalker> deep_callers = null;
|
||||||
List<AlleleFrequencyWalker> shallow_callers;
|
List<AlleleFrequencyWalker> shallow_callers = null;
|
||||||
AlleleFrequencyWalker pooled_caller;
|
AlleleFrequencyWalker pooled_caller = null;
|
||||||
|
List<String> sample_names = null;
|
||||||
|
|
||||||
@Argument public int DOWNSAMPLE;
|
//@Argument public int DOWNSAMPLE;
|
||||||
|
public int DOWNSAMPLE = 4;
|
||||||
|
public int DOWNSAMPLE_NOISE = 3;
|
||||||
|
public boolean LOG_METRICS = true;
|
||||||
|
|
||||||
|
private Random random;
|
||||||
|
|
||||||
|
public void initialize()
|
||||||
|
{
|
||||||
|
GenomeAnalysisTK toolkit = this.getToolkit();
|
||||||
|
SAMFileHeader header = toolkit.getSamReader().getFileHeader();
|
||||||
|
List<SAMReadGroupRecord> read_groups = header.getReadGroups();
|
||||||
|
|
||||||
|
sample_names = new ArrayList<String>();
|
||||||
|
deep_callers = new ArrayList<AlleleFrequencyWalker>();
|
||||||
|
shallow_callers = new ArrayList<AlleleFrequencyWalker>();
|
||||||
|
|
||||||
|
random = new Random(666);
|
||||||
|
|
||||||
|
for (int i = 0; i < read_groups.size(); i++)
|
||||||
|
{
|
||||||
|
String sample_name = read_groups.get(i).getSample();
|
||||||
|
//System.out.println("SAMPLE: " + sample_name);
|
||||||
|
|
||||||
|
AlleleFrequencyWalker deep = new AlleleFrequencyWalker();
|
||||||
|
AlleleFrequencyWalker shallow = new AlleleFrequencyWalker();
|
||||||
|
|
||||||
|
deep.N = 2;
|
||||||
|
deep.DOWNSAMPLE = 0;
|
||||||
|
deep.GFF_OUTPUT_FILE = "/dev/null";
|
||||||
|
deep.initalize();
|
||||||
|
|
||||||
|
shallow.N = 2;
|
||||||
|
shallow.DOWNSAMPLE = 0;
|
||||||
|
shallow.GFF_OUTPUT_FILE = "/dev/null";
|
||||||
|
shallow.initalize();
|
||||||
|
|
||||||
|
sample_names.add(sample_name);
|
||||||
|
deep_callers.add(deep);
|
||||||
|
shallow_callers.add(shallow);
|
||||||
|
}
|
||||||
|
|
||||||
|
pooled_caller = new AlleleFrequencyWalker();
|
||||||
|
pooled_caller.N = sample_names.size() * 2;
|
||||||
|
pooled_caller.DOWNSAMPLE = 0;
|
||||||
|
pooled_caller.GFF_OUTPUT_FILE = "/dev/null";
|
||||||
|
//pooled_caller.LOG_METRICS = LOG_METRICS;
|
||||||
|
//pooled_caller.METRICS_OUTPUT_FILE = "metrics.out";
|
||||||
|
//pooled_caller.METRICS_INTERVAL = 1;
|
||||||
|
pooled_caller.initalize();
|
||||||
|
pooled_caller.reduceInit();
|
||||||
|
}
|
||||||
|
|
||||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||||
{
|
{
|
||||||
|
// 1. seperate each context.
|
||||||
|
LocusContext[] deep_contexts = new LocusContext[sample_names.size()];
|
||||||
|
for (int i = 0; i < sample_names.size(); i++)
|
||||||
|
{
|
||||||
|
deep_contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Pool the deep contexts. (for the hell of it)
|
||||||
|
LocusContext deep_pooled_context = poolLocusContext(deep_contexts);
|
||||||
|
|
||||||
|
// 3. Call individuals from deep coverage.
|
||||||
|
AlleleFrequencyEstimate[] deep_calls = new AlleleFrequencyEstimate[sample_names.size()];
|
||||||
|
for (int i = 0; i < sample_names.size(); i++)
|
||||||
|
{
|
||||||
|
deep_calls[i] = deep_callers.get(i).map(tracker, ref, deep_contexts[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Count "true" allele frequency from deep calls, and compare to the estimated frequency from the pool.
|
||||||
|
// 4.1. Count "true" allele frequency from deep calls, and downsample the individuals with solid truth.
|
||||||
|
double true_alt_freq = 0.0;
|
||||||
|
double true_N = 0.0;
|
||||||
|
LocusContext[] downsampled_contexts = new LocusContext[sample_names.size()];
|
||||||
|
for (int i = 0; i < deep_calls.length; i++)
|
||||||
|
{
|
||||||
|
downsampled_contexts[i] = null;
|
||||||
|
if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
|
||||||
|
{
|
||||||
|
true_alt_freq += deep_calls[i].emperical_allele_frequency() * deep_calls[i].N;
|
||||||
|
true_N += 2;
|
||||||
|
downsampled_contexts[i] = filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian()*DOWNSAMPLE_NOISE));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
true_alt_freq /= true_N;
|
||||||
|
|
||||||
|
if (true_N == 0.0) { return null; } // just bail if true_N == 0.
|
||||||
|
|
||||||
|
// 4.2. Pool just the contexts that have truth data.
|
||||||
|
LocusContext pooled_context = poolLocusContext(downsampled_contexts);
|
||||||
|
|
||||||
|
|
||||||
|
// EM Loop:
|
||||||
|
AlleleFrequencyEstimate pooled_call = null;
|
||||||
|
|
||||||
|
double correct_shallow_calls = 0;
|
||||||
|
double total_shallow_calls = 0;
|
||||||
|
AlleleFrequencyEstimate[] shallow_calls = null;
|
||||||
|
double EM_alt_freq = 0;
|
||||||
|
double EM_N = 0;
|
||||||
|
double shallow_calls_fraction_correct = 0;
|
||||||
|
|
||||||
|
// 5. Call the pool. (this step is the EM init)
|
||||||
|
pooled_caller.N = (int)true_N;
|
||||||
|
pooled_call = pooled_caller.map(tracker, ref, pooled_context);
|
||||||
|
System.out.print("POOLED_CALL " + pooled_call.asTabularString());
|
||||||
|
|
||||||
|
// (this loop is the EM cycle)
|
||||||
|
EM_alt_freq = pooled_call.qhat;
|
||||||
|
int num_iterations = 10;
|
||||||
|
double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq;
|
||||||
|
double[] likelihood_trajectory = new double[num_iterations + 1];
|
||||||
|
for (int iterations = 0; iterations < num_iterations; iterations++)
|
||||||
|
{
|
||||||
|
// 6. Re-call from shallow coverage using the estimated frequency as a prior,
|
||||||
|
// and compare to true deep calls,
|
||||||
|
// and compute new MAF estimate.
|
||||||
|
correct_shallow_calls = 0;
|
||||||
|
total_shallow_calls = 0;
|
||||||
|
shallow_calls = new AlleleFrequencyEstimate[sample_names.size()];
|
||||||
|
EM_N = 0.0;
|
||||||
|
double EM_sum = 0.0;
|
||||||
|
double likelihood = 0.0;
|
||||||
|
|
||||||
|
for (int i = 0; i < deep_calls.length; i++)
|
||||||
|
{
|
||||||
|
// Only shallow-call things where we know the truth!
|
||||||
|
if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0))
|
||||||
|
{
|
||||||
|
shallow_callers.get(i).setAlleleFrequencyPrior(EM_alt_freq);
|
||||||
|
shallow_calls[i] = shallow_callers.get(i).map(tracker, ref, filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian() * DOWNSAMPLE_NOISE)));
|
||||||
|
String deep_genotype = deep_calls[i].genotype();
|
||||||
|
String shallow_genotype = shallow_calls[i].genotype();
|
||||||
|
|
||||||
|
/*
|
||||||
|
System.out.printf("DBG: %f %f %f %f\n",
|
||||||
|
deep_calls[i].lodVsNextBest,
|
||||||
|
deep_calls[i].lodVsRef,
|
||||||
|
shallow_calls[i].lodVsNextBest,
|
||||||
|
shallow_calls[i].lodVsRef);
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (deep_genotype.equals(shallow_genotype)) { correct_shallow_calls += 1; }
|
||||||
|
total_shallow_calls += 1;
|
||||||
|
|
||||||
|
EM_sum += shallow_calls[i].emperical_allele_frequency() * shallow_calls[i].N;
|
||||||
|
EM_N += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EM_alt_freq = EM_sum / EM_N;
|
||||||
|
shallow_calls_fraction_correct = correct_shallow_calls / total_shallow_calls;
|
||||||
|
trajectory[iterations+1] = EM_alt_freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 7. Compare to estimation from the pool.
|
||||||
|
System.out.printf("EVAL %s %f %f %f %f %f %f %d %d %f %f %f %f\n",
|
||||||
|
pooled_call.location,
|
||||||
|
pooled_call.lodVsRef,
|
||||||
|
true_alt_freq,
|
||||||
|
pooled_call.qhat,
|
||||||
|
pooled_call.qstar,
|
||||||
|
true_alt_freq * true_N,
|
||||||
|
pooled_call.emperical_allele_frequency() * true_N,
|
||||||
|
pooled_call.N,
|
||||||
|
pooled_call.depth,
|
||||||
|
total_shallow_calls,
|
||||||
|
correct_shallow_calls,
|
||||||
|
shallow_calls_fraction_correct,
|
||||||
|
EM_alt_freq);
|
||||||
|
System.out.print("TRAJECTORY ");
|
||||||
|
for (int i = 0; i < trajectory.length; i++)
|
||||||
|
{
|
||||||
|
System.out.printf("%f ", trajectory[i]);
|
||||||
|
}
|
||||||
|
System.out.print("\n");
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private LocusContext poolLocusContext(LocusContext[] contexts)
|
||||||
|
{
|
||||||
|
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||||
|
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
GenomeLoc location = null;
|
||||||
|
|
||||||
|
for (int i = 0; i < contexts.length; i++)
|
||||||
|
{
|
||||||
|
if (contexts[i] != null)
|
||||||
|
{
|
||||||
|
location = contexts[i].getLocation();
|
||||||
|
reads.addAll(contexts[i].getReads());
|
||||||
|
offsets.addAll(contexts[i].getOffsets());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new LocusContext(location, reads, offsets);
|
||||||
|
}
|
||||||
|
|
||||||
|
private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
|
||||||
|
{
|
||||||
|
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||||
|
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||||
|
|
||||||
for (int i = 0; i < context.getReads().size(); i++)
|
for (int i = 0; i < context.getReads().size(); i++)
|
||||||
{
|
{
|
||||||
String read_group = (String)(context.getReads().get(i).getAttribute("RG"));
|
SAMRecord read = context.getReads().get(i);
|
||||||
String sample = context.getReads().get(i).getHeader().getReadGroup(read_group).READ_GROUP_SAMPLE_TAG;
|
Integer offset = context.getOffsets().get(i);
|
||||||
System.out.println("RG: " + read_group + " SAMPLE: " + sample);
|
String RG = (String)(read.getAttribute("RG"));
|
||||||
|
String sample = read.getHeader().getReadGroup(RG).getSample();
|
||||||
|
if (sample == sample_name)
|
||||||
|
{
|
||||||
|
reads.add(read);
|
||||||
|
offsets.add(offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
|
if (downsample != 0)
|
||||||
|
{
|
||||||
|
List<Integer> perm = new ArrayList<Integer>();
|
||||||
|
for (int i = 0; i < reads.size(); i++) { perm.add(i); }
|
||||||
|
perm = Utils.RandomSubset(perm, downsample);
|
||||||
|
|
||||||
|
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
|
||||||
|
ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
|
||||||
|
|
||||||
|
for (int i = 0; i < perm.size(); i++)
|
||||||
|
{
|
||||||
|
downsampled_reads.add(reads.get(perm.get(i)));
|
||||||
|
downsampled_offsets.add(offsets.get(perm.get(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
reads = downsampled_reads;
|
||||||
|
offsets = downsampled_offsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new LocusContext(context.getLocation(), reads, offsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone()
|
public void onTraversalDone()
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
package org.broadinstitute.sting.playground.utils;
|
package org.broadinstitute.sting.playground.utils;
|
||||||
|
|
||||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.lang.Math;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
|
||||||
public class AlleleFrequencyEstimate {
|
public class AlleleFrequencyEstimate {
|
||||||
|
|
@ -13,12 +15,14 @@ public class AlleleFrequencyEstimate {
|
||||||
public double qstar;
|
public double qstar;
|
||||||
public double lodVsRef;
|
public double lodVsRef;
|
||||||
public double lodVsNextBest;
|
public double lodVsNextBest;
|
||||||
|
public double pBest;
|
||||||
|
public double pRef;
|
||||||
public int depth;
|
public int depth;
|
||||||
public String notes;
|
public String notes;
|
||||||
|
|
||||||
GenomeLoc l;
|
GenomeLoc l;
|
||||||
|
|
||||||
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, int depth)
|
public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth)
|
||||||
{
|
{
|
||||||
this.location = location;
|
this.location = location;
|
||||||
this.ref = ref;
|
this.ref = ref;
|
||||||
|
|
@ -32,6 +36,29 @@ public class AlleleFrequencyEstimate {
|
||||||
this.notes = "";
|
this.notes = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Return the most likely genotype. */
|
||||||
|
public String genotype()
|
||||||
|
{
|
||||||
|
int alt_count = (int)(qstar * N);
|
||||||
|
int ref_count = N-alt_count;
|
||||||
|
char[] alleles = new char[N];
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < ref_count; i++) { alleles[i] = ref; }
|
||||||
|
for (; i < N; i++) { alleles[i] = alt; }
|
||||||
|
Arrays.sort(alleles);
|
||||||
|
return new String(alleles);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double emperical_allele_frequency()
|
||||||
|
{
|
||||||
|
return (double)Math.round((double)qhat * (double)N) / (double)N;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double emperical_allele_frequency(int N)
|
||||||
|
{
|
||||||
|
return (double)Math.round((double)qhat * (double)N) / (double)N;
|
||||||
|
}
|
||||||
|
|
||||||
public String asGFFString()
|
public String asGFFString()
|
||||||
{
|
{
|
||||||
return String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\tREF %c\t;\tALT %c\t;\tFREQ %f\n",
|
return String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\tREF %c\t;\tALT %c\t;\tFREQ %f\n",
|
||||||
|
|
|
||||||
|
|
@ -143,26 +143,26 @@ public class AlleleMetrics {
|
||||||
if (num_loci_total == 0) { return; }
|
if (num_loci_total == 0) { return; }
|
||||||
|
|
||||||
out.printf("\n");
|
out.printf("\n");
|
||||||
out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
out.printf("Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
||||||
out.printf("METRICS -------------------------------------------------\n");
|
out.printf("-------------------------------------------------\n");
|
||||||
out.printf("METRICS Total loci : %d\n", num_loci_total);
|
out.printf("Total loci : %d\n", num_loci_total);
|
||||||
out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
out.printf("Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||||
if (num_variants != 0)
|
if (num_variants != 0)
|
||||||
{
|
{
|
||||||
out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
out.printf("Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||||
out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
out.printf("Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||||
out.printf("METRICS -------------------------------------------------\n");
|
out.printf("-------------------------------------------------\n");
|
||||||
out.printf("METRICS -- Hapmap Genotyping performance --\n");
|
out.printf("-- Hapmap Genotyping performance --\n");
|
||||||
out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
||||||
out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
out.printf("Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
||||||
out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
||||||
out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
||||||
out.printf("METRICS -------------------------------------------------\n");
|
out.printf("-------------------------------------------------\n");
|
||||||
out.printf("METRICS -- Hapmap Reference/Variant performance --\n");
|
out.printf("-- Hapmap Reference/Variant performance --\n");
|
||||||
out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
||||||
out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
out.printf("Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
||||||
out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
||||||
out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
||||||
}
|
}
|
||||||
out.println();
|
out.println();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue