First draft of an actual pooled EM caller.
Produces sane-looking output on a region of 1kG pilot1:

CALL NA12813.SRP000031.2009_02.bam CC 0.609084 0.609084
CALL NA12003.SRP000031.2009_02.bam CC 2.114234 2.114234 CCCCC
CALL NA06994.SRP000031.2009_02.bam CC 0.910114 0.910114 C
CALL NA18940.SRP000031.2009_02.bam CT 2.589749 0.910114 T
CALL NA18555.SRP000031.2009_02.bam CC 0.609084 0.609084

Next up: eval vs. Baseline pilot1 calls and pilot3 deep-coverage truth.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@526 348d0f76-0448-11de-a6fe-93d51630548a
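(CALL columns, per the printf in map() below: sample, called genotype, LOD vs. reference, LOD vs. next-best genotype, and the pileup bases. Each EM iteration updates the pooled frequency as EM_alt_freq = sum_i(q_i * N_i) / sum_i(N_i), the chromosome-count-weighted mean of the per-sample empirical allele frequencies.)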
parent dd408a2a9a
commit 4e4fd33584

@@ -0,0 +1,228 @@
package org.broadinstitute.sting.playground.gatk.walkers;

import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.*;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;

import java.util.*;

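/**
 * First-draft pooled EM caller: splits the locus pileup into one context per
 * sample (by read group), runs an independent SingleSampleGenotyper on each,
 * and iterates an EM loop that feeds the pooled alternate-allele frequency
 * back into each sample's genotyping prior.
 */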
public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
{
    List<SingleSampleGenotyper> callers = null;
    List<String> sample_names = null;

    //@Argument(required=false, shortName="log_metrics", defaultValue="true") public boolean LOG_METRICS;
    @Argument(required=false, shortName="fractional_counts", defaultValue="false") public boolean FRACTIONAL_COUNTS;

    private Random random;

    private SAMFileHeader header;

    public void initialize()
    {
        GenomeAnalysisTK toolkit = this.getToolkit();
        this.header = toolkit.getEngine().getSAMHeader();
        List<SAMReadGroupRecord> read_groups = header.getReadGroups();

        /*
        GenomeAnalysisTK toolkit = this.getToolkit();
        SAMFileHeader header = toolkit.getSamReader().getFileHeader();
        List<SAMReadGroupRecord> read_groups = header.getReadGroups();
        */

        sample_names = new ArrayList<String>();
        callers = new ArrayList<SingleSampleGenotyper>();

        random = new Random(42);

        for (int i = 0; i < read_groups.size(); i++)
        {
            String sample_name = read_groups.get(i).getSample();
            sample_names.add(sample_name);
            //System.out.println("SAMPLE: " + sample_name);

            SingleSampleGenotyper caller = new SingleSampleGenotyper();
            caller.metricsFileName = "/dev/null";
            caller.lodThreshold = 5.0;
            caller.fourBaseMode = false;
            caller.initialize();
            callers.add(caller);
        }
    }

    public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
    {
        // 1. Separate the pooled pileup into one context per sample.
        LocusContext[] contexts = new LocusContext[sample_names.size()];
        for (int i = 0; i < sample_names.size(); i++)
        {
            contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
            //System.out.printf("DEPTH %s %d\n", sample_names.get(i), contexts[i].getReads().size());
        }
        //System.out.printf("DEPTH %s %d\n", "TOTAL", context.getReads().size());

        // EM loop:
        AlleleFrequencyEstimate[] calls = null;
        double EM_alt_freq = 0;
        double EM_N = 0;

        EM_alt_freq = 0.5;
        int num_iterations = 10;
        double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq;
        double[] likelihood_trajectory = new double[num_iterations + 1]; likelihood_trajectory[0] = 0.0;
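        // Each EM iteration re-genotypes every sample using the current pooled
        // alt-allele frequency as its prior, then re-estimates that frequency as
        // the chromosome-count-weighted mean of the per-sample estimates:
        //   EM_alt_freq = sum_i(q_i * N_i) / sum_i(N_i)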
        for (int iterations = 0; iterations < num_iterations; iterations++)
        {
            // 6. Re-call from shallow coverage using the estimated frequency as a prior,
            //    compare to true deep calls, and compute a new MAF estimate.
            calls = new AlleleFrequencyEstimate[sample_names.size()];
            EM_N = 0.0;
            double EM_sum = 0.0;
            double likelihood = 0.0;

            for (int i = 0; i < sample_names.size(); i++)
            {
                callers.get(i).setAlleleFrequencyPrior(EM_alt_freq);
                calls[i] = callers.get(i).map(tracker, ref, contexts[i]);
                String genotype = calls[i].genotype();

                likelihood += calls[i].posterior();

                //System.out.printf("DBG: %f %f %f %f\n",
                //                  deep_calls[i].lodVsNextBest,
                //                  deep_calls[i].lodVsRef,
                //                  shallow_calls[i].lodVsNextBest,
                //                  shallow_calls[i].lodVsRef);

                if (! FRACTIONAL_COUNTS)
                {
                    // Hard counts: weight each sample's empirical allele
                    // frequency by its chromosome count N.
                    EM_sum += calls[i].emperical_allele_frequency() * calls[i].N;
                    EM_N += calls[i].N;
                }
                else
                {
                    // Fractional counts: weight each candidate alt-allele
                    // count j by its posterior probability.
                    for (int j = 0; j <= calls[i].N; j++)
                    {
                        if (Double.isInfinite(calls[i].posteriors[j])) { calls[i].posteriors[j] = -10000; }
                        System.out.printf("DBG3: %d %f %d\n", j, calls[i].posteriors[j], calls[i].N);
                        EM_sum += Math.pow(10, calls[i].posteriors[j]) * (double)j;
                    }
                    // Count this sample's chromosomes once, not once per candidate count.
                    EM_N += calls[i].N;
                }
            }
            EM_alt_freq = EM_sum / EM_N;
            trajectory[iterations+1] = EM_alt_freq;
            likelihood_trajectory[iterations+1] = likelihood / (double)EM_N;

            //System.out.printf("DBGTRAJ %f %f %f %f\n", EM_sum, EM_N, trajectory[iterations], trajectory[iterations+1]);
        }

        for (int i = 0; i < sample_names.size(); i++)
        {
            ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]);
            System.out.printf("CALL %s %s %f %f %s\n", sample_names.get(i), calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, pileup.getBases());
        }

        // 7. Compare to the estimate from the pool.
        System.out.printf("EVAL %s %f\n",
                          context.getLocation(),
                          EM_alt_freq);
        //for (int i = 0; i < likelihood_trajectory.length; i++)
        //{
        //    System.out.printf("TRAJECTORY %f %f\n", trajectory[i], likelihood_trajectory[i]);
        //}
        System.out.print("\n\n");

        return null;
    }

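    /**
     * Merges per-sample contexts back into a single pooled context
     * (currently unused in this draft).
     */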
    private LocusContext poolLocusContext(LocusContext[] contexts)
    {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        GenomeLoc location = null;

        for (int i = 0; i < contexts.length; i++)
        {
            if (contexts[i] != null)
            {
                location = contexts[i].getLocation();
                reads.addAll(contexts[i].getReads());
                offsets.addAll(contexts[i].getOffsets());
            }
        }

        return new LocusContext(location, reads, offsets);
    }

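    /**
     * Restricts a context to the reads whose read group belongs to
     * sample_name; if downsample is nonzero, keeps a random subset of
     * that many reads.
     */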
    private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
    {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        for (int i = 0; i < context.getReads().size(); i++)
        {
            SAMRecord read = context.getReads().get(i);
            Integer offset = context.getOffsets().get(i);
            String RG = (String)(read.getAttribute("RG"));
            String sample = header.getReadGroup(RG).getSample();
            if (sample.equals(sample_name))  // compare string contents, not references
            {
                reads.add(read);
                offsets.add(offset);
            }
        }

        if (downsample != 0)
        {
            List<Integer> perm = new ArrayList<Integer>();
            for (int i = 0; i < reads.size(); i++) { perm.add(i); }
            perm = Utils.RandomSubset(perm, downsample);

            ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
            ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();

            for (int i = 0; i < perm.size(); i++)
            {
                downsampled_reads.add(reads.get(perm.get(i)));
                downsampled_offsets.add(offsets.get(perm.get(i)));
            }

            reads = downsampled_reads;
            offsets = downsampled_offsets;
        }

        return new LocusContext(context.getLocation(), reads, offsets);
    }

    public void onTraversalDone()
    {
        return;
    }

    public String reduceInit()
    {
        return "";
    }

    public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
    {
        return "";
    }
}