From 4e4fd33584f35828fdf059df168a156818480784 Mon Sep 17 00:00:00 2001 From: jmaguire Date: Fri, 24 Apr 2009 13:43:41 +0000 Subject: [PATCH] First draft of actual pooled EM caller. Produces sane looking output on region of 1kG pilot1: CALL NA12813.SRP000031.2009_02.bam CC 0.609084 0.609084 CALL NA12003.SRP000031.2009_02.bam CC 2.114234 2.114234 CCCCC CALL NA06994.SRP000031.2009_02.bam CC 0.910114 0.910114 C CALL NA18940.SRP000031.2009_02.bam CT 2.589749 0.910114 T CALL NA18555.SRP000031.2009_02.bam CC 0.609084 0.609084 Next up, eval vs. Baseline pilot1 calls and pilot3 deep-coverage truth. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@526 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/gatk/walkers/PoolCaller.java | 228 ++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java new file mode 100644 index 000000000..1815866f3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java @@ -0,0 +1,228 @@ + +package org.broadinstitute.sting.playground.gatk.walkers; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.*; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.refdata.rodGFF; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; +import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; +import org.broadinstitute.sting.utils.*; +import 
org.broadinstitute.sting.utils.cmdLine.Argument;

import java.util.*;

/**
 * First draft of a pooled EM caller.
 *
 * One SingleSampleGenotyper is built per read group in the input. At each
 * locus the pooled pileup is split into per-sample contexts, then an EM loop
 * couples the samples: every sample is re-called with the current population
 * alternate-allele frequency as its prior, and the per-sample calls are
 * pooled into an updated frequency estimate for a fixed number of iterations.
 *
 * Draft behavior: results are written to stdout; map() returns null and
 * reduce() is a no-op.
 */
public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
{
    // One genotyper and one sample name per read group, index-aligned.
    List<SingleSampleGenotyper> callers = null;
    List<String> sample_names = null;

    // When true, pool fractional genotype posteriors instead of hard calls.
    @Argument(required=false, shortName="fractional_counts", defaultValue="false") public boolean FRACTIONAL_COUNTS;

    private Random random;

    private SAMFileHeader header;

    /**
     * Builds one SingleSampleGenotyper per read group declared in the SAM
     * header and records each read group's sample name, index-aligned with
     * the caller list.
     */
    public void initialize()
    {
        GenomeAnalysisTK toolkit = this.getToolkit();
        this.header = toolkit.getEngine().getSAMHeader();
        List<SAMReadGroupRecord> read_groups = header.getReadGroups();

        sample_names = new ArrayList<String>();
        callers = new ArrayList<SingleSampleGenotyper>();

        // Fixed seed so any randomized downsampling is reproducible.
        random = new Random(42);

        for (int i = 0; i < read_groups.size(); i++)
        {
            String sample_name = read_groups.get(i).getSample();
            sample_names.add(sample_name);

            SingleSampleGenotyper caller = new SingleSampleGenotyper();
            caller.metricsFileName = "/dev/null"; // discard per-sample metrics in this draft
            caller.lodThreshold = 5.0;
            caller.fourBaseMode = false;
            caller.initialize();
            callers.add(caller);
        }
    }

    /**
     * Calls each sample independently at this locus, then runs a fixed number
     * of EM iterations that couple the samples through a shared population
     * alternate-allele frequency prior.
     *
     * @param tracker reference-ordered data at this locus
     * @param ref     reference base at this locus
     * @param context pooled pileup of all samples at this locus
     * @return always null in this draft; per-sample calls and the pooled
     *         frequency estimate are printed to stdout instead
     */
    public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
    {
        // 1. Separate the pooled pileup into one context per sample.
        LocusContext[] contexts = new LocusContext[sample_names.size()];
        for (int i = 0; i < sample_names.size(); i++)
        {
            contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
        }

        // EM state: population alternate-allele frequency and pooled
        // chromosome count.
        AlleleFrequencyEstimate[] calls = null;
        double EM_alt_freq = 0.5; // uninformative starting prior
        double EM_N = 0;

        int num_iterations = 10;
        double[] trajectory = new double[num_iterations + 1];
        trajectory[0] = EM_alt_freq;
        double[] likelihood_trajectory = new double[num_iterations + 1];
        likelihood_trajectory[0] = 0.0;

        for (int iterations = 0; iterations < num_iterations; iterations++)
        {
            // Re-call every sample with the current frequency estimate as its
            // prior, then pool the calls into an updated frequency estimate.
            calls = new AlleleFrequencyEstimate[sample_names.size()];
            EM_N = 0.0;
            double EM_sum = 0.0;
            double likelihood = 0.0;

            for (int i = 0; i < sample_names.size(); i++)
            {
                callers.get(i).setAlleleFrequencyPrior(EM_alt_freq);
                calls[i] = callers.get(i).map(tracker, ref, contexts[i]);

                likelihood += calls[i].posterior();

                if (!FRACTIONAL_COUNTS)
                {
                    // Hard call: weight each sample's ML allele frequency by
                    // its chromosome count.
                    EM_sum += calls[i].emperical_allele_frequency() * calls[i].N;
                    EM_N += calls[i].N;
                }
                else
                {
                    // Soft call: expected alternate-allele count from the
                    // (log10) genotype posteriors.
                    for (int j = 0; j <= calls[i].N; j++)
                    {
                        // Clamp -Infinity log-posteriors so pow() stays finite.
                        if (Double.isInfinite(calls[i].posteriors[j])) { calls[i].posteriors[j] = -10000; }
                        System.out.printf("DBG3: %d %f %d\n", j, calls[i].posteriors[j], calls[i].N);
                        EM_sum += Math.pow(10, calls[i].posteriors[j]) * (double)j;
                        // NOTE(review): this adds N once per genotype state
                        // (N*(N+1) total per sample) rather than once per
                        // sample; confirm that is the intended normalization.
                        EM_N += calls[i].N;
                    }
                }
            }
            // NOTE(review): if no sample has coverage, EM_N is 0 and this
            // yields NaN for the rest of the trajectory.
            EM_alt_freq = EM_sum / EM_N;
            trajectory[iterations + 1] = EM_alt_freq;
            likelihood_trajectory[iterations + 1] = likelihood / EM_N;
        }

        // Report the final per-sample calls from the last EM iteration.
        for (int i = 0; i < sample_names.size(); i++)
        {
            ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]);
            System.out.printf("CALL %s %s %f %f %s\n", sample_names.get(i), calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, pileup.getBases());
        }

        // Report the pooled population-frequency estimate for this locus.
        System.out.printf("EVAL %s %f\n",
            context.getLocation(),
            EM_alt_freq);
        System.out.print("\n\n");

        return null;
    }

    /**
     * Merges per-sample contexts back into one pooled context. The location
     * is taken from the last non-null context; reads and offsets are
     * concatenated in input order. (Currently unused; kept for pooled-calling
     * experiments.)
     */
    private LocusContext poolLocusContext(LocusContext[] contexts)
    {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        GenomeLoc location = null;

        for (int i = 0; i < contexts.length; i++)
        {
            if (contexts[i] != null)
            {
                location = contexts[i].getLocation();
                reads.addAll(contexts[i].getReads());
                offsets.addAll(contexts[i].getOffsets());
            }
        }

        return new LocusContext(location, reads, offsets);
    }

    /**
     * Restricts a locus context to reads whose read group belongs to the
     * given sample, optionally downsampling to at most 'downsample' reads
     * (0 = keep everything).
     */
    private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
    {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        for (int i = 0; i < context.getReads().size(); i++)
        {
            SAMRecord read = context.getReads().get(i);
            Integer offset = context.getOffsets().get(i);
            String RG = (String)(read.getAttribute("RG"));
            String sample = header.getReadGroup(RG).getSample();
            // Compare by value, not reference: '==' silently dropped reads
            // whenever the header returned an equal-but-distinct String.
            if (sample != null && sample.equals(sample_name))
            {
                reads.add(read);
                offsets.add(offset);
            }
        }

        if (downsample != 0)
        {
            // Pick a random subset of read indices, then keep the
            // corresponding read/offset pairs.
            List<Integer> perm = new ArrayList<Integer>();
            for (int i = 0; i < reads.size(); i++) { perm.add(i); }
            perm = Utils.RandomSubset(perm, downsample);

            ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
            ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();

            for (int i = 0; i < perm.size(); i++)
            {
                downsampled_reads.add(reads.get(perm.get(i)));
                downsampled_offsets.add(offsets.get(perm.get(i)));
            }

            reads = downsampled_reads;
            offsets = downsampled_offsets;
        }

        return new LocusContext(context.getLocation(), reads, offsets);
    }

    public void onTraversalDone()
    {
        return;
    }

    public String reduceInit()
    {
        return "";
    }

    /** Reduce is a no-op in this draft; all output goes to stdout in map(). */
    public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
    {
        return "";
    }
}