diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java index 91ddf973c..1a083dbd8 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java @@ -31,25 +31,39 @@ public class AlleleFrequencyWalker extends LocusWalker reads = context.getReads(); - for (int i = 0; i < reads.size(); i++) - { - String cigar = reads.get(i).getCigarString(); - System.out.println("DEBUG " + cigar); - } - } - */ + // Convert context data into bases and 4-base quals + String bases = getBases(context); + double quals[][] = getQuals(context); + //String[] indels = getIndels(context); logger.debug(String.format("In alleleFrequnecy walker: N=%d, d=%d", N, DOWNSAMPLE)); @@ -63,6 +77,14 @@ public class AlleleFrequencyWalker extends LocusWalker reads = context.getReads(); + List offsets = context.getOffsets(); + String[] indels = new String[reads.size()]; + + for (int i = 0; i < reads.size(); i++) + { + SAMRecord read = reads.get(i); + Cigar cigar = read.getCigar(); + + int k = 0; + for (int j = 0; j < cigar.numCigarElements(); j++) + { + CigarOperator operator = cigar.getCigarElement(j).getOperator(); + int length = cigar.getCigarElement(j).getLength(); + if (operator == CigarOperator.M) + { + k += length; + } + else if ((k != offset) && (operator == CigarOperator.I)) + { + k += length; + } + else if ((k == offset) && (operator == CigarOperator.I)) + { + // this insertion is associated with this offset. + + break; + } + else if ((k == offset) && (operator == CigarOperator.D)) + { + + break; + } + else if ((k == offset) && + ((operator == CigarOperator.I) || (operator == CigarOperator.D))) + { + // no indel here. 
+ indels[i] = ""; + break; + } + } + } + + return indels; + } + */ + static public double[][] getQuals (LocusContext context) { int numReads = context.getReads().size(); //numReads(); @@ -220,51 +292,48 @@ public class AlleleFrequencyWalker extends LocusWalker bestMixtures[qstar_N].posterior) - bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q); + bestMixtures[qstar_N] = new MixtureLikelihood(posterior, qstar, q); - //System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior); + //System.out.format("DBG %s %.2f %.2f %5.2f %5.2f %5.2f %5.2f %d %d %s\n", location, q, qstar, pDq, pqG, pG, posterior, (int)(q*bases.length), (int)((1.0-q)*bases.length), new String(bases)); } } // First reverse sort NONREF mixtures according to highest posterior probabililty - Arrays.sort(bestMixtures, 1, N+1); + Arrays.sort(bestMixtures); // Calculate Lod of any variant call versus the reference call // Answers how confident are we in the best variant (nonref) mixture versus the null hypothesis // reference mixture - qhat = qstar = 0.0 - double lodVarVsRef = bestMixtures[1].posterior - posterior_null_hyp; + double lodVarVsRef = bestMixtures[0].posterior - posterior_null_hyp; // Now reverse sort ALL mixtures according to highest posterior probability Arrays.sort(bestMixtures); @@ -273,6 +342,8 @@ public class AlleleFrequencyWalker extends LocusWalker { - List deep_callers; - List shallow_callers; - AlleleFrequencyWalker pooled_caller; + List deep_callers = null; + List shallow_callers = null; + AlleleFrequencyWalker pooled_caller = null; + List sample_names = null; - @Argument public int DOWNSAMPLE; + //@Argument public int DOWNSAMPLE; + public int DOWNSAMPLE = 4; + public int DOWNSAMPLE_NOISE = 3; + public boolean LOG_METRICS = true; + + private Random random; + + public void initialize() + { + GenomeAnalysisTK toolkit = this.getToolkit(); + SAMFileHeader header = toolkit.getSamReader().getFileHeader(); + List read_groups = 
header.getReadGroups(); + + sample_names = new ArrayList(); + deep_callers = new ArrayList(); + shallow_callers = new ArrayList(); + + random = new Random(666); + + for (int i = 0; i < read_groups.size(); i++) + { + String sample_name = read_groups.get(i).getSample(); + //System.out.println("SAMPLE: " + sample_name); + + AlleleFrequencyWalker deep = new AlleleFrequencyWalker(); + AlleleFrequencyWalker shallow = new AlleleFrequencyWalker(); + + deep.N = 2; + deep.DOWNSAMPLE = 0; + deep.GFF_OUTPUT_FILE = "/dev/null"; + deep.initalize(); + + shallow.N = 2; + shallow.DOWNSAMPLE = 0; + shallow.GFF_OUTPUT_FILE = "/dev/null"; + shallow.initalize(); + + sample_names.add(sample_name); + deep_callers.add(deep); + shallow_callers.add(shallow); + } + + pooled_caller = new AlleleFrequencyWalker(); + pooled_caller.N = sample_names.size() * 2; + pooled_caller.DOWNSAMPLE = 0; + pooled_caller.GFF_OUTPUT_FILE = "/dev/null"; + //pooled_caller.LOG_METRICS = LOG_METRICS; + //pooled_caller.METRICS_OUTPUT_FILE = "metrics.out"; + //pooled_caller.METRICS_INTERVAL = 1; + pooled_caller.initalize(); + pooled_caller.reduceInit(); + } public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { + // 1. seperate each context. + LocusContext[] deep_contexts = new LocusContext[sample_names.size()]; + for (int i = 0; i < sample_names.size(); i++) + { + deep_contexts[i] = filterLocusContext(context, sample_names.get(i), 0); + } + + // 2. Pool the deep contexts. (for the hell of it) + LocusContext deep_pooled_context = poolLocusContext(deep_contexts); + + // 3. Call individuals from deep coverage. + AlleleFrequencyEstimate[] deep_calls = new AlleleFrequencyEstimate[sample_names.size()]; + for (int i = 0; i < sample_names.size(); i++) + { + deep_calls[i] = deep_callers.get(i).map(tracker, ref, deep_contexts[i]); + } + + // 4. Count "true" allele frequency from deep calls, and compare to the estimated frequency from the pool. + // 4.1. 
Count "true" allele frequency from deep calls, and downsample the individuals with solid truth. + double true_alt_freq = 0.0; + double true_N = 0.0; + LocusContext[] downsampled_contexts = new LocusContext[sample_names.size()]; + for (int i = 0; i < deep_calls.length; i++) + { + downsampled_contexts[i] = null; + if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0)) + { + true_alt_freq += deep_calls[i].emperical_allele_frequency() * deep_calls[i].N; + true_N += 2; + downsampled_contexts[i] = filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian()*DOWNSAMPLE_NOISE)); + } + } + true_alt_freq /= true_N; + + if (true_N == 0.0) { return null; } // just bail if true_N == 0. + + // 4.2. Pool just the contexts that have truth data. + LocusContext pooled_context = poolLocusContext(downsampled_contexts); + + + // EM Loop: + AlleleFrequencyEstimate pooled_call = null; + + double correct_shallow_calls = 0; + double total_shallow_calls = 0; + AlleleFrequencyEstimate[] shallow_calls = null; + double EM_alt_freq = 0; + double EM_N = 0; + double shallow_calls_fraction_correct = 0; + + // 5. Call the pool. (this step is the EM init) + pooled_caller.N = (int)true_N; + pooled_call = pooled_caller.map(tracker, ref, pooled_context); + System.out.print("POOLED_CALL " + pooled_call.asTabularString()); + + // (this loop is the EM cycle) + EM_alt_freq = pooled_call.qhat; + int num_iterations = 10; + double[] trajectory = new double[num_iterations + 1]; trajectory[0] = EM_alt_freq; + double[] likelihood_trajectory = new double[num_iterations + 1]; + for (int iterations = 0; iterations < num_iterations; iterations++) + { + // 6. Re-call from shallow coverage using the estimated frequency as a prior, + // and compare to true deep calls, + // and compute new MAF estimate. 
+ correct_shallow_calls = 0; + total_shallow_calls = 0; + shallow_calls = new AlleleFrequencyEstimate[sample_names.size()]; + EM_N = 0.0; + double EM_sum = 0.0; + double likelihood = 0.0; + + for (int i = 0; i < deep_calls.length; i++) + { + // Only shallow-call things where we know the truth! + if ((deep_calls[i].lodVsNextBest >= 5.0) || (deep_calls[i].lodVsRef <= -5.0)) + { + shallow_callers.get(i).setAlleleFrequencyPrior(EM_alt_freq); + shallow_calls[i] = shallow_callers.get(i).map(tracker, ref, filterLocusContext(context, sample_names.get(i), DOWNSAMPLE + (int)(random.nextGaussian() * DOWNSAMPLE_NOISE))); + String deep_genotype = deep_calls[i].genotype(); + String shallow_genotype = shallow_calls[i].genotype(); + + /* + System.out.printf("DBG: %f %f %f %f\n", + deep_calls[i].lodVsNextBest, + deep_calls[i].lodVsRef, + shallow_calls[i].lodVsNextBest, + shallow_calls[i].lodVsRef); + */ + + if (deep_genotype.equals(shallow_genotype)) { correct_shallow_calls += 1; } + total_shallow_calls += 1; + + EM_sum += shallow_calls[i].emperical_allele_frequency() * shallow_calls[i].N; + EM_N += 2; + } + } + EM_alt_freq = EM_sum / EM_N; + shallow_calls_fraction_correct = correct_shallow_calls / total_shallow_calls; + trajectory[iterations+1] = EM_alt_freq; + } + + // 7. Compare to estimation from the pool. 
+ System.out.printf("EVAL %s %f %f %f %f %f %f %d %d %f %f %f %f\n", + pooled_call.location, + pooled_call.lodVsRef, + true_alt_freq, + pooled_call.qhat, + pooled_call.qstar, + true_alt_freq * true_N, + pooled_call.emperical_allele_frequency() * true_N, + pooled_call.N, + pooled_call.depth, + total_shallow_calls, + correct_shallow_calls, + shallow_calls_fraction_correct, + EM_alt_freq); + System.out.print("TRAJECTORY "); + for (int i = 0; i < trajectory.length; i++) + { + System.out.printf("%f ", trajectory[i]); + } + System.out.print("\n"); + + return null; + } + + private LocusContext poolLocusContext(LocusContext[] contexts) + { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + + GenomeLoc location = null; + + for (int i = 0; i < contexts.length; i++) + { + if (contexts[i] != null) + { + location = contexts[i].getLocation(); + reads.addAll(contexts[i].getReads()); + offsets.addAll(contexts[i].getOffsets()); + } + } + + return new LocusContext(location, reads, offsets); + } + + private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample) + { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + for (int i = 0; i < context.getReads().size(); i++) { - String read_group = (String)(context.getReads().get(i).getAttribute("RG")); - String sample = context.getReads().get(i).getHeader().getReadGroup(read_group).READ_GROUP_SAMPLE_TAG; - System.out.println("RG: " + read_group + " SAMPLE: " + sample); - } - return null; + SAMRecord read = context.getReads().get(i); + Integer offset = context.getOffsets().get(i); + String RG = (String)(read.getAttribute("RG")); + String sample = read.getHeader().getReadGroup(RG).getSample(); + // BUG FIX (review): was `sample == sample_name`, which compares String identity, not + // content; getSample() returns a distinct String object, so no read ever matched and + // every per-sample context came back empty. Use equals() for value comparison. + if (sample.equals(sample_name)) + { + reads.add(read); + offsets.add(offset); + } + } + + if (downsample != 0) + { + List perm = new ArrayList(); + for (int i = 0; i < reads.size(); i++) { perm.add(i); } + perm = Utils.RandomSubset(perm, downsample); + + ArrayList
downsampled_reads = new ArrayList(); + ArrayList downsampled_offsets = new ArrayList(); + + for (int i = 0; i < perm.size(); i++) + { + downsampled_reads.add(reads.get(perm.get(i))); + downsampled_offsets.add(offsets.get(perm.get(i))); + } + + reads = downsampled_reads; + offsets = downsampled_offsets; + } + + return new LocusContext(context.getLocation(), reads, offsets); } public void onTraversalDone() diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index 6973cb233..001d4c369 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.playground.utils; import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; +import java.util.Arrays; +import java.lang.Math; import org.broadinstitute.sting.utils.GenomeLoc; public class AlleleFrequencyEstimate { @@ -13,12 +15,14 @@ public class AlleleFrequencyEstimate { public double qstar; public double lodVsRef; public double lodVsNextBest; + public double pBest; + public double pRef; public int depth; public String notes; GenomeLoc l; - public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, int depth) + public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth) { this.location = location; this.ref = ref; @@ -32,6 +36,29 @@ public class AlleleFrequencyEstimate { this.notes = ""; } + /** Return the most likely genotype. 
*/ + public String genotype() + { + int alt_count = (int)(qstar * N); + int ref_count = N-alt_count; + char[] alleles = new char[N]; + int i; + for (i = 0; i < ref_count; i++) { alleles[i] = ref; } + for (; i < N; i++) { alleles[i] = alt; } + Arrays.sort(alleles); + return new String(alleles); + } + + public double emperical_allele_frequency() + { + return (double)Math.round((double)qhat * (double)N) / (double)N; + } + + public double emperical_allele_frequency(int N) + { + return (double)Math.round((double)qhat * (double)N) / (double)N; + } + public String asGFFString() { return String.format("%s\tCALLER\tVARIANT\t%s\t%s\t%f\t.\t.\tREF %c\t;\tALT %c\t;\tFREQ %f\n", diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java index 8f026d1ad..3fd7668df 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java @@ -143,26 +143,26 @@ public class AlleleMetrics { if (num_loci_total == 0) { return; } out.printf("\n"); - out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff); - out.printf("METRICS -------------------------------------------------\n"); - out.printf("METRICS Total loci : %d\n", num_loci_total); - out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); + out.printf("Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff); + out.printf("-------------------------------------------------\n"); + out.printf("Total loci : %d\n", num_loci_total); + out.printf("Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); if (num_variants != 0) { - out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / 
num_variants); - out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); - out.printf("METRICS -------------------------------------------------\n"); - out.printf("METRICS -- Hapmap Genotyping performance --\n"); - out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect); - out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct); - out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect); - out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect)); - out.printf("METRICS -------------------------------------------------\n"); - out.printf("METRICS -- Hapmap Reference/Variant performance --\n"); - out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect); - out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct); - out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect); - out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect)); + out.printf("Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); + out.printf("Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); + out.printf("-------------------------------------------------\n"); + out.printf("-- Hapmap Genotyping performance --\n"); + out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect); + out.printf("Conf. 
calls at chip sites correct : %d\n", hapmap_genotype_correct); + out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect); + out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect)); + out.printf("-------------------------------------------------\n"); + out.printf("-- Hapmap Reference/Variant performance --\n"); + out.printf("Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect); + out.printf("Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct); + out.printf("Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect); + out.printf("%% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect)); } out.println(); }