From 417f5b145e72d8ae53944d05cc3e835990e39f7a Mon Sep 17 00:00:00 2001 From: jmaguire Date: Mon, 1 Jun 2009 17:13:21 +0000 Subject: [PATCH] Strand test and misc touch-ups git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@871 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/gatk/walkers/PoolCaller.java | 124 +++++++++++++++--- .../utils/AlleleFrequencyEstimate.java | 30 ++++- .../playground/utils/GenotypeLikelihoods.java | 27 ++-- 3 files changed, 145 insertions(+), 36 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java index cc64fd3d7..0dd210e24 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java @@ -101,8 +101,35 @@ public class PoolCaller extends LocusWalker public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { if (ref == 'N') { return null; } + ref = Character.toUpperCase(ref); - // 1. seperate each context. + // seperate each context. + LocusContext forward = filterLocusContextByStrand(context, "+"); + LocusContext backward = filterLocusContextByStrand(context, "-"); + + if (forward.getReads().size() == 0) { return null; } + if (backward.getReads().size() == 0) { return null; } + + AlleleFrequencyEstimate estimate_both = EM(tracker, ref, context); + AlleleFrequencyEstimate estimate_forward = EM(tracker, ref, forward); + AlleleFrequencyEstimate estimate_backward = EM(tracker, ref, backward); + + discovery_output_file.printf("%s %c %f %c %f\n", + estimate_both.asPoolTabularString(), + estimate_forward.alt, + estimate_forward.lodVsRef, + estimate_backward.alt, + estimate_backward.lodVsRef); + //discovery_output_file.printf("%s\n", estimate_forward.asPoolTabularString()); + //discovery_output_file.printf("%s\n", estimate_backward.asPoolTabularString()); + //discovery_output_file.printf("\n"); + + return null; + } + + private AlleleFrequencyEstimate EM(RefMetaDataTracker tracker, char ref, LocusContext context) + { + if (context.getReads().size() == 0) { return null; } LocusContext[] contexts = filterLocusContext(context, sample_names, 0); // EM Loop: @@ -160,6 +187,8 @@ public class PoolCaller extends LocusWalker calls[i] = callers.get(i).map(tracker, ref, contexts[i]); String genotype = calls[i].genotype(); + if (calls[i].depth == 0) { continue; } + likelihood += calls[i].pBest; if (! FRACTIONAL_COUNTS) @@ -200,33 +229,61 @@ public class PoolCaller extends LocusWalker } // 7. Output some statistics. - double discovery_posterior = 0; + double discovery_likelihood = 0; double discovery_null = 0; + double discovery_prior = 0; + double discovery_null_prior = 0; + int n_ref = 0; + int n_het = 0; + int n_hom = 0; + for (int i = 1; i < EM_N-1; i++) + { + //discovery_prior += 1e-3/(double)i; + } + //discovery_prior = Math.log10(discovery_prior); + //discovery_null_prior = Math.log10(1.0 - discovery_prior); for (int i = 0; i < sample_names.size(); i++) { - discovery_posterior += calls[i].pBest; - discovery_null += calls[i].pRef; - //System.out.printf("DBG %f %f %c %s\n", calls[i].pBest, calls[i].pRef, ref, calls[i].bases); - } - double discovery_lod = discovery_posterior - discovery_null; - if (discovery_lod == 0) { alt = 'N'; } - discovery_output_file.printf("%s %c %c %f %f %f %f\n", context.getLocation(), ref, alt, EM_alt_freq, discovery_posterior, discovery_null, discovery_lod); + if (calls[i].depth == 0) { continue; } + + discovery_likelihood += calls[i].pBest; + discovery_null += calls[i].pRef; + //System.out.printf("DBG %f %f %c %s\n", calls[i].pBest, calls[i].pRef, ref, calls[i].bases); + + if (calls[i].qhat == 0.0) { n_ref += 1; } + if (calls[i].qhat == 0.5) { n_het += 1; } + if (calls[i].qhat == 1.0) { n_hom += 1; } + } + double discovery_lod = (discovery_likelihood + discovery_prior) - (discovery_null + discovery_null_prior); + if (discovery_lod <= 0) { alt = 'N'; } + //discovery_output_file.printf("%s %c %c %f %f %f %f %f %f %d %d %d\n", context.getLocation(), ref, alt, EM_alt_freq, discovery_likelihood, discovery_null, discovery_prior, discovery_lod, EM_N, n_ref, n_het, n_hom); + + AlleleFrequencyEstimate estimate = new AlleleFrequencyEstimate(context.getLocation(), + ref, + alt, + (int)EM_N, + EM_alt_freq, + EM_alt_freq, + discovery_lod, + 0.0, + discovery_likelihood + discovery_prior, + discovery_null + discovery_prior, + context.getReads().size(), + (String)null, + (double[][])null, + (double[])null, + (String)null); + estimate.n_ref = n_ref; // HACK + estimate.n_het = n_het; // HACK + estimate.n_hom = n_hom; // HACK + return estimate; + - for (int i = 0; i < sample_names.size(); i++) - { - ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]); - if (calls[i].depth == 0) { continue; } - //if (calls[i].lodVsRef < lodThreshold) { continue; } - //discovery_output_file.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases()); - } - //for (int i = 0; i < likelihood_trajectory.length; i++) //{ // System.out.printf("TRAJECTORY %f %f\n", trajectory[i], likelihood_trajectory[i]); //} //System.out.print("\n\n"); - - return null; } private LocusContext poolLocusContext(LocusContext[] contexts) @@ -249,6 +306,25 @@ public class PoolCaller extends LocusWalker return new LocusContext(location, reads, offsets); } + private LocusContext filterLocusContextByStrand(LocusContext context, String strand) + { + ArrayList reads = new ArrayList(); + ArrayList offsets = new ArrayList(); + + for (int i = 0; i < context.getReads().size(); i++) + { + SAMRecord read = context.getReads().get(i); + Integer offset = context.getOffsets().get(i); + + // Filter for strandedness + if ((!strand.contains("+")) && (read.getReadNegativeStrandFlag() == false)) { continue; } + if ((!strand.contains("-")) && (read.getReadNegativeStrandFlag() == true)) { continue; } + reads.add(read); + offsets.add(offset); + } + return new LocusContext(context.getLocation(), reads, offsets); + } + private LocusContext[] filterLocusContext(LocusContext context, List sample_names, int downsample) { HashMap index = new HashMap(); @@ -315,9 +391,15 @@ public class PoolCaller extends LocusWalker return contexts; } + private void CollectStrandInformation(char ref, LocusContext context) + { + List reads = context.getReads(); + List offsets = context.getOffsets(); + + } + public void onTraversalDone(String result) { - System.out.printf("OTD\n"); try { discovery_output_file.flush(); @@ -334,7 +416,7 @@ public class PoolCaller extends LocusWalker public String reduceInit() { - discovery_output_file.printf("loc ref alt EM_alt_freq discovery_posterior discovery_null discovery_lod\n"); + discovery_output_file.printf("loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom fw_alt fw_lod bw_alt bw_lod\n"); for (int i = 0; i < callers.size(); i++) { callers.get(i).reduceInit(); diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index 407bdfba8..b030f2b9d 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -35,6 +35,9 @@ public class AlleleFrequencyEstimate { public double[][] quals; public double[] posteriors; public String sample_name; + public int n_ref; + public int n_het; + public int n_hom; GenomeLoc l; @@ -42,11 +45,11 @@ public class AlleleFrequencyEstimate { public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name) { if( Double.isNaN(lodVsRef)) { System.out.printf("%s: lodVsRef is NaN\n", location.toString()); } - if( Double.isNaN(lodVsNextBest)) { System.out.printf("lodVsNextBest is NaN\n"); } - if( Double.isNaN(qhat)) { System.out.printf("qhat is NaN\n"); } - if( Double.isNaN(qstar)) { System.out.printf("qstar is NaN\n"); } - if( Double.isNaN(pBest)) { System.out.printf("pBest is NaN\n"); } - if( Double.isNaN(pRef)) { System.out.printf("pRef is NaN (%c %s)\n", ref, bases); } + if( Double.isNaN(lodVsNextBest)) { System.out.printf("%s lodVsNextBest is NaN\n", location.toString()); } + if( Double.isNaN(qhat)) { System.out.printf("%s qhat is NaN\n", location.toString()); } + if( Double.isNaN(qstar)) { System.out.printf("%s qstar is NaN\n", location.toString()); } + if( Double.isNaN(pBest)) { System.out.printf("%s pBest is NaN\n", location.toString()); } + if( Double.isNaN(pRef)) { System.out.printf("%s pRef is NaN (%c %s)\n", location.toString(), ref, bases); } if( Double.isInfinite(lodVsRef)) { @@ -206,6 +209,23 @@ public class AlleleFrequencyEstimate { } } + public String asPoolTabularString() + { + return String.format("%s %c %c %f %f %f %s %f %d %d %d %d", + location, + ref, + alt, + qstar, + pBest, + pRef, + "NA", + lodVsRef, + N, + n_ref, + n_het, + n_hom); + } + public double posterior() { diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java index 5397513a3..b0c541acc 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java @@ -81,8 +81,10 @@ public class GenotypeLikelihoods { offNextBestBasePriors.put("TT", 0.505); } - public void add(char ref, char read, byte qual) { - for (int i = 0; i < genotypes.length; i++) { + public void add(char ref, char read, byte qual) + { + for (int i = 0; i < genotypes.length; i++) + { likelihoods[i] += calculateAlleleLikelihood(ref, read, genotypes[i], qual); } } @@ -96,14 +98,19 @@ public class GenotypeLikelihoods { double p_base; - if ((h1 == h2) && (h1 == read)) { - p_base = getOneMinusQual(qual); //Math.log10(1 - p_error); - } else if ((h1 != h2) && (h1 == read) || (h2 == read)) { - p_base = getOneHalfMinusQual(qual); // )Math.log10(0.5 - (p_error / 2.0)); - } else { - // the real math would be - // likelihood += log10(pow(10,(qual/-10.0))); - // but it simplifies to + if ((h1 == h2) && (h1 == read)) + { + // hom + p_base = getOneMinusQual(qual); + } + else if ((h1 != h2) && ((h1 == read) || (h2 == read))) + { + // het + p_base = getOneHalfMinusQual(qual); + } + else + { + // error p_base = qual/-10.0; }