From af6788fa3d23c51cb98c74184d86ee5114fe82cf Mon Sep 17 00:00:00 2001 From: jmaguire Date: Sun, 19 Apr 2009 15:35:07 +0000 Subject: [PATCH] Misc: 1. Added logGamma function to utils 2. Required asserts to be enabled in the allele caller (run with java -ea) 3. put checks and asserts of NaN and Infinity in AlleleFrequencyEstimate 4. Added option FRACTIONAL_COUNTS to the pooled caller (not working right yet) AlleleFrequencyWalker: 5. Made FORCE_1BASE_PROBS not static in AlleleFrequencyWalker (an argument should never be static! Jeez.) 6. changed quality_precision to be 1e-4 (Q40) 7. don't adjust by quality_precision unless the qual is actually zero. 8. added more asserts for NaN and Infinity 9. put in a correction for zero probs in P_D_q 10. changed pG to be hardy-weinberg in the presence of an allele frequency prior (duh) 11. rewrote binomialProb() to not overflow on deep coverage 12. rewrote nchoosek() to behave right on deep coverage 13. put in some binomailProb() tests in the main() routine (they come out right when compared with R) Hunt for loci where 4bp should change things: 14. added FindNonrandomSecondBestBasePiles walker. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@471 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/AlleleFrequencyWalker.java | 102 ++++++++-- .../FindNonrandomSecondBestBasePiles.java | 187 ++++++++++++++++++ .../gatk/walkers/PoolCallingExperiment.java | 23 ++- .../utils/AlleleFrequencyEstimate.java | 48 ++++- .../org/broadinstitute/sting/utils/Utils.java | 12 +- 5 files changed, 347 insertions(+), 25 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/FindNonrandomSecondBestBasePiles.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java index 1c339860c..a78e042cc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyWalker.java @@ -24,7 +24,7 @@ public class AlleleFrequencyWalker extends LocusWalker n) + return 0; + + if (k > n/2) + k = n-k; // Take advantage of symmetry + + double accum = 1; + for (long i = 1; i <= k; i++) + accum = accum * (n-k+i) / i; + + return accum + 0.5; // avoid rounding error + + /* long m = n - k; if (k < m) k = m; @@ -504,6 +566,7 @@ public class AlleleFrequencyWalker extends LocusWalker k; i--, j++) t = t * i / j; return t; + */ } public static String repeat(char letter, long count) { @@ -690,6 +753,9 @@ public class AlleleFrequencyWalker extends LocusWalker { + @Argument(fullName="verbose",required=false,defaultValue="false") + public boolean VERBOSE; + + private AlleleFrequencyWalker caller_1b; + private AlleleFrequencyWalker caller_4b; + public void initialize() + { + caller_1b = new AlleleFrequencyWalker(); + caller_1b.N = 2; + caller_1b.DOWNSAMPLE = 0; + caller_1b.GFF_OUTPUT_FILE = "/dev/null"; + caller_1b.FORCE_1BASE_PROBS = true; + caller_1b.initalize(); + caller_1b.reduceInit(); + + caller_4b = new AlleleFrequencyWalker(); + caller_4b.N = 2; + caller_4b.DOWNSAMPLE = 0; + caller_4b.GFF_OUTPUT_FILE = "/dev/null"; + caller_4b.FORCE_1BASE_PROBS = false; + caller_4b.initalize(); + caller_4b.reduceInit(); + } + + // Do we actually want to operate on the context? + public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { + return true; // We are keeping all the reads + } + + public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) + { + ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + + // First, call the site, because we're mostly interested in hets + ref = Character.toUpperCase(ref); + AlleleFrequencyEstimate call_1b = caller_1b.map(tracker, ref, context); + AlleleFrequencyEstimate call_4b = caller_4b.map(tracker, ref, context); + + // Only output data for sites that are confident disagreements. + if (Math.abs(call_1b.lodVsRef) < 5.0) { return 0; } + if (Math.abs(call_4b.lodVsRef) < 5.0) { return 0; } + if (call_1b.genotype().equals(call_4b.genotype())) { return 0; } + + List reads = pileup.getReads(); + List offsets = pileup.getOffsets(); + String best_bases = pileup.getBases(); + String second_bases; + double[] quals = new double[reads.size()]; + double[] second_quals = new double[reads.size()]; + { + char[] second_bases_array = new char[reads.size()]; + for ( int i = 0; i < reads.size(); i++ ) + { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + byte qual = (byte)read.getBaseQualities()[offset]; + quals[i] = 1.0 - Math.pow(10,(double)qual/-10.0); + + second_bases_array[i] = BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(((byte[])read.getAttribute("SQ"))[offset])); + second_quals[i] = QualityUtils.compressedQualityToProb(((byte[])read.getAttribute("SQ"))[offset]); + } + second_bases = new String(second_bases_array); + } + + String rodString = ""; + for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) { + if ( datum != null && ! (datum instanceof rodDbSNP)) { + //System.out.printf("rod = %s%n", datum.toSimpleString()); + rodString += datum.toSimpleString(); + //System.out.printf("Rod string %s%n", rodString); + } + } + + rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); + if ( dbsnp != null ) + rodString += dbsnp.toMediumString(); + + if ( rodString != "" ) + rodString = "[ROD: " + rodString + "]"; + + char[] bases = {'A', 'C', 'G', 'T'}; + double[][] counts = new double[4][]; + double[] totals = new double[4]; + double[][] fractional_counts = new double[4][]; + double[] fractional_totals = new double[4]; + for (int i = 0; i < 4; i++) + { + counts[i] = new double[4]; + fractional_counts[i] = new double[4]; + for (int j = 0; j < best_bases.length(); j++) + { + if (best_bases.charAt(j) == bases[i]) + { + counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += 1; + totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += 1; + + fractional_counts[BaseUtils.simpleBaseToBaseIndex(bases[i])][BaseUtils.simpleBaseToBaseIndex(second_bases.charAt(j))] += second_quals[j]; + fractional_totals[BaseUtils.simpleBaseToBaseIndex(bases[i])] += second_quals[j]; + } + } + } + + out.printf("%s\t%c\t%s\n%s\t%c\t%s\n", + pileup.getLocation(), + ref, + best_bases, + pileup.getLocation(), + ref, + second_bases); + out.printf("%s\t%s\t%f\t%f\n", + pileup.getLocation(), + call_1b.genotype(), + call_1b.lodVsRef, + call_1b.lodVsNextBest); + out.printf("%s\t%s\t%f\t%f\n", + pileup.getLocation(), + call_4b.genotype(), + call_4b.lodVsRef, + call_4b.lodVsNextBest); + for (int i = 0; i < quals.length; i++) + { + out.printf("%s\t%c %c %f\t%f\t%f\t%f\t%f\n", + pileup.getLocation(), + best_bases.charAt(i), + second_bases.charAt(i), + quals[i], + second_quals[i], + -10.0 * Math.log10(1.0 - quals[i]), + -10.0 * Math.log10(1.0 - second_quals[i]), + Math.log10((quals[i])/(second_quals[i]))); + } + out.printf("\n"); + for (int i = 0; i < 4; i++) + { + out.printf("%s\t%c ", pileup.getLocation(), bases[i]); + for (int j = 0; j < 4; j++) + { + out.printf("%.03f ", counts[i][j] / totals[i]); + } + out.printf("\n"); + } + /* + out.printf("\n"); + for (int i = 0; i < 4; i++) + { + out.printf("%s\t%c ", pileup.getLocation(), bases[i]); + for (int j = 0; j < 4; j++) + { + out.printf("%.03f ", fractional_counts[i][j] / fractional_totals[i]); + } + out.printf("\n"); + } + */ + out.printf("\n\n"); + + return 1; + } + + // Given result of map function + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCallingExperiment.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCallingExperiment.java index 42afd4b1b..c667ea1d1 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCallingExperiment.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCallingExperiment.java @@ -27,6 +27,7 @@ public class PoolCallingExperiment extends LocusWalker 0 ? (x/base)*100.0 : 0); } public static double percentage(int x, int base) { return (base> 0 ? ((double)x/(double)base)*100.0 : 0); } public static double percentage(long x, long base) { return (base> 0 ? ((double)x/(double)base)*100.0 : 0); }