Basic dbSNP and HapMap frequency aware SNP caller... still in progress

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@282 348d0f76-0448-11de-a6fe-93d51630548a
2009-04-03 14:24:09 +00:00 · 2009-04-03 14:24:09 +00:00 · 9bbce32064
parent f031d882c6
commit 9bbce32064
1 changed files with 422 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PopPriorWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PopPriorWalker.java
@ -0,0 +1,422 @@
+package org.broadinstitute.sting.playground.gatk.walkers;
+
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.GenomeAnalysisTK;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
+import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
+import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import net.sf.samtools.SAMRecord;
+
+import java.util.List;
+import java.util.Formatter;
+import static java.lang.Math.log10;
+
+import edu.mit.broad.picard.genotype.DiploidGenotype;
+import edu.mit.broad.picard.PicardException;
+
+public class PopPriorWalker extends LocusWalker<Integer, Integer> {
+
+    @Argument(fullName = "quality_score_cutoff", required = false) public Byte qualityScoreCutoff;
+
+    public PopPriorWalker() {
+    }
+
+    public void initialize() {
+    }
+
+    public String walkerType() { return "ByLocus"; }
+
+    // Do we actually want to operate on the context?
+    public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
+        return true;    // We are keeping all the reads
+    }
+
+    protected class GenotypeLikelihoods
+    {
+
+        public double[] likelihoods;
+        public String[] genotypes;
+
+        GenotypeLikelihoods()
+        {
+            likelihoods = new double[10];
+            genotypes   = new String[10];
+
+            genotypes[0] = "AA";
+            genotypes[1] = "AC";
+            genotypes[2] = "AG";
+            genotypes[3] = "AT";
+            genotypes[4] = "CC";
+            genotypes[5] = "CG";
+            genotypes[6] = "CT";
+            genotypes[7] = "GG";
+            genotypes[8] = "GT";
+            genotypes[9] = "TT";
+        }
+
+        void add(char ref, char read, byte qual)
+        {
+            double p_error = Math.pow(10.0, (double)qual / -10);
+            for (int i = 0; i < genotypes.length; i++)
+            {
+                likelihoods[i] += AlleleLikelihood(ref, read, genotypes[i], p_error);
+            }
+        }
+
+        double AlleleLikelihood(char ref, char read, String genotype, double p_error)
+        {
+            char h1 = genotype.charAt(0);
+            char h2 = genotype.charAt(1);
+
+            double p_base;
+
+            if      ((h1 == h2) && (h1 == read))                 { p_base = Math.log10(1-p_error); }
+            else if ((h1 != h2) && (h1 == read) || (h2 == read)) { p_base = Math.log10(0.5 - (p_error/2.0)); }
+            else                                                 { p_base = Math.log10(p_error); }
+
+            return p_base;
+        }
+
+        // TODO: horrible horrible idea... but it's ok for right now.  should really
+        // implement this more cleanly so you can get sorted genotypes without losing the
+        // ability to call add
+        public void doSort() {
+            Integer[] permutation       = Utils.SortPermutation(likelihoods);
+            String[] sorted_genotypes   = Utils.PermuteArray(genotypes, permutation);
+            double[] sorted_likelihoods = Utils.PermuteArray(likelihoods, permutation);
+
+            this.genotypes = sorted_genotypes;
+            this.likelihoods = sorted_likelihoods;
+        }
+
+    }
+
+    // Map over the org.broadinstitute.sting.gatk.LocusContext
+    public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
+        char upRef = Character.toUpperCase(ref);
+        List<SAMRecord> reads = context.getReads();
+        List<Integer> offsets = context.getOffsets();
+
+        // Look up hapmap and dbsnp priors
+        String rodString = "";
+
+        rodDbSNP dbsnpInfo = null;
+        HapMapAlleleFrequenciesROD hapmap = null;
+
+        for ( ReferenceOrderedDatum datum : rodData )
+        {
+            if ( datum != null )
+            {
+                if ( datum instanceof rodDbSNP) {
+                    dbsnpInfo = (rodDbSNP)datum;
+
+                } else if (datum instanceof HapMapAlleleFrequenciesROD) {
+                    hapmap = (HapMapAlleleFrequenciesROD) datum;
+                }
+            }
+        }
+
+        // Accumulate genotype likelihoods
+        int aCount = 0;
+        int cCount = 0;
+        int gCount = 0;
+        int tCount = 0;
+        GenotypeLikelihoods glAll = new GenotypeLikelihoods();
+        GenotypeLikelihoods glForward = new GenotypeLikelihoods();
+        GenotypeLikelihoods glReverse = new GenotypeLikelihoods();
+        for ( int i = 0; i < reads.size(); i++ )
+        {
+            SAMRecord read = reads.get(i);
+
+            // TODO: could this be done better elsewhere?
+            if (read.getNotPrimaryAlignmentFlag() || read.getDuplicateReadFlag() || read.getReadUnmappedFlag()) {
+                continue;
+            }
+
+            int offset = offsets.get(i);
+            char base = read.getReadString().charAt(offset);
+            byte qual = read.getBaseQualities()[offset];
+
+            if (qual == 0) { continue; }
+            
+            if (qualityScoreCutoff != null && qual < qualityScoreCutoff) { continue; }
+
+            if (base == 'A') { aCount++; }
+            if (base == 'C') { cCount++; }
+            if (base == 'G') { gCount++; }
+            if (base == 'T') { tCount++; }
+
+            // System.out.println(read.getReadName() + " " + base + " " + qual );
+            
+            glAll.add(ref, base, qual);
+
+            if (read.getReadNegativeStrandFlag()) {
+                glReverse.add(ref, base, qual);
+            } else {
+                glForward.add(ref, base, qual);
+            }
+        }
+
+        // apply priors
+        String priorType = "NONE";
+
+        double[] priors;
+
+        // hapmap overrides dbsnp which overrides "novel"
+        if (hapmap != null) {
+            List<String> knownAlleles = hapmap.getAllelesFWD();
+            priorType = "HAPMAP";
+            rodString = "[HAPMAP" + hapmap.toSimpleString() + "]";
+
+            priors = getKnownSiteKnownFreqPriors(((byte)(upRef & 0xff)), knownAlleles, hapmap.getVarAlleleFreq());
+        } else if (dbsnpInfo != null && dbsnpInfo.isSNP()) {
+            List<String> knownAlleles = dbsnpInfo.getAllelesFWD();
+            priorType = "DBSNP";
+            rodString = "[DBSNP: " + dbsnpInfo.toMediumString() + "]";
+
+            priors = getKnownSitePriors(((byte)(upRef & 0xff)), knownAlleles);
+        } else {
+            priors = getNovelSitePriors(((byte)(upRef & 0xff)));
+        }
+
+
+
+        StringBuilder priorString = new StringBuilder();
+        priorString.append(priorType);
+        for(int i=0; i<priors.length; i++) {
+
+            // apply the prior!!!
+            double log10prior = log10(priors[i]);
+            glAll.likelihoods[i] += log10prior;
+            glForward.likelihoods[i] += log10prior;
+            glReverse.likelihoods[i] += log10prior;
+
+            priorString.append(" ");
+            priorString
+                    .append(glAll.genotypes[i])
+                    .append(":")
+                    .append(new Formatter().format("%4.2f",log10prior));
+
+        }
+
+
+        // FIXME: horrible way to do this!!!!
+        glAll.doSort();
+        glForward.doSort();
+        glReverse.doSort();
+
+
+        String gt = glAll.genotypes[glAll.genotypes.length-1];
+        double btnb = glAll.likelihoods[glAll.genotypes.length-1] - glAll.likelihoods[glAll.genotypes.length-2];
+        double btr = 1.0d; // FIXME: do we need this?
+
+        // just to mimic old behavior
+        String type = "homozygous";
+        if (gt.charAt(0) != gt.charAt(1)) { type = "heterozygous-SNP"; }
+        else if (gt.charAt(0) != upRef) { type = "homozygous-SNP"; }
+
+
+        out.print(String.format("%s %d %s %s %4.6f %4.6f %s A:%d C:%d G:%d T:%d %d ",
+                      context.getContig(),
+                      context.getPosition(),
+                      upRef,
+                      gt,
+                      btnb,
+                      btr,
+                      type,
+                      aCount,
+                      cCount,
+                      gCount,
+                      tCount,
+                      aCount + cCount + gCount + tCount));
+
+        out.print(dumpTheories(glAll));
+
+        out.print(String.format("%s %s ",
+                      priorString,
+                      rodString));
+
+        out.print(dumpTheories(glForward));
+        out.print(dumpTheories(glReverse));
+        out.print("\n");
+
+        return 1;
+    }
+
+    // Given result of map function
+    public Integer reduceInit() {
+        return 0;
+    }
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+
+    @Override
+    public void onTraversalDone(Integer result) {
+    }
+
+    protected static final double NOVEL_HET_PRIOR = 2*1E-4;
+    protected static final double NOVEL_OTHER_PRIOR = 1E-5;
+    protected static final double NOVEL_REF_PRIOR = 1d - 4*NOVEL_HET_PRIOR - 6*NOVEL_OTHER_PRIOR;
+
+    protected static final double KNOWN_HET_PRIOR = .05;
+    protected static final double KNOWN_HNREF_PRIOR = .05*.05;
+    protected static final double KNOWN_OTHER_PRIOR = 1E-5;
+    protected static final double KNOWN_REF_PRIOR = 1d - KNOWN_HET_PRIOR - KNOWN_HNREF_PRIOR - 8* KNOWN_OTHER_PRIOR;
+
+    protected static final double FREQ_OTHER_PRIOR = 1E-5;
+
+    public double[] getNovelSitePriors(final byte referenceBase) {
+        //TODO: convert everything to use DiploidGenotype
+        DiploidGenotype[] theories = DiploidGenotype.values();
+        double[] priors = new double[theories.length];
+
+        DiploidGenotype refTheory = DiploidGenotype.fromBases(referenceBase, referenceBase);
+
+        // use novel variant prior
+        // ref (e.g. AA) -> 1 - 4*(2*10^-4) - 6*(10^-5)
+        // het (e.g. AC, AG, AT) -> 2*10^-4 (AC, AG, AT,
+        // other (e.g. CC, CG, CT, GG, GT, TT) -> 10^-5
+        int i=0;
+        for (DiploidGenotype theory : theories) {
+            double prior = 0;
+            if (theory == refTheory) {
+                prior = NOVEL_REF_PRIOR;
+            } else if (theory.getAllele1() == referenceBase || theory.getAllele2() == referenceBase) {
+                prior = NOVEL_HET_PRIOR;
+            } else { // not reference, and not het with reference base
+                prior = NOVEL_OTHER_PRIOR;
+            }
+            priors[i++] = prior;
+        }
+
+        return priors;
+    }
+
+    public double[] getKnownSitePriors(final byte referenceBase,
+                                       final List<String> alleles) {
+
+        //TODO: convert everything to use DiploidGenotype
+        DiploidGenotype[] theories = DiploidGenotype.values();
+        double[] priors = new double[theories.length];
+
+        DiploidGenotype refTheory = DiploidGenotype.fromBases(referenceBase, referenceBase);
+
+        if (alleles != null && alleles.size() > 2 ) {
+            // FIXME: handle multi-allele case!
+            return getNovelSitePriors(referenceBase);
+        }
+
+
+        StringBuilder sb = new StringBuilder();
+        if (alleles != null ) {
+            for(String allele : alleles) { sb.append(allele); }
+        }
+        String allelesString = sb.toString();
+
+
+        DiploidGenotype knownAlleles = DiploidGenotype.fromBases(allelesString.getBytes());
+        byte altAllele = getAlternateAllele(knownAlleles, referenceBase);
+        DiploidGenotype hnref = DiploidGenotype.fromBases(altAllele, altAllele);
+
+
+        // do we not have a freq (in dbsnp, known alleles)
+        // AA -
+        // AG - .05
+        // GG - .05^2
+        // other - 10^-5
+        int i=0;
+        for (DiploidGenotype theory : theories) {
+            double prior = 0;
+            if (theory == refTheory) {
+                prior = KNOWN_REF_PRIOR;
+            } else if (theory == knownAlleles) {
+                prior = KNOWN_HET_PRIOR;
+            } else if (theory == hnref) {
+                prior = KNOWN_HNREF_PRIOR;
+            } else {
+                prior = NOVEL_OTHER_PRIOR;
+            }
+            priors[i++] = prior;
+        }
+
+        return priors;
+
+    }
+
+    public double[] getKnownSiteKnownFreqPriors(final byte referenceBase,
+                              final List<String> alleles,
+                              final Double altAllelefrequency) {
+
+        //TODO: convert everything to use DiploidGenotype
+        DiploidGenotype[] theories = DiploidGenotype.values();
+        double[] priors = new double[theories.length];
+
+        DiploidGenotype refTheory = DiploidGenotype.fromBases(referenceBase, referenceBase);
+
+        // if we're dealing with more than one allele here... make sure they all have no frequency data
+        if (alleles != null && alleles.size() > 2 ) {
+            throw new PicardException("Can't handle multiple known alleles with a single known frequency!");
+        }
+
+
+        StringBuilder sb = new StringBuilder();
+        if (alleles != null ) {
+            for(String allele : alleles) { sb.append(allele); }
+        }
+        String allelesString = sb.toString();
+
+
+//            System.out.println("FREQ");
+        // assert that we have just one set of frequencies...
+//FIXME:            if (aaf.size() > 1) { throw new PicardException("Attempting to call with multiple allele frequencies!"); }
+        DiploidGenotype knownAlleles = DiploidGenotype.fromBases(allelesString.getBytes());
+        byte altAllele = getAlternateAllele(knownAlleles, referenceBase);
+        DiploidGenotype hnref = DiploidGenotype.fromBases(altAllele, altAllele);
+
+        double q = altAllelefrequency;
+        double p = 1d - q;
+
+
+        // do we have a freq (in hapmap, known alleles)
+        // AA -
+        // AG -
+        // GG -
+        // other - 10^-5
+        int i=0;
+        for (DiploidGenotype theory : theories) {
+            double prior = 0;
+            if (theory == refTheory) {
+                prior = p*p;                            //TODO: should this really sum to 1?
+            } else if (theory == knownAlleles) {
+                prior = 2*p*q;
+            } else if (theory == hnref) {
+                prior = q*q;
+            } else {
+                prior = FREQ_OTHER_PRIOR;
+            }
+            priors[i++] = prior;
+        }
+
+
+        return priors;
+    }
+
+    protected byte getAlternateAllele(final DiploidGenotype gt, final byte refAllele) {
+        return (gt.getAllele1() == refAllele)?gt.getAllele2():gt.getAllele1();
+    }
+
+    protected String dumpTheories(GenotypeLikelihoods gl) {
+        StringBuilder sb = new StringBuilder();
+        for (int i = gl.genotypes.length-1; i >= 0; i--)
+        {
+            sb.append(String.format("%s:%4.2f ", gl.genotypes[i], gl.likelihoods[i]));
+        }
+        return sb.toString();
+    }
+
+
+}