Minor useful changes to BaseUtils and MathUtils to support a new haplotype score annotation that determines the two most likely haplotypes over an interval and scores variants by their consistency with a diploid model. Appears to be useful.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3085 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e9e53f68ab
commit
b8ab74a6dc
|
|
@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
|||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Useful class for storing different AlignmentContexts
|
||||
|
|
@ -53,9 +54,17 @@ public class StratifiedAlignmentContext {
|
|||
|
||||
// the locus that every stored context shares
private GenomeLoc loc;
// lazily-built AlignmentContext for each stratification, indexed by StratifiedContextType.ordinal()
private AlignmentContext[] contexts = new AlignmentContext[StratifiedContextType.values().length];

// todo -- why are you storing reads separately each time? There's a ReadBackedPileup object that's supposed to handle this
// per-stratification reads and their in-read offsets, parallel arrays indexed by StratifiedContextType.ordinal()
private ArrayList<SAMRecord>[] reads = new ArrayList[StratifiedContextType.values().length];
private ArrayList<Integer>[] offsets = new ArrayList[StratifiedContextType.values().length];

//
// accessors
//
public GenomeLoc getLocation() { return loc; }
public ArrayList<SAMRecord> getReads(StratifiedContextType type) { return reads[type.ordinal()]; }
public ArrayList<Integer> getOffsets(StratifiedContextType type) { return offsets[type.ordinal()]; }
|
||||
|
||||
public StratifiedAlignmentContext(GenomeLoc loc) {
|
||||
this.loc = loc;
|
||||
|
|
@ -65,10 +74,10 @@ public class StratifiedAlignmentContext {
|
|||
}
|
||||
}
|
||||
|
||||
public AlignmentContext getContext(StratifiedContextType context) {
|
||||
int index = context.ordinal();
|
||||
public AlignmentContext getContext(StratifiedContextType type) {
|
||||
int index = type.ordinal();
|
||||
if ( contexts[index] == null )
|
||||
contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, reads[index], offsets[index]));
|
||||
contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, getReads(type), getOffsets(type)));
|
||||
return contexts[index];
|
||||
}
|
||||
|
||||
|
|
@ -214,4 +223,20 @@ public class StratifiedAlignmentContext {
|
|||
|
||||
return contexts;
|
||||
}
|
||||
|
||||
/**
 * Merges the reads and offsets of the given stratified contexts (for one stratification
 * type) into a single combined AlignmentContext.
 *
 * NOTE(review): the location used is that of the LAST context iterated; presumably all
 * contexts passed in share the same locus -- confirm with callers.
 *
 * @param contexts the stratified contexts to join; must contain at least one element
 * @param type     which stratification (e.g. COMPLETE) to pull reads/offsets from
 * @return a new AlignmentContext backed by the combined pileup
 * @throws StingException if contexts is empty (no location can be determined)
 */
public static AlignmentContext joinContexts(Collection<StratifiedAlignmentContext> contexts, StratifiedContextType type) {
    ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
    ArrayList<Integer> offsets = new ArrayList<Integer>();

    GenomeLoc loc = null;
    for ( StratifiedAlignmentContext context : contexts ) {
        loc = context.getLocation(); // last one wins; assumes all contexts sit at the same locus
        reads.addAll(context.getReads(type));
        offsets.addAll(context.getOffsets(type));
    }

    if ( loc == null )
        throw new StingException("BUG: joinContexts requires at least one context to join");
    return new AlignmentContext(loc, new ReadBackedPileup(loc, reads, offsets));
}
|
||||
}
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
||||
/**
 * A work-in-progress INFO-field annotation that scores how consistent the reads at a
 * biallelic SNP site are with a simple two-haplotype model: the local reference
 * sequence versus a consensus built from the reads carrying the alternate allele.
 *
 * NOTE(review): contains live System.out debugging output and a commented-out
 * System.exit(1); presumably intentional while under development -- confirm before
 * enabling this annotation in production runs.
 */
public class HaplotypeScore implements InfoFieldAnnotation, WorkInProgressAnnotation {

    /**
     * Computes the haplotype score for this site.
     *
     * @param tracker            reference metadata tracker (unused here)
     * @param ref                reference context; its window supplies the local sequence
     * @param stratifiedContexts per-sample stratified alignment contexts, joined into one pileup
     * @param vc                 the variant being annotated
     * @return a singleton map of {getKeyName() -> formatted score}, or null for
     *         non-biallelic or non-SNP sites
     */
    public Map<String, Object> annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
        // only defined for biallelic SNPs
        if ( !vc.isBiallelic() || !vc.isSNP() )
            return null;

        // merge all samples' reads into a single pileup over this site
        AlignmentContext context = StratifiedAlignmentContext.joinContexts(stratifiedContexts.values(), StratifiedAlignmentContext.StratifiedContextType.COMPLETE);

        // use up to 10 bases of flanking context on each side, limited by the reference window
        int contextWingSize = Math.min(((int)ref.getWindow().size() - 1)/2, 10);
        int contextSize = contextWingSize * 2 + 1;

        // carve the reference haplotype out of the window, centered on the variant position
        int refMiddle = (int)(ref.getWindow().size() - 1) / 2;
        int refStart = refMiddle - contextWingSize;
        int refStop = refMiddle + contextWingSize + 1;
        String refString = new String(ref.getBases()).substring(refStart, refStop);
        Consensus refConsensus = new Consensus(refString.getBytes(), 60); // fixed Q60 for reference bases

        // build the alternate haplotype as a consensus over reads carrying the alt base
        ReadBackedPileup altPileup = getPileupOfAllele(vc.getAlternateAllele(0), context.getBasePileup());
        Consensus altConsensus = new Consensus(altPileup, contextSize);

        //System.exit(1);
        // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes
        double score = scoreReadsAgainstHaplotypes(Arrays.asList(refConsensus, altConsensus), context.getBasePileup(), contextSize);

        // return the score
        Map<String, Object> map = new HashMap<String, Object>();
        map.put(getKeyName(), String.format("%.2f", score));
        return map;
    }

    // calculate the haplotype scores by walking over all reads and comparing them to the haplotypes;
    // each read contributes the score of whichever haplotype it matches best (lowest score)
    private double scoreReadsAgainstHaplotypes(List<Consensus> haplotypes, ReadBackedPileup pileup, int contextSize) {
        System.out.printf("REF: %s%n", haplotypes.get(0)); // debug output
        System.out.printf("ALT: %s%n", haplotypes.get(1)); // debug output

        // per-read, per-haplotype mismatch scores
        double[][] haplotypeScores = new double[pileup.size()][haplotypes.size()];
        for ( ExtendedPileupElement p : pileup.extendedForeachIterator() ) {
            SAMRecord read = p.getRead();
            int readOffsetFromPileup = p.getOffset();

            System.out.printf("--------------------------------------------- Read %s%n", read.getReadName());
            double m = 10000000; // sentinel larger than any realistic score
            for ( int i = 0; i < haplotypes.size(); i++ ) {
                Consensus haplotype = haplotypes.get(i);
                // align the haplotype window so its center falls on this read's pileup base
                int start = readOffsetFromPileup - (contextSize - 1)/2;
                double score = scoreReadAgainstHaplotype(read, start, contextSize, haplotype);
                haplotypeScores[p.getPileupOffset()][i] = score;
                System.out.printf(" vs. haplotype %d = %f%n", i, score);
                m = Math.min(score, m);
            }
            System.out.printf(" => best score was %f%n", m);
        }

        // sum, over reads, the best (lowest) per-haplotype score
        double overallScore = 0.0;
        for ( double[] readHaplotypeScores : haplotypeScores ) {
            overallScore += MathUtils.arrayMin(readHaplotypeScores);
        }

        return overallScore;
    }

    /**
     * Scores one read against one haplotype over the context window.  Lower is better:
     * each mismatching base adds its raw base quality to the score.
     *
     * NOTE(review): the quality is summed directly (not converted to log10 space), so
     * this is a Phred-scaled mismatch penalty rather than a true log10 likelihood, and
     * log10LExpected is currently unused -- presumably placeholders while WIP; confirm.
     *
     * @param read            the read to score
     * @param baseOffsetStart read offset corresponding to the first haplotype base (may be negative)
     * @param contextSize     number of haplotype bases to compare
     * @param haplotype       the haplotype to compare against
     * @return the accumulated mismatch penalty
     */
    private double scoreReadAgainstHaplotype(SAMRecord read, int baseOffsetStart, int contextSize, Consensus haplotype ) {
        double log10LExpected = 0.0;
        double log10LMismatches = 0.0;

        for ( int i = 0; i < contextSize; i++ ) {
            int baseOffset = i + baseOffsetStart;
            if ( baseOffset < 0 )   // haplotype window starts before the read; skip this position
                continue;
            if ( baseOffset >= read.getReadLength() )   // ran off the end of the read; done
                break;

            byte haplotypeBase = haplotype.bases[i];
            byte readBase = read.getReadBases()[baseOffset];
            if ( ! BaseUtils.basesAreEqual(readBase, haplotypeBase ) ) {
                log10LMismatches += read.getBaseQualities()[baseOffset];
            }

            //System.out.printf("Read %s: scoring %c vs. %c => %f%n", read.getReadName(), (char)haplotypeBase, (char)readBase, log10LMismatches);
        }

        return log10LMismatches; // - log10LExpected;
    }


    // uniform log10 prior over the four bases, used when calling the consensus
    private static final double[] FLAT_BASE_PRIORS = new double[BaseUtils.Base.values().length];
    static {
        for ( int i = 0; i < BaseUtils.Base.values().length; i++ )
            FLAT_BASE_PRIORS[i] = Math.log10(1.0 / BaseUtils.Base.values().length);
    }

    /**
     * A simple haplotype: a base sequence plus per-base qualities, either supplied
     * directly or called as a consensus from a pileup.
     */
    private class Consensus {
        byte[] bases = null;
        byte[] quals = null;

        /**
         * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
         *
         * @param bases the haplotype bases
         * @param qual  the quality assigned to every base
         */
        Consensus(byte[] bases, int qual) {
            this.bases = bases;
            quals = new byte[bases.length];
            Arrays.fill(quals, (byte)qual);
        }

        /**
         * Call a consensus of length contextSize centered on the given pileup's locus.
         *
         * @param pileup      the reads to build the consensus from
         * @param contextSize total window length (center base plus wings)
         */
        Consensus(ReadBackedPileup pileup, int contextSize ) {
            this.bases = new byte[contextSize];
            this.quals = new byte[contextSize];
            calculateConsensusOverWindow(pileup, contextSize, (contextSize - 1) / 2);
        }

        // Calls the consensus base/quality at each window position by shifting the pileup
        // left/right of its locus; pileupOffset is the window position of the pileup's own locus.
        private void calculateConsensusOverWindow(ReadBackedPileup pileup, int contextSize, int pileupOffset) {
            // for each context position
            for ( int i = 0; i < contextSize; i++ ) {
                int offsetFromPileup = i - pileupOffset;
                ReadBackedPileup offsetPileup = pileupAtOffset(pileup, offsetFromPileup);
                System.out.printf("pileup is %s%n", offsetPileup); // debug output
                BaseQual bq = calcConsensusAtLocus(offsetPileup, FLAT_BASE_PRIORS);
                this.bases[i] = bq.getBase();
                this.quals[i] = bq.getQual();
                System.out.printf(" At %d: offset %d bq = %c / %d%n", i, offsetFromPileup, (char)bq.getBase(), bq.getQual()); // debug output
            }
        }

        /**
         * Picks the most likely base at a locus under a simple independent-errors model:
         * each pileup observation supports its own base with probability qualToProb(q)
         * and every other base with probability 1 - qualToProb(q).
         *
         * @param pileup      the observed bases/quals at this locus
         * @param log10priors log10 prior per base, indexed by BaseUtils.Base index
         * @return the most likely base paired with its posterior-derived quality
         */
        private BaseQual calcConsensusAtLocus( ReadBackedPileup pileup, double[] log10priors ) {
            double[] log10BaseLikelihoods = new double[BaseUtils.Base.values().length];

            // loop over a, c, g, t and determine the most likely hypothesis
            for ( BaseUtils.Base base : BaseUtils.Base.values() ) {
                double log10L = log10priors[base.getIndex()];

                for ( PileupElement p : pileup ) {
                    double baseP = QualityUtils.qualToProb(p.getQual());
                    // probability of this observation if the true base were 'base'
                    double L = base.sameBase(p.getBase()) ? baseP : 1 - baseP;
                    log10L += Math.log10(L);
                }

                log10BaseLikelihoods[base.getIndex()] = log10L;
            }

            double[] posteriors = MathUtils.normalizeFromLog10(log10BaseLikelihoods, false);
            int mostLikelyIndex = MathUtils.maxElementIndex(posteriors);
            byte mostLikelyBase = BaseUtils.Base.values()[mostLikelyIndex].getBase(); // get the most likely option
            // floor on the error probability passed to probToQual; caps the reported quality
            double MAX_CONSENSUS_QUALITY = 0.000001;
            byte qual = QualityUtils.probToQual(posteriors[mostLikelyIndex],MAX_CONSENSUS_QUALITY); // call posterior calculator here over L over bases
            return new BaseQual(mostLikelyBase, qual);
        }

        // renders just the base sequence, e.g. for the debug printfs above
        public String toString() { return new String(this.bases); }
    }

    // a (base, quality) pair with readable accessors
    private class BaseQual extends Pair<Byte, Byte> {
        public BaseQual(byte base, byte qual) {
            super(base, qual);
        }

        public byte getBase() { return getFirst(); }
        public byte getQual() { return getSecond(); }
    }

    /**
     * Shifts a pileup horizontally: for each read, keeps the base at
     * (its pileup offset + offsetFromPileup), dropping reads where that
     * offset falls outside the read.
     */
    private static ReadBackedPileup pileupAtOffset(ReadBackedPileup pileup, int offsetFromPileup) {
        ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
        ArrayList<Integer> offsets = new ArrayList<Integer>();

        // go through the pileup, read by read, collecting up the context at offset
        for ( PileupElement p : pileup ) {
            // not safe -- doesn't work for indel-containing reads!

            // whole algorithm should be restructured to handle other reads in the window or use LocusIteratorByState
            SAMRecord read = p.getRead();
            int readOffsetInPileup = p.getOffset();
            int neededReadOffset = readOffsetInPileup + offsetFromPileup;
            if ( neededReadOffset >= 0 && neededReadOffset < read.getReadLength() ) {
                reads.add(p.getRead());
                offsets.add(neededReadOffset);
            }
        }

        return new ReadBackedPileup(pileup.getLocation(), reads, offsets);
    }

    /**
     * Filters a pileup down to the elements whose base matches the given allele's
     * first base (assumes a single-base, i.e. SNP, allele).
     */
    private static ReadBackedPileup getPileupOfAllele( Allele allele, ReadBackedPileup pileup ) {
        ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
        byte alleleBase = allele.getBases()[0]; // assumes SNP

        for ( PileupElement p : pileup ) {
            if ( p.getBase() == alleleBase ) {
                filteredPileup.add(p);
            }
        }

        return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
    }

    /** @return the INFO field key, "HaplotypeScore" */
    public String getKeyName() { return "HaplotypeScore"; }
    /** @return the VCF header line describing this annotation */
    public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("HaplotypeScore", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Consistency of the site with two (and only two) segregating haplotypes"); }
}
|
||||
|
|
@ -51,26 +51,26 @@ public class QualByDepth implements InfoFieldAnnotation, StandardAnnotation {
|
|||
return depth;
|
||||
}
|
||||
|
||||
/**
 * Computes the mean of (genotype confidence / pileup depth) over all variant
 * (non-hom-ref) genotypes that have a stratified context.
 *
 * NOTE(review): if no genotype qualifies, qualsByDepth is empty and the final
 * division yields NaN (0.0 / 0) -- confirm callers tolerate that.
 *
 * @param genotypes          per-sample genotype calls
 * @param stratifiedContexts per-sample alignment contexts, keyed by sample name
 * @return the mean confidence-per-depth across variant genotypes
 */
private double genotypeQualByDepth(final Map<String, Genotype> genotypes, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
    ArrayList<Double> qualsByDepth = new ArrayList<Double>();
    for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {

        // we care only about variant calls
        if ( genotype.getValue().isHomRef() )
            continue;

        // samples without pileup data are skipped entirely
        StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
        if ( context == null )
            continue;

        qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
    }

    // arithmetic mean of the collected per-sample values
    double mean = 0.0;
    for ( Double value : qualsByDepth )
        mean += value;
    mean /= qualsByDepth.size();

    return mean;
}
|
||||
// private double genotypeQualByDepth(final Map<String, Genotype> genotypes, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
|
||||
// ArrayList<Double> qualsByDepth = new ArrayList<Double>();
|
||||
// for ( Map.Entry<String, Genotype> genotype : genotypes.entrySet() ) {
|
||||
//
|
||||
// // we care only about variant calls
|
||||
// if ( genotype.getValue().isHomRef() )
|
||||
// continue;
|
||||
//
|
||||
// StratifiedAlignmentContext context = stratifiedContexts.get(genotype.getKey());
|
||||
// if ( context == null )
|
||||
// continue;
|
||||
//
|
||||
// qualsByDepth.add(genotype.getValue().getNegLog10PError() / context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
|
||||
// }
|
||||
//
|
||||
// double mean = 0.0;
|
||||
// for ( Double value : qualsByDepth )
|
||||
// mean += value;
|
||||
// mean /= qualsByDepth.size();
|
||||
//
|
||||
// return mean;
|
||||
// }
|
||||
}
|
||||
|
|
@ -10,6 +10,30 @@ import java.util.Random;
|
|||
public class BaseUtils {
|
||||
// the four canonical DNA bases, in index order (A=0, C=1, G=2, T=3)
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
// canonical bases plus 'N' and 'D' -- NOTE(review): this ordering (N at 4, D at 5)
// disagrees with the DELETION_INDEX=4 / NO_CALL_INDEX=5 constants below; confirm
// which ordering downstream code relies on.
public final static char[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
|
||||
|
||||
/**
 * The four canonical DNA bases, each pairing its byte representation with a stable
 * array index (A=0, C=1, G=2, T=3) for use in likelihood/prior arrays.
 */
public enum Base {
    A ( 'A', 0 ),
    C ( 'C', 1 ),
    G ( 'G', 2 ),
    T ( 'T', 3 );

    // per-constant state; final because enum constants are immutable by design
    final byte b;
    final int index;

    private Base(char base, int index) {
        this.b = (byte)base;
        this.index = index;
    }

    /** @return the base as a byte (its ASCII code) */
    public byte getBase() { return b; }
    /** @return the base as a char, e.g. 'A' */
    public char getBaseAsChar() { return (char)b; }
    /** @return the stable array index of this base (A=0 .. T=3) */
    public int getIndex() { return index; }

    /** @return true if o is the byte encoding of this base (case-sensitive) */
    public boolean sameBase(byte o) { return b == o; }
    /** @return true if o is the character of this base (case-sensitive) */
    public boolean sameBase(char o) { return b == (byte)o; }
    /** @return true if i is this base's index */
    public boolean sameBase(int i) { return index == i; }
}
|
||||
|
||||
|
||||
// todo -- fix me (enums?)
// NOTE(review): these indices disagree with EXTENDED_BASES above, where 'N' sits at
// index 4 and 'D' at index 5 -- confirm which ordering downstream code relies on.
public static final byte DELETION_INDEX = 4;
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
||||
|
|
|
|||
|
|
@ -308,4 +308,36 @@ public class MathUtils {
|
|||
/**
 * Convenience overload: normalizes an array of log10 values by delegating to the
 * two-argument form with the boolean flag set to false.
 *
 * NOTE(review): the overload's boolean parameter is defined elsewhere and not visible
 * here; presumably it controls whether the output stays in log10 space -- confirm in
 * the two-argument definition.
 */
public static double[] normalizeFromLog10(double[] array) {
    return normalizeFromLog10(array, false);
}
|
||||
|
||||
public static int maxElementIndex(double[] array) {
|
||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int maxI = -1;
|
||||
for ( int i = 0; i < array.length; i++ ) {
|
||||
if ( maxI == -1 || array[i] > array[maxI] )
|
||||
maxI = i;
|
||||
}
|
||||
|
||||
return maxI;
|
||||
}
|
||||
|
||||
/**
 * Returns the largest value in array.
 *
 * NOTE(review): an empty array makes maxElementIndex return -1, so this throws
 * ArrayIndexOutOfBoundsException rather than a clearer error -- confirm callers
 * never pass an empty array.
 */
public static double arrayMax(double[] array) {
    return array[maxElementIndex(array)];
}
|
||||
|
||||
/**
 * Returns the smallest value in array.
 *
 * NOTE(review): an empty array makes minElementIndex return -1, so this throws
 * ArrayIndexOutOfBoundsException rather than a clearer error -- confirm callers
 * never pass an empty array.
 */
public static double arrayMin(double[] array) {
    return array[minElementIndex(array)];
}
|
||||
|
||||
public static int minElementIndex(double[] array) {
|
||||
if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int minI = -1;
|
||||
for ( int i = 0; i < array.length; i++ ) {
|
||||
if ( minI == -1 || array[i] < array[minI] )
|
||||
minI = i;
|
||||
}
|
||||
|
||||
return minI;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -507,4 +507,16 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
|
|||
// Renders this pileup's quality scores as a display string via quals2String.
private String getQualsString() {
    return quals2String(getQuals());
}
|
||||
|
||||
/**
 * Renders this pileup as its location, a ": " separator, and then the base of each
 * pileup element in iteration order -- e.g. "chr1:100: ACCTA" -- useful for debugging.
 */
public String toString() {
    StringBuilder s = new StringBuilder();
    s.append(getLocation());
    s.append(": ");

    // one character per pileup element, in iteration order
    for ( PileupElement p : this ) {
        s.append((char)p.getBase());
    }

    return s.toString();
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue