AlleleBalance and on/off primary base filters -- version 0.0.1 -- for experimental use only

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1294 348d0f76-0448-11de-a6fe-93d51630548a
2009-07-22 17:54:44 +00:00 · 2009-07-22 17:54:44 +00:00 · 9c12c02768
parent 00f9bcd6d1
commit 9c12c02768
9 changed files with 480 additions and 64 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java
@ -110,6 +110,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
        attributes = new HashMap<String, String>();
    }

+    public TabularROD(final String name, ArrayList<String> header) {
+        this(name);
+        this.header = header;
+    }
+
    /**
     * Make a new TabularROD with name, using header columns header, at loc, without any bound data.  Data
     * must be bound to each corresponding header[i] field before the object is really usable.
@ -155,7 +160,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
     * @return
     */
    public Object initialize(final File source) throws FileNotFoundException {
-        List<String> header = null;
+        return readHeader(source);
+    }
+
+    public static ArrayList<String> readHeader(final File source) throws FileNotFoundException {
+        ArrayList<String> header = null;
        int linesLookedAt = 0;
        xReadLines reader = new xReadLines(source);

@ -186,9 +195,16 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
                header.add(Integer.toString(i));
        }

+        try {
+            reader.close();
+        } catch ( IOException e ) {
+            throw new RuntimeException(e);
+        }
+
        return header;
    }

+
    // ----------------------------------------------------------------------
    //
    // ROD accessors
--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodVariants.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodVariants.java
@ -50,6 +50,8 @@ public class rodVariants extends BasicReferenceOrderedDatum implements AllelicVa
    public String delimiterRegex() { return "\\s+"; }

    public boolean parseLine(Object header, String[] parts) throws IOException {
+        if ( parts.length < 18 )
+            throw new IOException("Invalid rodVariant row found -- too few elements.  Expected 18+, got " + parts.length);
        if (!parts[0].startsWith("#")) {
            loc = GenomeLocParser.createGenomeLoc(parts[0], Long.valueOf(parts[1]));
            refBase = parts[2].charAt(0);
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleMetricsWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleMetricsWalker.java
@ -1,27 +0,0 @@
-package org.broadinstitute.sting.playground.gatk.walkers;
-
-import org.broadinstitute.sting.utils.cmdLine.Argument;
-import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
-
-import java.util.List;
-
-/**
- * Created by IntelliJ IDEA.
- * User: andrewk
- * Date: Apr 2, 2009
- * Time: 9:01:44 PM
- * To change this template use File | Settings | File Templates.
- */
-public class AlleleMetricsWalker {
-    // Class that will walk over various metrics in a reference ordered way
-    // This class walks over the genome in reference order and calls AlleleMetrics on each class
-    // Hapmap and dbSNP tracks are taken from the command line
-    // At first pass, this will at least be able to walk over a GFF file and compare to the hapmap and dbsnp
-    // tracks specified on the command line and handed in via the LocusContext
-
-    public void map(List<AllelicVariant> avdata) {
-
-    }
-    
-
-}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java
@ -20,55 +20,29 @@ import java.util.List;
 *
 */
 public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
-    int N_TRANSITION_TRANVERSION_BINS = 100;
-    Histogram<Integer> transitions;
-    Histogram<Integer> transversions;
+    long nTransitions = 0, nTransversions = 0;

    public TransitionTranversionAnalysis() {
        super("transitions_transversions");
-        transitions   = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
-        transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
    }

    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        if ( eval != null && eval.isSNP() ) {
            char refBase = eval.getRefSnpFWD();
            char altBase = eval.getAltSnpFWD();
-            //System.out.printf("%c %c%n", refBase, altBase);
-            //int i = transition_transversion_bin(dbsnp.getHeterozygosity());
-            //System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
-            //EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];

            BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
-            Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
-            double het = eval.getHeterozygosity();
-            h.setBin(het, h.getBin(het) + 1);
-            //int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
-            //int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
-            //System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
+            if ( subType == BaseUtils.BaseSubstitutionType.TRANSITION )
+                nTransitions++;
+            else
+                nTransversions++;
        }

        return null;
    }

    public List<String> done() {
-        int nTransitions = 0;
-        int nTransversions = 0;
-
        List<String> s = new ArrayList<String>();
-        for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
-            //double avHet = Math.pow(10, transitions.bin2x(i));
-            double avHet = transitions.bin2x(i);
-            if ( avHet > 0.5 ) break;
-
-            int sit = transitions.getBin(i);
-            int ver = transversions.getBin(i);
-            nTransitions += sit;
-            nTransversions += ver;
-            double ratio = (float)sit/ver;
-            //s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio));
-        }
-
        s.add(String.format("transitions    %d", nTransitions));
        s.add(String.format("transversions  %d", nTransversions));
        s.add(String.format("ratio          %.2f", nTransitions / (1.0 * nTransversions)));
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java
@ -0,0 +1,317 @@
+package org.broadinstitute.sting.playground.gatk.walkers.variants;
+
+import org.broadinstitute.sting.gatk.LocusContext;
+import org.broadinstitute.sting.gatk.refdata.rodVariants;
+import org.broadinstitute.sting.gatk.refdata.TabularROD;
+import org.broadinstitute.sting.utils.*;
+import org.apache.log4j.Logger;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Set;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import edu.mit.broad.picard.util.MathUtil;
+
+class GenotypeFeatureData extends ArrayList<Double> {
+    public enum Tail { LeftTailed, RightTailed, TwoTailed }
+
+    private String genotype = null;
+    private Integer[] permutation = null;
+    private Logger logger = null;
+
+    private Tail tail = null;
+
+    double lowThreshold = -1;
+    double highThreshold = -1;
+
+    public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) {
+        super();
+        this.genotype = genotype;
+        this.logger = logger;
+        this.tail = tail;
+    }
+
+    public Tail getTail() {
+        return tail;
+    }
+
+    public double getLowThreshold() {
+        return lowThreshold;
+    }
+
+    public void setLowThreshold(double lowThreshold) {
+        this.lowThreshold = lowThreshold;
+    }
+
+    public double getHighThreshold() {
+        return highThreshold;
+    }
+
+    public void setHighThreshold(double highThreshold) {
+        this.highThreshold = highThreshold;
+    }
+
+    public void finalizeData() {
+        permutation = Utils.SortPermutation(this);
+    }
+
+    public void determineThresholds(final double pvalueLimit) {
+        if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 )
+            throw new RuntimeException(String.format("Invalid pValue limit = %f", pvalueLimit));
+
+        switch ( tail ) {
+            case LeftTailed:
+                lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit);
+                break;
+            case RightTailed:
+                lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit);
+                break;
+            case TwoTailed:
+                lowThreshold = determineOneTailThreshold(pvalueLimit/2);
+                highThreshold = determineOneTailThreshold(1-pvalueLimit/2);
+                break;
+        }
+
+        //logger.info(String.format("  %d of %d elements are >= threshold = %f (%f percent)",
+        //        nElementsAboveThreshold(), nElements(), getHighThreshold(), nElementsAboveThreshold() / ( 1.0 * nElements())));
+        //logger.info(String.format("  %d of %d elements are <= threshold = %f (%f percent)",
+        //        nElementsBelowThreshold(), nElements(), getLowThreshold(), nElementsBelowThreshold() / ( 1.0 * nElements())));
+    }
+
+    private double determineOneTailThreshold(final double pvalueLimit) {
+        double trueSortedIndexLimit = pvalueLimit * nElements();
+        int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit));
+        double threshold = get(itemIndex(sortedIndexLimit));
+
+        logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold));
+
+        return threshold;
+    }
+
+    public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); }
+    public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); }
+
+    public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) {
+        int count = 0;
+
+        for ( double stat : this ) {
+            if ( passesThreshold(stat, keepBelow, keepAbove) ) count++;
+        }
+
+        return count;
+    }
+
+    public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) {
+        //System.out.printf("passThreshold %s, %b, %b = %b%n", value, keepBelow, keepAbove, value < threshold && keepBelow || value >= threshold && keepAbove);
+        return value <= getHighThreshold() && keepBelow || value >= getLowThreshold() && keepAbove;
+    }
+
+    public boolean passesThreshold(double value) {
+        switch ( tail ) {
+            case LeftTailed:
+                return value >= getLowThreshold();
+            case RightTailed:
+                return value <= getHighThreshold();
+            case TwoTailed:
+                return value >= getLowThreshold() && value < getHighThreshold();
+            default:
+                return true;
+        }
+    }
+
+    public double pValue(double value) {
+        return 1.0;
+    }
+
+    public int itemIndex(int sortedIndex) {
+        return permutation[sortedIndex];
+    }
+
+    public int nElements() { return size(); }
+
+    public String toString() {
+        return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold);
+    }
+}
+
+
+class ObservationTable extends HashMap<String, GenotypeFeatureData> {
+    String statField = null;
+    File observationsFile = null;
+    Logger logger = null;
+    GenotypeFeatureData.Tail tail;
+
+    public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, FileNotFoundException, IOException {
+        this.observationsFile = observationsFile;
+        this.statField = statField;
+        this.logger = logger;
+        this.tail = tail;
+        readAllData(observationsFile, statField);
+    }
+
+    public Set<String> genotypes() {
+        return keySet();
+    }
+
+    public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException {
+        ArrayList<String> header = TabularROD.readHeader(observationsFile);
+
+        //logger.info(String.format("Starting to read table %s", observationsFile));
+
+        for ( String line : new xReadLines(observationsFile) ) {
+            TabularROD d = new TabularROD("ignoreMe", header);
+            String[] parts = line.split("\\s+");
+            if ( d.parseLine(header, parts) ) {
+                if (! d.containsKey(statField)) {
+                    throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, line));
+                }
+                double stat = Double.valueOf(d.get(statField));
+
+                final String genotype = d.get("genotype");
+                GenotypeFeatureData gfd = getData(genotype);
+                gfd.add(stat);
+            }
+        }
+
+        for ( GenotypeFeatureData gfd : this.values() ) {
+            gfd.finalizeData();
+        }
+
+        //logger.info(String.format("Read table %s, found %d genotypes/data pairs in %s field",
+        //        observationsFile, size(), statField));
+    }
+
+    public GenotypeFeatureData getData(final String genotype) {
+        if ( ! containsKey(genotype) ) {
+            GenotypeFeatureData gfd = new GenotypeFeatureData(genotype, logger, tail);
+            put(genotype, gfd);
+        }
+
+        return get(genotype);
+    }
+
+    public String toString() {
+        return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size());
+    }
+}
+
+public abstract class RatioFilter implements VariantExclusionCriterion {
+    protected double pvalueLimit = -1;
+    protected File observationsFile = null;
+    protected String statField = null; // "AlleleRatio";
+    protected Logger logger = null; // Logger.getLogger(RatioFilter.class);
+    protected ObservationTable dataTable = null;
+    protected String name = null;
+    protected GenotypeFeatureData.Tail tail = null;
+
+
+    /**
+     * A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption
+     * that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST.  To be
+     * replaced by the more robust sampling approach outlined below
+     */
+    final private static int MIN_COUNTS_TO_APPLY_TEST = 20;
+
+    final static boolean integrateOverSamplingProbabilities = true;
+
+    public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) {
+        this.name = name;
+        this.statField = statField;
+        this.tail = tail;
+        logger = Logger.getLogger(myClass);
+    }
+
+    public void initialize(String arguments) {
+        if (arguments != null && !arguments.isEmpty()) {
+            String[] argPieces = arguments.split(",");
+            try {
+                pvalueLimit = Double.valueOf(argPieces[0]);
+                observationsFile = new File(argPieces[1]);
+
+                dataTable = new ObservationTable(observationsFile, statField, logger, tail);
+
+                logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable));
+                for ( String genotype : dataTable.genotypes() ) {
+                    GenotypeFeatureData gfd = dataTable.get(genotype);
+                    gfd.determineThresholds(pvalueLimit);
+                    logger.info(gfd.toString());
+                }
+            } catch ( NumberFormatException e ) {
+                throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e);
+            } catch ( FileNotFoundException e ) {
+                throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e);
+            } catch ( NoSuchFieldException e ) {
+                throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e);
+            } catch ( IOException e ) {
+                throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e );
+            }
+        }
+    }
+
+    protected abstract boolean applyToVariant(rodVariants variant);
+    protected abstract Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant);
+
+    public boolean exclude(char ref, LocusContext context, rodVariants variant) {
+        boolean exclude = false;
+
+        //
+        // todo -- need to calculate a significance value for the chance that the
+        // todo -- observed first and second counts are reliably not passing the threshold
+        // todo -- basically, we need to sample with replacement from all first / second pairs
+        // todo -- and only conclude that the variant fails the test if the probability of failing
+        // todo -- across all samples is greater than some P value, like 0.05
+        //
+        ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
+        if ( applyToVariant(variant) ) {
+            Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant);
+            GenotypeFeatureData gfd = dataTable.get(variant.getBestGenotype());
+
+            if (integrateOverSamplingProbabilities)
+                exclude = integralExclude(gfd, counts);
+            else
+                exclude = pointEstimateExclude(gfd, counts);
+
+            //
+            // for printing only
+            //
+            int n = counts.first + counts.second;
+            double value = counts.first / (1.0 * counts.first + counts.second);
+            logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s",
+                    name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n, 
+                    value, gfd, exclude, pileup.getBases()));
+        }
+
+        return exclude;
+    }
+
+    private final static double SEARCH_INCREMENT = 0.01;
+    private final static double integralPValueThreshold = 0.05;
+
+    private boolean integralExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
+        double sumExclude = 0.0, sumP = 0.0;
+        int n = counts.first + counts.second;
+        for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) {
+            double p = MathUtils.binomialProbability(counts.first, n, r);
+            sumP += p;
+            boolean exclude = ! gfd.passesThreshold(r);
+            sumExclude += p * (exclude ? 1.0 : 0.0);
+
+            //System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n",
+            //        counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP);
+        }
+
+        double percentExcluded = sumExclude / sumP;
+        return 1 - percentExcluded <= integralPValueThreshold ;
+    }
+
+    private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
+        int n = counts.first + counts.second;
+        if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false;
+        double value = counts.first / (1.0 * counts.first + counts.second);
+        //double value = counts.first / (1.0 * Math.max(counts.second, 1.0));
+        return ! gfd.passesThreshold(value);
+    }
+}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java
@ -0,0 +1,54 @@
+package org.broadinstitute.sting.playground.gatk.walkers.variants;
+
+import org.broadinstitute.sting.gatk.refdata.rodVariants;
+import org.broadinstitute.sting.utils.ReadBackedPileup;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.Pair;
+
+public class VECAlleleBalance extends RatioFilter {
+    final private static String statField = "AlleleRatio";
+    final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.TwoTailed;
+
+
+    public VECAlleleBalance() {
+        super("AlleleBalance", statField, VECAlleleBalance.class, tail);
+    }
+
+    /**
+     * We can only filter out het calls with the AlleleBalance filter
+     *
+     * @param variant
+     * @return
+     */
+    protected boolean applyToVariant(rodVariants variant) {
+        String genotype = variant.getBestGenotype();
+        return genotype.charAt(0) != genotype.charAt(1);
+    }
+
+    /**
+     * Return the count of bases matching the major (first) and minor (second) alleles as a pair.
+     *
+     * @param ref
+     * @param pileup
+     * @param variant
+     * @return
+     */
+    protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
+        final String genotype = variant.getBestGenotype();
+        final String bases = pileup.getBases();
+
+        if ( genotype.length() > 2 )
+            throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
+
+
+        char a = genotype.toUpperCase().charAt(0);
+        char b = genotype.toUpperCase().charAt(1);
+        int aCount = Utils.countOccurrences(a, bases.toUpperCase());
+        int bCount = Utils.countOccurrences(b, bases.toUpperCase());
+
+        int major = Math.max(aCount, bCount);
+        int minor = Math.min(aCount, bCount);
+
+        return new Pair(major, minor);
+    }
+}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java
@ -0,0 +1,54 @@
+package org.broadinstitute.sting.playground.gatk.walkers.variants;
+
+import org.broadinstitute.sting.gatk.refdata.rodVariants;
+import org.broadinstitute.sting.utils.*;
+
+public class VECOnOffGenotypeRatio extends RatioFilter {
+    final private static String statField = "onOffRatio";
+    final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed;
+
+    public VECOnOffGenotypeRatio() {
+        super("On/Off Genotype Ratio", statField, VECOnOffGenotypeRatio.class, tail);
+    }
+
+    /**
+     * On/off genotype filters can be applied to any genotype
+     *
+     * @param variant
+     * @return
+     */
+    protected boolean applyToVariant(rodVariants variant) {
+        return true;
+    }
+
+    /**
+     * Return the counts of bases that are on (matching the bestGenotype) and off (not matching the
+     * best genotype).  On are in the first field, off in the second.
+     *
+     * @param ref
+     * @param pileup
+     * @param variant
+     * @return
+     */
+    protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
+        final String genotype = variant.getBestGenotype().toUpperCase();
+        final String bases = pileup.getBases();
+
+        if ( genotype.length() > 2 )
+            throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
+
+
+        int on = 0, off = 0;
+
+        for ( char base : BaseUtils.BASES ) {
+            int count = BasicPileup.countBase(base, bases);
+            if ( Utils.countOccurrences(base, genotype) > 0 )
+                on += count;
+            else
+                off += count;
+            //System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype);            
+        }
+
+        return new Pair<Integer, Integer>(on, off);
+    }
+}
--- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@ -6,6 +6,7 @@ import java.util.Random;
 * BaseUtils contains some basic utilities for manipulating nucleotides.
 */
 public class BaseUtils {
+    public final static char[] BASES = { 'A', 'C', 'G', 'T' };

    /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
    // a pyrimidine to another pyrimidine nucleotide (C <-> T).
--- a/java/src/org/broadinstitute/sting/utils/BasicPileup.java
+++ b/java/src/org/broadinstitute/sting/utils/BasicPileup.java
@ -347,12 +347,35 @@ abstract public class BasicPileup implements Pileup {
        return null;
    }

-    public static byte consensusBase(String bases) {
+    public static class BaseCounts {
+        int a, c, t, g;
+
+        public BaseCounts(int a, int c, int t, int g) {
+            this.a = a;
+            this.c = c;
+            this.t = t;
+            this.g = g;
+        }
+    }
+
+    public static int countBase(final char base, final String bases) {
+        return Utils.countOccurrences(base, bases);
+    }
+
+    public static BaseCounts countBases(final String bases) {
        String canon = bases.toUpperCase();
-        int ACount = Utils.countOccurrences('A', bases);
-        int CCount = Utils.countOccurrences('C', bases);
-        int TCount = Utils.countOccurrences('T', bases);
-        int GCount = Utils.countOccurrences('G', bases);
+        return new BaseCounts(Utils.countOccurrences('A', canon),
+                Utils.countOccurrences('C', canon),
+                Utils.countOccurrences('T', canon),
+                Utils.countOccurrences('G', canon));
+    }
+
+    public static byte consensusBase(String bases) {
+        BaseCounts counts = countBases( bases );
+        int ACount = counts.a;
+        int CCount = counts.c;
+        int TCount = counts.t;
+        int GCount = counts.g;

        int m = Math.max(ACount, Math.max(CCount, Math.max(TCount, GCount)));
        if ( ACount == m ) return 'A';
@ -362,3 +385,5 @@ abstract public class BasicPileup implements Pileup {
        return 0;
    }
 }
+
+