From cd68cc239b9dd200fb9a741e55dd60221c9d5c4a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 29 Dec 2011 00:41:59 -0500 Subject: [PATCH] Added knuth-shuffle (KS) and randomSubset using KS to MathUtils * Knuth-shuffle is a simple, yet effective array permutator (hope this is good english). * added a simple randomSubset that returns a random subset without repeats of any given array with the same probability for every permutation. * added unit tests to both functions --- .../broadinstitute/sting/utils/MathUtils.java | 818 ++++++++++-------- .../sting/utils/MathUtilsUnitTest.java | 92 +- 2 files changed, 545 insertions(+), 365 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 759e1649d..4a3100a94 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -49,8 +50,11 @@ public class MathUtils { */ - /** Private constructor. No instantiating this class! */ - private MathUtils() {} + /** + * Private constructor. No instantiating this class! + */ + private MathUtils() { + } @Requires({"d > 0.0"}) public static int fastPositiveRound(double d) { @@ -58,21 +62,21 @@ public class MathUtils { } public static int fastRound(double d) { - if ( d > 0.0 ) { + if (d > 0.0) { return fastPositiveRound(d); } else { - return -1*fastPositiveRound(-1*d); + return -1 * fastPositiveRound(-1 * d); } } public static double sum(Collection numbers) { - return sum(numbers,false); + return sum(numbers, false); } - public static double sum( Collection numbers, boolean ignoreNan ) { + public static double sum(Collection numbers, boolean ignoreNan) { double sum = 0; - for ( Number n : numbers ) { - if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) { + for (Number n : numbers) { + if (!ignoreNan || !Double.isNaN(n.doubleValue())) { sum += n.doubleValue(); } } @@ -82,66 +86,72 @@ public class MathUtils { public static int nonNanSize(Collection numbers) { int size = 0; - for ( Number n : numbers) { + for (Number n : numbers) { size += Double.isNaN(n.doubleValue()) ? 0 : 1; } return size; } - public static double average( Collection numbers, boolean ignoreNan) { - if ( ignoreNan ) { - return sum(numbers,true)/nonNanSize(numbers); + public static double average(Collection numbers, boolean ignoreNan) { + if (ignoreNan) { + return sum(numbers, true) / nonNanSize(numbers); } else { - return sum(numbers,false)/nonNanSize(numbers); + return sum(numbers, false) / nonNanSize(numbers); } } - public static double variance( Collection numbers, Number mean, boolean ignoreNan ) { + public static double variance(Collection numbers, Number mean, boolean ignoreNan) { double mn = mean.doubleValue(); double var = 0; - for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; } - if ( ignoreNan ) { return var/(nonNanSize(numbers)-1); } - return var/(numbers.size()-1); + for (Number n : numbers) { + var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? (n.doubleValue() - mn) * (n.doubleValue() - mn) : 0; + } + if (ignoreNan) { + return var / (nonNanSize(numbers) - 1); + } + return var / (numbers.size() - 1); } public static double variance(Collection numbers, Number mean) { - return variance(numbers,mean,false); + return variance(numbers, mean, false); } public static double variance(Collection numbers, boolean ignoreNan) { - return variance(numbers,average(numbers,ignoreNan),ignoreNan); + return variance(numbers, average(numbers, ignoreNan), ignoreNan); } public static double variance(Collection numbers) { - return variance(numbers,average(numbers,false),false); + return variance(numbers, average(numbers, false), false); } public static double sum(double[] values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) s += v; return s; } /** * Calculates the log10 cumulative sum of an array with log10 probabilities + * * @param log10p the array with log10 probabilites - * @param upTo index in the array to calculate the cumsum up to + * @param upTo index in the array to calculate the cumsum up to * @return the log10 of the cumulative sum */ - public static double log10CumulativeSumLog10(double [] log10p, int upTo) { + public static double log10CumulativeSumLog10(double[] log10p, int upTo) { return log10sumLog10(log10p, 0, upTo); } /** * Converts a real space array of probabilities into a log10 array + * * @param prRealSpace * @return */ public static double[] toLog10(double[] prRealSpace) { double[] log10s = new double[prRealSpace.length]; - for ( int i = 0; i < prRealSpace.length; i++ ) + for (int i = 0; i < prRealSpace.length; i++) log10s[i] = Math.log10(prRealSpace[i]); return log10s; } @@ -154,7 +164,7 @@ public class MathUtils { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); - for ( int i = start; i < finish; i++ ) { + for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -163,13 +173,13 @@ public class MathUtils { public static double sumDoubles(List values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) s += v; return s; } public static int sumIntegers(List values) { int s = 0; - for ( int v : values) s += v; + for (int v : values) s += v; return s; } @@ -185,11 +195,11 @@ public class MathUtils { } public static boolean wellFormedDouble(double val) { - return ! Double.isInfinite(val) && ! Double.isNaN(val); + return !Double.isInfinite(val) && !Double.isNaN(val); } public static double bound(double value, double minBoundary, double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); + return Math.max(Math.min(value, maxBoundary), minBoundary); } public static boolean isBounded(double val, double lower, double upper) { @@ -197,7 +207,7 @@ public class MathUtils { } public static boolean isPositive(double val) { - return ! isNegativeOrZero(val); + return !isNegativeOrZero(val); } public static boolean isPositiveOrZero(double val) { @@ -209,17 +219,19 @@ public class MathUtils { } public static boolean isNegative(double val) { - return ! isPositiveOrZero(val); + return !isPositiveOrZero(val); } /** * Compares double values for equality (within 1e-6), or inequality. * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b) { return compareDoubles(a, b, 1e-6); } + public static byte compareDoubles(double a, double b) { + return compareDoubles(a, b, 1e-6); + } /** * Compares double values for equality (within epsilon), or inequality. @@ -227,23 +239,28 @@ public class MathUtils { * @param a the first double value * @param b the second double value * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b, double epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareDoubles(double a, double b, double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } /** * Compares float values for equality (within 1e-6), or inequality. * - * @param a the first float value - * @param b the second float value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first float value + * @param b the second float value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b) { return compareFloats(a, b, 1e-6f); } + public static byte compareFloats(float a, float b) { + return compareFloats(a, b, 1e-6f); + } /** * Compares float values for equality (within epsilon), or inequality. @@ -251,47 +268,50 @@ public class MathUtils { * @param a the first float value * @param b the second float value * @param epsilon the precision within which two float values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b, float epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareFloats(float a, float b, float epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } - public static double NormalDistribution(double mean, double sd, double x) - { - double a = 1.0 / (sd*Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean,2.0)/(2.0 * sd * sd))); + public static double NormalDistribution(double mean, double sd, double x) { + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); return a * b; } - public static double binomialCoefficient (int n, int k) { + public static double binomialCoefficient(int n, int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } + /** * Computes a binomial probability. This is computed using the formula - * - * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - * + *

+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) + *

* where n is the number of trials, k is the number of successes, and p is the probability of success * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. */ - public static double binomialProbability (int n, int k, double p) { + public static double binomialProbability(int n, int k, double p) { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) - * @param total - number of attempts for the number of hits + * + * @param start - start of the cumulant sum (over hits) + * @param end - end of the cumulant sum (over hits) + * @param total - number of attempts for the number of hits * @param probHit - probability of a successful hit * @return - returns the cumulative probability */ @@ -300,11 +320,11 @@ public class MathUtils { double prevProb; BigDecimal probCache = BigDecimal.ZERO; - for(int hits = start; hits < end; hits++) { + for (int hits = start; hits < end; hits++) { prevProb = cumProb; double probability = binomialProbability(total, hits, probHit); cumProb += probability; - if ( probability > 0 && cumProb - prevProb < probability/2 ) { // loss of precision + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); cumProb = 0.0; hits--; // repeat loop @@ -314,20 +334,20 @@ public class MathUtils { return probCache.add(new BigDecimal(cumProb)).doubleValue(); } - + /** * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - * + *

+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + *

* where xi represents the number of times outcome i was observed, n is the number of total observations. * In this implementation, the value of n is inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. */ - public static double multinomialCoefficient (int [] k) { + public static double multinomialCoefficient(int[] k) { int n = 0; for (int xi : k) { n += xi; @@ -339,37 +359,38 @@ public class MathUtils { /** * Computes a multinomial probability efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - * + *

+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) + *

* where xi represents the number of times outcome i was observed, n is the number of total observations, and * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is * inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. */ - public static double multinomialProbability (int[] k, double[] p) { + public static double multinomialProbability(int[] k, double[] p) { if (p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); int n = 0; - double [] log10P = new double[p.length]; - for (int i=0; i array[maxI] ) + for (int i = 0; i < array.length; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -532,11 +554,11 @@ public class MathUtils { } public static int maxElementIndex(int[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int maxI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( maxI == -1 || array[i] > array[maxI] ) + for (int i = 0; i < array.length; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -556,11 +578,11 @@ public class MathUtils { } public static int minElementIndex(double[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } @@ -568,32 +590,32 @@ public class MathUtils { } public static int minElementIndex(byte[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } return minI; - } + } public static int arrayMaxInt(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!"); int m = array.get(0); - for ( int e : array ) m = Math.max(m, e); + for (int e : array) m = Math.max(m, e); return m; } public static double arrayMaxDouble(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!"); double m = array.get(0); - for ( double e : array ) m = Math.max(m, e); + for (double e : array) m = Math.max(m, e); return m; } @@ -636,7 +658,7 @@ public class MathUtils { for (byte v : vals) { sum += v; } - return (byte) Math.floor(sum/vals.length); + return (byte) Math.floor(sum / vals.length); } public static double averageDouble(List vals) { @@ -749,7 +771,9 @@ public class MathUtils { } - /** Draw N random elements from list. */ + /** + * Draw N random elements from list. + */ public static List randomSubset(List list, int N) { if (list.size() <= N) { return list; @@ -770,6 +794,25 @@ public class MathUtils { return ans; } + /** + * Draw N random elements from an array. + * + * @param array your objects + * @param n number of elements to select at random from the list + * @return a new list with the N randomly chosen elements from list + */ + @Requires({"array != null", "n>=0"}) + @Ensures({"result != null", "result.length == Math.min(n, array.length)"}) + public static Object[] randomSubset(final Object[] array, final int n) { + if (array.length <= n) + return array.clone(); + + Object[] shuffledArray = arrayShuffle(array); + Object[] result = new Object[n]; + System.arraycopy(shuffledArray, 0, result, 0, n); + return result; + } + public static double percentage(double x, double base) { return (base > 0 ? (x / base) * 100.0 : 0); } @@ -799,7 +842,7 @@ public class MathUtils { return count; } - public static int countOccurrences(byte element, byte [] array) { + public static int countOccurrences(byte element, byte[] array) { int count = 0; for (byte y : array) { if (element == y) @@ -814,13 +857,13 @@ public class MathUtils { * Better than sorting if N (number of elements to return) is small * * @param array the array - * @param n number of top elements to return + * @param n number of top elements to return * @return the n larger elements of the array */ - public static Collection getNMaxElements(double [] array, int n) { + public static Collection getNMaxElements(double[] array, int n) { ArrayList maxN = new ArrayList(n); double lastMax = Double.MAX_VALUE; - for (int i=0; i sampleIndicesWithReplacement(int n, int k) { - ArrayList chosen_balls = new ArrayList (k); - for (int i=0; i< k; i++) { + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { //Integer chosen_ball = balls[rand.nextInt(k)]; chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); //balls.remove(chosen_ball); @@ -872,11 +915,11 @@ public class MathUtils { /** * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices */ static public ArrayList sliceListByIndices(List indices, List list) { ArrayList subset = new ArrayList(); @@ -898,18 +941,18 @@ public class MathUtils { ArrayList equalToX = new ArrayList(); ArrayList greaterThanX = new ArrayList(); - for(Comparable y : list) { - if(x.compareTo(y) > 0) { + for (Comparable y : list) { + if (x.compareTo(y) > 0) { lessThanX.add(y); - } else if(x.compareTo(y) < 0) { + } else if (x.compareTo(y) < 0) { greaterThanX.add(y); } else equalToX.add(y); } - if(lessThanX.size() > orderStat) + if (lessThanX.size() > orderStat) return orderStatisticSearch(orderStat, lessThanX); - else if(lessThanX.size() + equalToX.size() >= orderStat) + else if (lessThanX.size() + equalToX.size() >= orderStat) return orderStat; else return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); @@ -918,7 +961,7 @@ public class MathUtils { public static Object getMedian(List list) { - return orderStatisticSearch((int) Math.ceil(list.size()/2), list); + return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); } public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { @@ -926,7 +969,7 @@ public class MathUtils { // list index maps to a q-score only through the offset index // returns the kth-largest q-score. - if( reads.size() == 0) { + if (reads.size() == 0) { return 0; } @@ -938,15 +981,15 @@ public class MathUtils { final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; - for(int iter = 0; iter < reads.size(); iter ++) { + for (int iter = 0; iter < reads.size(); iter++) { SAMRecord read = reads.get(iter); int offset = offsets.get(iter); byte quality = read.getBaseQualities()[offset]; - if(quality < qk) { + if (quality < qk) { lessThanQReads.add(read); lessThanQOffsets.add(offset); - } else if(quality > qk) { + } else if (quality > qk) { greaterThanQReads.add(read); greaterThanQOffsets.add(offset); } else { @@ -954,9 +997,9 @@ public class MathUtils { } } - if(lessThanQReads.size() > k) + if (lessThanQReads.size() > k) return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); - else if(equalToQReads.size() + lessThanQReads.size() >= k) + else if (equalToQReads.size() + lessThanQReads.size() >= k) return qk; else return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); @@ -964,10 +1007,11 @@ public class MathUtils { } public static byte getQScoreMedian(List reads, List offsets) { - return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); + return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } - /** A utility class that computes on the fly average and standard deviation for a stream of numbers. + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that * it could overflow any naive summation-based scheme or cause loss of precision). * Instead, adding a new number observed @@ -983,20 +1027,31 @@ public class MathUtils { public void add(double obs) { obs_count++; double oldMean = mean; - mean += ( obs - mean ) / obs_count; // update mean - s += ( obs - oldMean ) * ( obs - mean ); + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); } public void addAll(Collection col) { - for ( Number o : col ) { + for (Number o : col) { add(o.doubleValue()); } } - public double mean() { return mean; } - public double stddev() { return Math.sqrt(s/(obs_count - 1)); } - public double var() { return s/(obs_count - 1); } - public long observationCount() { return obs_count; } + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } public RunningAverage clone() { RunningAverage ra = new RunningAverage(); @@ -1007,71 +1062,86 @@ public class MathUtils { } public void merge(RunningAverage other) { - if ( this.obs_count > 0 || other.obs_count > 0 ) { // if we have any observations at all - this.mean = ( this.mean * this.obs_count + other.mean * other.obs_count ) / ( this.obs_count + other.obs_count ); + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); this.s += other.s; } this.obs_count += other.obs_count; } } - + // // useful common utility routines // - public static double rate(long n, long d) { return n / (1.0 * Math.max(d, 1)); } - public static double rate(int n, int d) { return n / (1.0 * Math.max(d, 1)); } + public static double rate(long n, long d) { + return n / (1.0 * Math.max(d, 1)); + } - public static long inverseRate(long n, long d) { return n == 0 ? 0 : d / Math.max(n, 1); } - public static long inverseRate(int n, int d) { return n == 0 ? 0 : d / Math.max(n, 1); } + public static double rate(int n, int d) { + return n / (1.0 * Math.max(d, 1)); + } - public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); } - public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); } + public static long inverseRate(long n, long d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + public static long inverseRate(int n, int d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + public static double ratio(int num, int denom) { + return ((double) num) / (Math.max(denom, 1)); + } + + public static double ratio(long num, long denom) { + return ((double) num) / (Math.max(denom, 1)); + } public static final double[] log10Cache; public static final double[] jacobianLogTable; public static final int JACOBIAN_LOG_TABLE_SIZE = 101; public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; - public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0/JACOBIAN_LOG_TABLE_STEP; + public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; public static final double MAX_JACOBIAN_TOLERANCE = 10.0; private static final int MAXN = 11000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients static { log10Cache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < LOG10_CACHE_SIZE; k++) + for (int k = 1; k < LOG10_CACHE_SIZE; k++) log10Cache[k] = Math.log10(k); - for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) - * JACOBIAN_LOG_TABLE_STEP)); + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) + * JACOBIAN_LOG_TABLE_STEP)); - } + } } static public double softMax(final double[] vec) { double acc = vec[0]; - for (int k=1; k < vec.length; k++) - acc = softMax(acc,vec[k]); + for (int k = 1; k < vec.length; k++) + acc = softMax(acc, vec[k]); return acc; } static public double max(double x0, double x1, double x2) { - double a = Math.max(x0,x1); - return Math.max(a,x2); + double a = Math.max(x0, x1); + return Math.max(a, x2); } - - static public double softMax(final double x0, final double x1, final double x2) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - // better approximation: do Jacobian logarithm function on data pairs - double a = softMax(x0,x1); - return softMax(a,x2); + static public double softMax(final double x0, final double x1, final double x2) { + // compute naively log10(10^x[0] + 10^x[1]+...) + // return Math.log10(MathUtils.sumLog10(vec)); + + // better approximation: do Jacobian logarithm function on data pairs + double a = softMax(x0, x1); + return softMax(a, x2); } static public double softMax(final double x, final double y) { @@ -1084,49 +1154,50 @@ public class MathUtils { // slow exact version: // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - double diff = x-y; + double diff = x - y; if (diff > MAX_JACOBIAN_TOLERANCE) return x; else if (diff < -MAX_JACOBIAN_TOLERANCE) return y; else if (diff >= 0) { - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); return x + jacobianLogTable[ind]; - } - else { - int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + } else { + int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); return y + jacobianLogTable[ind]; } } - public static double phredScaleToProbability (byte q) { - return Math.pow(10,(-q)/10.0); + public static double phredScaleToProbability(byte q) { + return Math.pow(10, (-q) / 10.0); } - public static double phredScaleToLog10Probability (byte q) { - return ((-q)/10.0); + public static double phredScaleToLog10Probability(byte q) { + return ((-q) / 10.0); } /** * Returns the phred scaled value of probability p + * * @param p probability (between 0 and 1). * @return phred scaled probability of p */ - public static byte probabilityToPhredScale (double p) { + public static byte probabilityToPhredScale(double p) { return (byte) ((-10) * Math.log10(p)); } - public static double log10ProbabilityToPhredScale (double log10p) { + public static double log10ProbabilityToPhredScale(double log10p) { return (-10) * log10p; } /** * Converts LN to LOG10 + * * @param ln log(x) * @return log10(x) */ - public static double lnToLog10 (double ln) { + public static double lnToLog10(double ln) { return ln * Math.log10(Math.exp(1)); } @@ -1134,169 +1205,190 @@ public class MathUtils { * Constants to simplify the log gamma function calculation. */ private static final double - zero = 0.0, - one = 1.0, - half = .5, - a0 = 7.72156649015328655494e-02, - a1 = 3.22467033424113591611e-01, - a2 = 6.73523010531292681824e-02, - a3 = 2.05808084325167332806e-02, - a4 = 7.38555086081402883957e-03, - a5 = 2.89051383673415629091e-03, - a6 = 1.19270763183362067845e-03, - a7 = 5.10069792153511336608e-04, - a8 = 2.20862790713908385557e-04, - a9 = 1.08011567247583939954e-04, - a10 = 2.52144565451257326939e-05, - a11 = 4.48640949618915160150e-05, - tc = 1.46163214496836224576e+00, - tf = -1.21486290535849611461e-01, - tt = -3.63867699703950536541e-18, - t0 = 4.83836122723810047042e-01, - t1 = -1.47587722994593911752e-01, - t2 = 6.46249402391333854778e-02, - t3 = -3.27885410759859649565e-02, - t4 = 1.79706750811820387126e-02, - t5 = -1.03142241298341437450e-02, - t6 = 6.10053870246291332635e-03, - t7 = -3.68452016781138256760e-03, - t8 = 2.25964780900612472250e-03, - t9 = -1.40346469989232843813e-03, - t10 = 8.81081882437654011382e-04, - t11 = -5.38595305356740546715e-04, - t12 = 3.15632070903625950361e-04, - t13 = -3.12754168375120860518e-04, - t14 = 3.35529192635519073543e-04, - u0 = -7.72156649015328655494e-02, - u1 = 6.32827064025093366517e-01, - u2 = 1.45492250137234768737e+00, - u3 = 9.77717527963372745603e-01, - u4 = 2.28963728064692451092e-01, - u5 = 1.33810918536787660377e-02, - v1 = 2.45597793713041134822e+00, - v2 = 2.12848976379893395361e+00, - v3 = 7.69285150456672783825e-01, - v4 = 1.04222645593369134254e-01, - v5 = 3.21709242282423911810e-03, - s0 = -7.72156649015328655494e-02, - s1 = 2.14982415960608852501e-01, - s2 = 3.25778796408930981787e-01, - s3 = 1.46350472652464452805e-01, - s4 = 2.66422703033638609560e-02, - s5 = 1.84028451407337715652e-03, - s6 = 3.19475326584100867617e-05, - r1 = 1.39200533467621045958e+00, - r2 = 7.21935547567138069525e-01, - r3 = 1.71933865632803078993e-01, - r4 = 1.86459191715652901344e-02, - r5 = 7.77942496381893596434e-04, - r6 = 7.32668430744625636189e-06, - w0 = 4.18938533204672725052e-01, - w1 = 8.33333333333329678849e-02, - w2 = -2.77777777728775536470e-03, - w3 = 7.93650558643019558500e-04, - w4 = -5.95187557450339963135e-04, - w5 = 8.36339918996282139126e-04, - w6 = -1.63092934096575273989e-03; + zero = 0.0, + one = 1.0, + half = .5, + a0 = 7.72156649015328655494e-02, + a1 = 3.22467033424113591611e-01, + a2 = 6.73523010531292681824e-02, + a3 = 2.05808084325167332806e-02, + a4 = 7.38555086081402883957e-03, + a5 = 2.89051383673415629091e-03, + a6 = 1.19270763183362067845e-03, + a7 = 5.10069792153511336608e-04, + a8 = 2.20862790713908385557e-04, + a9 = 1.08011567247583939954e-04, + a10 = 2.52144565451257326939e-05, + a11 = 4.48640949618915160150e-05, + tc = 1.46163214496836224576e+00, + tf = -1.21486290535849611461e-01, + tt = -3.63867699703950536541e-18, + t0 = 4.83836122723810047042e-01, + t1 = -1.47587722994593911752e-01, + t2 = 6.46249402391333854778e-02, + t3 = -3.27885410759859649565e-02, + t4 = 1.79706750811820387126e-02, + t5 = -1.03142241298341437450e-02, + t6 = 6.10053870246291332635e-03, + t7 = -3.68452016781138256760e-03, + t8 = 2.25964780900612472250e-03, + t9 = -1.40346469989232843813e-03, + t10 = 8.81081882437654011382e-04, + t11 = -5.38595305356740546715e-04, + t12 = 3.15632070903625950361e-04, + t13 = -3.12754168375120860518e-04, + t14 = 3.35529192635519073543e-04, + u0 = -7.72156649015328655494e-02, + u1 = 6.32827064025093366517e-01, + u2 = 1.45492250137234768737e+00, + u3 = 9.77717527963372745603e-01, + u4 = 2.28963728064692451092e-01, + u5 = 1.33810918536787660377e-02, + v1 = 2.45597793713041134822e+00, + v2 = 2.12848976379893395361e+00, + v3 = 7.69285150456672783825e-01, + v4 = 1.04222645593369134254e-01, + v5 = 3.21709242282423911810e-03, + s0 = -7.72156649015328655494e-02, + s1 = 2.14982415960608852501e-01, + s2 = 3.25778796408930981787e-01, + s3 = 1.46350472652464452805e-01, + s4 = 2.66422703033638609560e-02, + s5 = 1.84028451407337715652e-03, + s6 = 3.19475326584100867617e-05, + r1 = 1.39200533467621045958e+00, + r2 = 7.21935547567138069525e-01, + r3 = 1.71933865632803078993e-01, + r4 = 1.86459191715652901344e-02, + r5 = 7.77942496381893596434e-04, + r6 = 7.32668430744625636189e-06, + w0 = 4.18938533204672725052e-01, + w1 = 8.33333333333329678849e-02, + w2 = -2.77777777728775536470e-03, + w3 = 7.93650558643019558500e-04, + w4 = -5.95187557450339963135e-04, + w5 = 8.36339918996282139126e-04, + w6 = -1.63092934096575273989e-03; /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift + * double to long with 32 bit shift */ - private static final int HI (double x) { - return (int)(Double.doubleToLongBits(x) >> 32); + private static final int HI(double x) { + return (int) (Double.doubleToLongBits(x) >> 32); } /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift + * double to long without shift */ - private static final int LO (double x) { - return (int)Double.doubleToLongBits(x); + private static final int LO(double x) { + return (int) Double.doubleToLongBits(x); } /** * Most efficent implementation of the lnGamma (FDLIBM) * Use via the log10Gamma wrapper method. */ - private static double lnGamma (double x) { - double t,y,z,p,p1,p2,p3,q,r,w; + private static double lnGamma(double x) { + double t, y, z, p, p1, p2, p3, q, r, w; int i; int hx = HI(x); int lx = LO(x); /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx&0x7fffffff; + int ix = hx & 0x7fffffff; if (ix >= 0x7ff00000) return Double.POSITIVE_INFINITY; - if ((ix|lx)==0 || hx < 0) return Double.NaN; - if (ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + if ((ix | lx) == 0 || hx < 0) return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ return -Math.log(x); } /* purge off 1 and 2 */ - if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0; - /* for x < 2.0 */ - else if(ix<0x40000000) { - if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ r = -Math.log(x); - if(ix>=0x3FE76944) {y = one-x; i= 0;} - else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;} - else {y = x; i=2;} + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } else { + y = x; + i = 2; + } } else { r = zero; - if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */ - else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */ - else {y=x-one;i=2;} + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ else { + y = x - one; + i = 2; + } } - switch(i) { - case 0: - z = y*y; - p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); - p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); - p = y*p1+p2; - r += (p-0.5*y); break; - case 1: - z = y*y; - w = z*y; - p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ - p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); - p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); - p = z*p1-(tt-w*(p2+y*p3)); - r += (tf + p); break; - case 2: - p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); - p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); - r += (-0.5*y + p1/p2); + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); } - } - else if(ix<0x40200000) { /* x < 8.0 */ - i = (int)x; + } else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; t = zero; - y = x-(double)i; - p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); - q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); - r = half*y+p/q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch(i) { - case 7: z *= (y+6.0); /* FALLTHRU */ - case 6: z *= (y+5.0); /* FALLTHRU */ - case 5: z *= (y+4.0); /* FALLTHRU */ - case 4: z *= (y+3.0); /* FALLTHRU */ - case 3: z *= (y+2.0); /* FALLTHRU */ - r += Math.log(z); break; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; } /* 8.0 <= x < 2**58 */ } else if (ix < 0x43900000) { t = Math.log(x); - z = one/x; - y = z*z; - w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); - r = (x-half)*(t-one)+w; + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; } else /* 2**58 <= x <= inf */ - r = x*(Math.log(x)-one); + r = x * (Math.log(x) - one); return r; } @@ -1308,7 +1400,7 @@ public class MathUtils { * @param x the x parameter * @return the log10 of the gamma function at x. */ - public static double log10Gamma (double x) { + public static double log10Gamma(double x) { return lnToLog10(lnGamma(x)); } @@ -1320,13 +1412,13 @@ public class MathUtils { * @param k number of successes * @return the log10 of the binomial coefficient */ - public static double log10BinomialCoefficient (int n, int k) { - return log10Gamma(n+1) - log10Gamma(k+1) - log10Gamma(n-k+1); + public static double log10BinomialCoefficient(int n, int k) { + return log10Gamma(n + 1) - log10Gamma(k + 1) - log10Gamma(n - k + 1); } - public static double log10BinomialProbability (int n, int k, double log10p) { - double log10OneMinusP = Math.log10(1-Math.pow(10,log10p)); - return log10BinomialCoefficient(n, k) + log10p*k + log10OneMinusP*(n-k); + public static double log10BinomialProbability(int n, int k, double log10p) { + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); } @@ -1338,38 +1430,74 @@ public class MathUtils { * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) * @return */ - public static double log10MultinomialCoefficient (int n, int [] k) { + public static double log10MultinomialCoefficient(int n, int[] k) { double denominator = 0.0; for (int x : k) { - denominator += log10Gamma(x+1); + denominator += log10Gamma(x + 1); } - return log10Gamma(n+1) - denominator; + return log10Gamma(n + 1) - denominator; } /** * Computes the log10 of the multinomial distribution probability given a vector * of log10 probabilities. Designed to prevent overflows even with very large numbers. * - * @param n number of trials - * @param k array of number of successes for each possibility + * @param n number of trials + * @param k array of number of successes for each possibility * @param log10p array of log10 probabilities * @return */ - public static double log10MultinomialProbability (int n, int [] k, double [] log10p) { + public static double log10MultinomialProbability(int n, int[] k, double[] log10p) { if (log10p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); double log10Prod = 0.0; - for (int i=0; i set = new HashSet(); + set.addAll(Arrays.asList(expected)); + set.removeAll(Arrays.asList(actual)); + return set.isEmpty(); + } + + + private void p (Object []x) { + for (Object v: x) + System.out.print((Integer) v + " "); + System.out.println(); + } }