2010-04-20 07:00:08 +08:00
/ *
* Copyright ( c ) 2010 The Broad Institute
2010-04-20 23:26:32 +08:00
*
2010-04-20 07:00:08 +08:00
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
2010-04-20 23:26:32 +08:00
* files ( the "Software" ) , to deal in the Software without
2010-04-20 07:00:08 +08:00
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
2010-04-20 23:26:32 +08:00
*
2010-04-20 07:00:08 +08:00
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
2010-04-20 23:26:32 +08:00
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
2010-04-20 07:00:08 +08:00
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE .
* /
2009-04-24 01:45:19 +08:00
package org.broadinstitute.sting.utils ;
2011-12-29 13:41:59 +08:00
import com.google.java.contract.Ensures ;
2011-06-23 06:54:19 +08:00
import com.google.java.contract.Requires ;
2010-04-20 07:00:08 +08:00
import net.sf.samtools.SAMRecord ;
2011-04-08 01:03:48 +08:00
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine ;
2011-06-23 06:55:15 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2009-12-15 05:57:20 +08:00
2011-07-18 08:29:58 +08:00
import java.math.BigDecimal ;
import java.util.* ;
2009-04-24 01:45:19 +08:00
/ * *
* MathUtils is a static class ( no instantiation allowed ! ) with some useful math methods .
*
* @author Kiran Garimella
* /
public class MathUtils {
Added binomProbabilityLog(int k, int n, double p) to MathUtils:
binomialProbabilityLog uses a log-space calculation of the
binomial pmf to avoid the coefficient blowing up and thus
returning Infinity or NaN (or in some very strange cases
-Infinity). The log calculation compares very well, it seems
with our current method. It's in MathUtils but could stand
testing against rigorous truth data before becoming standard.
Added median calculator functions to ListUtils
getQScoreMedian is a new utility I wrote that given reads and
offsets will find the median Q score. While I was at it, I wrote
a similar method, getMedian, which will return the median of any
list of Comparables, independent of initial order. These are in
ListUtils.
Added a new poolseq directory and three walkers
CoverageAndPowerWalker is built on top of the PrintCoverage walker
and prints out the power to detect a mutant allele in a pool of
2*(number of individuals in the pool) alleles. It can be flagged
either to do this by boostrapping, or by pure math with a
probability of error based on the median Q-score. This walker
compiles, runs, and gives quite reasonable outputs that compare
visually well to the power calculation computed by Syzygy.
ArtificialPoolWalker is designed to take multiple single-sample
.bam files and create a (random) artificial pool. The coverage of
that pool is a user-defined proportion of the total coverage over
all of the input files. The output is not only a new .bam file,
but also an auxiliary file that has for each locus, the genotype
of the individuals, the confidence of that call, and that person's
representation in the artificial pool .bam at that locus. This
walker compiles and, uhh, looks pretty. Needs some testing.
AnalyzePowerWalker extends CoverageAndPowerWalker so that it can read previous power
calcuations (e.g. from Syzygy) and print them to the output file as well for direct
downstream comparisons.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1460 348d0f76-0448-11de-a6fe-93d51630548a
2009-08-26 05:27:50 +08:00
/ * * Public constants - used for the Lanczos approximation to the factorial function
* ( for the calculation of the binomial / multinomial probability in logspace )
* @param LANC_SEQ [ ] - an array holding the constants which correspond to the product
* of Chebyshev Polynomial coefficients , and points on the Gamma function ( for interpolation )
* [ see A Precision Approximation of the Gamma Function J . SIAM Numer . Anal . Ser . B , Vol . 1 1964. pp . 86 - 96 ]
* @param LANC_G - a value for the Lanczos approximation to the gamma function that works to
* high precision
* /
2011-12-29 13:41:59 +08:00
/ * *
* Private constructor . No instantiating this class !
* /
private MathUtils ( ) {
}
2009-04-24 01:45:19 +08:00
2012-01-11 01:38:47 +08:00
// A fast implementation of the Math.round() method. This method does not perform
// under/overflow checking, so this shouldn't be used in the general case (but is fine
// if one is already make those checks before calling in to the rounding).
public static int fastRound ( double d ) {
2012-02-02 08:35:33 +08:00
return ( d > 0 ) ? ( int ) ( d + 0.5d ) : ( int ) ( d - 0.5d ) ;
2011-06-23 06:54:19 +08:00
}
2012-01-12 23:27:10 +08:00
public static double approximateLog10SumLog10 ( final double [ ] vals ) {
2012-02-02 08:35:33 +08:00
return approximateLog10SumLog10 ( vals , vals . length ) ;
2012-01-12 23:27:10 +08:00
}
public static double approximateLog10SumLog10 ( final double [ ] vals , final int endIndex ) {
2012-01-11 01:38:47 +08:00
2012-02-02 08:35:33 +08:00
final int maxElementIndex = MathUtils . maxElementIndex ( vals , endIndex ) ;
double approxSum = vals [ maxElementIndex ] ;
if ( approxSum = = Double . NEGATIVE_INFINITY )
2012-01-11 01:38:47 +08:00
return approxSum ;
2012-02-02 08:35:33 +08:00
for ( int i = 0 ; i < endIndex ; i + + ) {
if ( i = = maxElementIndex | | vals [ i ] = = Double . NEGATIVE_INFINITY )
continue ;
final double diff = approxSum - vals [ i ] ;
if ( diff < MathUtils . MAX_JACOBIAN_TOLERANCE ) {
// See notes from the 2-inout implementation below
final int ind = fastRound ( diff / MathUtils . JACOBIAN_LOG_TABLE_STEP ) ; // hard rounding
approxSum + = MathUtils . jacobianLogTable [ ind ] ;
}
}
2012-01-11 01:38:47 +08:00
return approxSum ;
}
public static double approximateLog10SumLog10 ( double small , double big ) {
// make sure small is really the smaller value
2012-02-02 08:35:33 +08:00
if ( small > big ) {
2012-01-11 01:38:47 +08:00
final double t = big ;
big = small ;
small = t ;
2011-06-23 06:54:19 +08:00
}
2012-01-11 01:38:47 +08:00
2012-02-02 08:35:33 +08:00
if ( small = = Double . NEGATIVE_INFINITY | | big = = Double . NEGATIVE_INFINITY )
2012-01-11 01:38:47 +08:00
return big ;
2012-02-02 08:35:33 +08:00
final double diff = big - small ;
if ( diff > = MathUtils . MAX_JACOBIAN_TOLERANCE )
2012-01-11 01:38:47 +08:00
return big ;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
final int ind = fastRound ( diff / MathUtils . JACOBIAN_LOG_TABLE_STEP ) ; // hard rounding
return big + MathUtils . jacobianLogTable [ ind ] ;
2011-06-23 06:54:19 +08:00
}
2011-04-13 07:00:50 +08:00
public static double sum ( Collection < Number > numbers ) {
2011-12-29 13:41:59 +08:00
return sum ( numbers , false ) ;
2011-04-13 07:00:50 +08:00
}
2011-12-29 13:41:59 +08:00
public static double sum ( Collection < Number > numbers , boolean ignoreNan ) {
2011-03-04 06:43:36 +08:00
double sum = 0 ;
2011-12-29 13:41:59 +08:00
for ( Number n : numbers ) {
if ( ! ignoreNan | | ! Double . isNaN ( n . doubleValue ( ) ) ) {
2011-04-13 07:00:50 +08:00
sum + = n . doubleValue ( ) ;
}
2011-03-04 06:43:36 +08:00
}
return sum ;
}
2011-04-13 07:00:50 +08:00
public static int nonNanSize ( Collection < Number > numbers ) {
int size = 0 ;
2011-12-29 13:41:59 +08:00
for ( Number n : numbers ) {
2011-04-13 07:00:50 +08:00
size + = Double . isNaN ( n . doubleValue ( ) ) ? 0 : 1 ;
}
return size ;
2011-03-04 06:43:36 +08:00
}
2012-02-02 08:35:33 +08:00
2012-02-02 08:35:09 +08:00
public static double average ( Collection < Integer > x ) {
return ( double ) sum ( x ) / x . size ( ) ;
}
2011-03-04 06:43:36 +08:00
2011-12-29 13:41:59 +08:00
public static double average ( Collection < Number > numbers , boolean ignoreNan ) {
if ( ignoreNan ) {
return sum ( numbers , true ) / nonNanSize ( numbers ) ;
2012-02-02 08:35:33 +08:00
}
else {
2011-12-29 13:41:59 +08:00
return sum ( numbers , false ) / nonNanSize ( numbers ) ;
2011-04-13 07:00:50 +08:00
}
}
2011-12-29 13:41:59 +08:00
public static double variance ( Collection < Number > numbers , Number mean , boolean ignoreNan ) {
2011-03-04 06:43:36 +08:00
double mn = mean . doubleValue ( ) ;
double var = 0 ;
2011-12-29 13:41:59 +08:00
for ( Number n : numbers ) {
var + = ( ! ignoreNan | | ! Double . isNaN ( n . doubleValue ( ) ) ) ? ( n . doubleValue ( ) - mn ) * ( n . doubleValue ( ) - mn ) : 0 ;
}
if ( ignoreNan ) {
return var / ( nonNanSize ( numbers ) - 1 ) ;
}
return var / ( numbers . size ( ) - 1 ) ;
2011-04-13 07:00:50 +08:00
}
public static double variance ( Collection < Number > numbers , Number mean ) {
2011-12-29 13:41:59 +08:00
return variance ( numbers , mean , false ) ;
2011-04-13 07:00:50 +08:00
}
public static double variance ( Collection < Number > numbers , boolean ignoreNan ) {
2011-12-29 13:41:59 +08:00
return variance ( numbers , average ( numbers , ignoreNan ) , ignoreNan ) ;
2011-03-04 06:43:36 +08:00
}
public static double variance ( Collection < Number > numbers ) {
2011-12-29 13:41:59 +08:00
return variance ( numbers , average ( numbers , false ) , false ) ;
2011-03-04 06:43:36 +08:00
}
2009-08-05 08:41:31 +08:00
public static double sum ( double [ ] values ) {
double s = 0.0 ;
2012-02-02 08:35:33 +08:00
for ( double v : values )
s + = v ;
2009-08-05 08:41:31 +08:00
return s ;
}
2012-02-02 08:35:09 +08:00
public static long sum ( int [ ] x ) {
long total = 0 ;
for ( int v : x )
total + = v ;
return total ;
}
2011-07-29 06:58:36 +08:00
/ * *
* Calculates the log10 cumulative sum of an array with log10 probabilities
2011-12-29 13:41:59 +08:00
*
2011-07-29 06:58:36 +08:00
* @param log10p the array with log10 probabilites
2011-12-29 13:41:59 +08:00
* @param upTo index in the array to calculate the cumsum up to
2011-07-29 06:58:36 +08:00
* @return the log10 of the cumulative sum
* /
2011-12-29 13:41:59 +08:00
public static double log10CumulativeSumLog10 ( double [ ] log10p , int upTo ) {
2011-07-29 06:58:36 +08:00
return log10sumLog10 ( log10p , 0 , upTo ) ;
}
2011-03-11 21:55:30 +08:00
/ * *
* Converts a real space array of probabilities into a log10 array
2011-12-29 13:41:59 +08:00
*
2011-03-11 21:55:30 +08:00
* @param prRealSpace
* @return
* /
public static double [ ] toLog10 ( double [ ] prRealSpace ) {
double [ ] log10s = new double [ prRealSpace . length ] ;
2011-12-29 13:41:59 +08:00
for ( int i = 0 ; i < prRealSpace . length ; i + + )
2011-03-11 21:55:30 +08:00
log10s [ i ] = Math . log10 ( prRealSpace [ i ] ) ;
return log10s ;
}
2010-11-16 06:19:22 +08:00
public static double log10sumLog10 ( double [ ] log10p , int start ) {
2011-07-29 06:58:36 +08:00
return log10sumLog10 ( log10p , start , log10p . length ) ;
}
public static double log10sumLog10 ( double [ ] log10p , int start , int finish ) {
2010-10-31 20:41:52 +08:00
double sum = 0.0 ;
double maxValue = Utils . findMaxEntry ( log10p ) ;
2011-12-29 13:41:59 +08:00
for ( int i = start ; i < finish ; i + + ) {
2010-10-31 20:41:52 +08:00
sum + = Math . pow ( 10.0 , log10p [ i ] - maxValue ) ;
}
return Math . log10 ( sum ) + maxValue ;
}
2011-08-09 01:34:04 +08:00
public static double sumDoubles ( List < Double > values ) {
2010-09-04 02:21:43 +08:00
double s = 0.0 ;
2012-02-02 08:35:33 +08:00
for ( double v : values )
s + = v ;
2010-09-04 02:21:43 +08:00
return s ;
}
2011-08-09 01:34:04 +08:00
public static int sumIntegers ( List < Integer > values ) {
2011-03-05 14:09:05 +08:00
int s = 0 ;
2012-02-02 08:35:33 +08:00
for ( int v : values )
s + = v ;
2011-03-05 14:09:05 +08:00
return s ;
}
2009-08-05 08:41:31 +08:00
public static double sumLog10 ( double [ ] log10values ) {
2010-11-16 06:19:22 +08:00
return Math . pow ( 10.0 , log10sumLog10 ( log10values ) ) ;
2012-02-02 08:35:33 +08:00
// double s = 0.0;
// for ( double v : log10values) s += Math.pow(10.0, v);
// return s;
2010-11-16 06:19:22 +08:00
}
public static double log10sumLog10 ( double [ ] log10values ) {
return log10sumLog10 ( log10values , 0 ) ;
2009-08-05 08:41:31 +08:00
}
public static boolean wellFormedDouble ( double val ) {
2011-12-29 13:41:59 +08:00
return ! Double . isInfinite ( val ) & & ! Double . isNaN ( val ) ;
2009-08-05 08:41:31 +08:00
}
2011-11-22 03:50:51 +08:00
public static double bound ( double value , double minBoundary , double maxBoundary ) {
2011-12-29 13:41:59 +08:00
return Math . max ( Math . min ( value , maxBoundary ) , minBoundary ) ;
2011-11-22 03:50:51 +08:00
}
2009-08-05 08:41:31 +08:00
public static boolean isBounded ( double val , double lower , double upper ) {
return val > = lower & & val < = upper ;
}
public static boolean isPositive ( double val ) {
2011-12-29 13:41:59 +08:00
return ! isNegativeOrZero ( val ) ;
2009-08-05 08:41:31 +08:00
}
public static boolean isPositiveOrZero ( double val ) {
return isBounded ( val , 0.0 , Double . POSITIVE_INFINITY ) ;
}
public static boolean isNegativeOrZero ( double val ) {
return isBounded ( val , Double . NEGATIVE_INFINITY , 0.0 ) ;
}
public static boolean isNegative ( double val ) {
2011-12-29 13:41:59 +08:00
return ! isPositiveOrZero ( val ) ;
2009-08-05 08:41:31 +08:00
}
2009-04-24 01:45:19 +08:00
/ * *
* Compares double values for equality ( within 1e-6 ) , or inequality .
*
2011-12-29 13:41:59 +08:00
* @param a the first double value
* @param b the second double value
* @return - 1 if a is greater than b , 0 if a is equal to be within 1e-6 , 1 if b is greater than a .
2009-04-24 01:45:19 +08:00
* /
2011-12-29 13:41:59 +08:00
public static byte compareDoubles ( double a , double b ) {
return compareDoubles ( a , b , 1e-6 ) ;
}
2009-04-24 01:45:19 +08:00
/ * *
* Compares double values for equality ( within epsilon ) , or inequality .
*
* @param a the first double value
* @param b the second double value
* @param epsilon the precision within which two double values will be considered equal
2011-12-29 13:41:59 +08:00
* @return - 1 if a is greater than b , 0 if a is equal to be within epsilon , 1 if b is greater than a .
2009-04-24 01:45:19 +08:00
* /
2011-12-29 13:41:59 +08:00
public static byte compareDoubles ( double a , double b , double epsilon ) {
if ( Math . abs ( a - b ) < epsilon ) {
return 0 ;
}
if ( a > b ) {
return - 1 ;
}
2009-04-24 01:45:19 +08:00
return 1 ;
}
/ * *
* Compares float values for equality ( within 1e-6 ) , or inequality .
*
2011-12-29 13:41:59 +08:00
* @param a the first float value
* @param b the second float value
* @return - 1 if a is greater than b , 0 if a is equal to be within 1e-6 , 1 if b is greater than a .
2009-04-24 01:45:19 +08:00
* /
2011-12-29 13:41:59 +08:00
public static byte compareFloats ( float a , float b ) {
return compareFloats ( a , b , 1e-6f ) ;
}
2009-04-24 01:45:19 +08:00
/ * *
* Compares float values for equality ( within epsilon ) , or inequality .
*
* @param a the first float value
* @param b the second float value
* @param epsilon the precision within which two float values will be considered equal
2011-12-29 13:41:59 +08:00
* @return - 1 if a is greater than b , 0 if a is equal to be within epsilon , 1 if b is greater than a .
2009-04-24 01:45:19 +08:00
* /
2011-12-29 13:41:59 +08:00
public static byte compareFloats ( float a , float b , float epsilon ) {
if ( Math . abs ( a - b ) < epsilon ) {
return 0 ;
}
if ( a > b ) {
return - 1 ;
}
2009-04-24 01:45:19 +08:00
return 1 ;
}
2009-04-24 11:32:04 +08:00
2011-12-29 13:41:59 +08:00
public static double NormalDistribution ( double mean , double sd , double x ) {
double a = 1.0 / ( sd * Math . sqrt ( 2.0 * Math . PI ) ) ;
double b = Math . exp ( - 1.0 * ( Math . pow ( x - mean , 2.0 ) / ( 2.0 * sd * sd ) ) ) ;
2009-04-24 21:42:15 +08:00
return a * b ;
}
2011-12-29 13:41:59 +08:00
public static double binomialCoefficient ( int n , int k ) {
2011-06-23 06:55:15 +08:00
return Math . pow ( 10 , log10BinomialCoefficient ( n , k ) ) ;
}
2011-12-29 13:41:59 +08:00
2009-04-24 11:32:04 +08:00
/ * *
2009-04-27 23:03:50 +08:00
* Computes a binomial probability . This is computed using the formula
2011-12-29 13:41:59 +08:00
* < p / >
* B ( k ; n ; p ) = [ n ! / ( k ! ( n - k ) ! ) ] ( p ^ k ) ( ( 1 - p ) ^ k )
* < p / >
2009-04-27 23:03:50 +08:00
* where n is the number of trials , k is the number of successes , and p is the probability of success
2009-04-24 11:32:04 +08:00
*
2011-12-29 13:41:59 +08:00
* @param n number of Bernoulli trials
* @param k number of successes
* @param p probability of success
* @return the binomial probability of the specified configuration . Computes values down to about 1e-237 .
2009-04-24 11:32:04 +08:00
* /
2011-12-29 13:41:59 +08:00
public static double binomialProbability ( int n , int k , double p ) {
2011-06-23 06:55:15 +08:00
return Math . pow ( 10 , log10BinomialProbability ( n , k , Math . log10 ( p ) ) ) ;
Added binomProbabilityLog(int k, int n, double p) to MathUtils:
binomialProbabilityLog uses a log-space calculation of the
binomial pmf to avoid the coefficient blowing up and thus
returning Infinity or NaN (or in some very strange cases
-Infinity). The log calculation compares very well, it seems
with our current method. It's in MathUtils but could stand
testing against rigorous truth data before becoming standard.
Added median calculator functions to ListUtils
getQScoreMedian is a new utility I wrote that given reads and
offsets will find the median Q score. While I was at it, I wrote
a similar method, getMedian, which will return the median of any
list of Comparables, independent of initial order. These are in
ListUtils.
Added a new poolseq directory and three walkers
CoverageAndPowerWalker is built on top of the PrintCoverage walker
and prints out the power to detect a mutant allele in a pool of
2*(number of individuals in the pool) alleles. It can be flagged
either to do this by boostrapping, or by pure math with a
probability of error based on the median Q-score. This walker
compiles, runs, and gives quite reasonable outputs that compare
visually well to the power calculation computed by Syzygy.
ArtificialPoolWalker is designed to take multiple single-sample
.bam files and create a (random) artificial pool. The coverage of
that pool is a user-defined proportion of the total coverage over
all of the input files. The output is not only a new .bam file,
but also an auxiliary file that has for each locus, the genotype
of the individuals, the confidence of that call, and that person's
representation in the artificial pool .bam at that locus. This
walker compiles and, uhh, looks pretty. Needs some testing.
AnalyzePowerWalker extends CoverageAndPowerWalker so that it can read previous power
calcuations (e.g. from Syzygy) and print them to the output file as well for direct
downstream comparisons.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1460 348d0f76-0448-11de-a6fe-93d51630548a
2009-08-26 05:27:50 +08:00
}
2009-08-28 03:31:53 +08:00
/ * *
* Performs the cumulative sum of binomial probabilities , where the probability calculation is done in log space .
2011-12-29 13:41:59 +08:00
*
* @param start - start of the cumulant sum ( over hits )
* @param end - end of the cumulant sum ( over hits )
* @param total - number of attempts for the number of hits
2009-08-28 03:31:53 +08:00
* @param probHit - probability of a successful hit
* @return - returns the cumulative probability
* /
2011-06-23 06:55:15 +08:00
public static double binomialCumulativeProbability ( int start , int end , int total , double probHit ) {
2009-08-28 03:31:53 +08:00
double cumProb = 0.0 ;
2009-12-15 05:57:20 +08:00
double prevProb ;
BigDecimal probCache = BigDecimal . ZERO ;
2011-12-29 13:41:59 +08:00
for ( int hits = start ; hits < end ; hits + + ) {
2009-12-15 05:57:20 +08:00
prevProb = cumProb ;
2011-06-23 06:55:15 +08:00
double probability = binomialProbability ( total , hits , probHit ) ;
2009-12-15 05:57:20 +08:00
cumProb + = probability ;
2011-12-29 13:41:59 +08:00
if ( probability > 0 & & cumProb - prevProb < probability / 2 ) { // loss of precision
2009-12-15 05:57:20 +08:00
probCache = probCache . add ( new BigDecimal ( prevProb ) ) ;
cumProb = 0.0 ;
hits - - ; // repeat loop
// prevProb changes at start of loop
}
2009-08-28 03:31:53 +08:00
}
2009-12-15 05:57:20 +08:00
return probCache . add ( new BigDecimal ( cumProb ) ) . doubleValue ( ) ;
2009-08-28 03:31:53 +08:00
}
2011-12-29 13:41:59 +08:00
2009-04-27 23:03:50 +08:00
/ * *
2011-06-23 06:55:15 +08:00
* Computes a multinomial coefficient efficiently avoiding overflow even for large numbers .
* This is computed using the formula :
2011-12-29 13:41:59 +08:00
* < p / >
* M ( x1 , x2 , . . . , xk ; n ) = [ n ! / ( x1 ! x2 ! . . . xk ! ) ]
* < p / >
2009-06-06 07:34:37 +08:00
* where xi represents the number of times outcome i was observed , n is the number of total observations .
* In this implementation , the value of n is inferred as the sum over i of xi .
2009-04-27 23:03:50 +08:00
*
2011-12-29 13:41:59 +08:00
* @param k an int [ ] of counts , where each element represents the number of times a certain outcome was observed
* @return the multinomial of the specified configuration .
2009-04-27 23:03:50 +08:00
* /
2011-12-29 13:41:59 +08:00
public static double multinomialCoefficient ( int [ ] k ) {
2011-06-23 06:55:15 +08:00
int n = 0 ;
for ( int xi : k ) {
n + = xi ;
2009-05-01 14:28:16 +08:00
}
2011-06-23 06:55:15 +08:00
return Math . pow ( 10 , log10MultinomialCoefficient ( n , k ) ) ;
2009-06-06 07:34:37 +08:00
}
/ * *
2011-06-23 06:55:15 +08:00
* Computes a multinomial probability efficiently avoiding overflow even for large numbers .
* This is computed using the formula :
2011-12-29 13:41:59 +08:00
* < p / >
* M ( x1 , x2 , . . . , xk ; n ; p1 , p2 , . . . , pk ) = [ n ! / ( x1 ! x2 ! . . . xk ! ) ] ( p1 ^ x1 ) ( p2 ^ x2 ) ( . . . ) ( pk ^ xk )
* < p / >
2009-06-06 07:34:37 +08:00
* where xi represents the number of times outcome i was observed , n is the number of total observations , and
* pi represents the probability of the i - th outcome to occur . In this implementation , the value of n is
* inferred as the sum over i of xi .
*
2011-12-29 13:41:59 +08:00
* @param k an int [ ] of counts , where each element represents the number of times a certain outcome was observed
* @param p a double [ ] of probabilities , where each element represents the probability a given outcome can occur
* @return the multinomial probability of the specified configuration .
2009-06-06 07:34:37 +08:00
* /
2011-12-29 13:41:59 +08:00
public static double multinomialProbability ( int [ ] k , double [ ] p ) {
2011-06-23 06:55:15 +08:00
if ( p . length ! = k . length )
throw new UserException . BadArgumentValue ( "p and k" , "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p . length + ", " + k . length ) ;
int n = 0 ;
2011-12-29 13:41:59 +08:00
double [ ] log10P = new double [ p . length ] ;
for ( int i = 0 ; i < p . length ; i + + ) {
2011-06-23 06:55:15 +08:00
log10P [ i ] = Math . log10 ( p [ i ] ) ;
n + = k [ i ] ;
2009-04-27 23:03:50 +08:00
}
2011-12-29 13:41:59 +08:00
return Math . pow ( 10 , log10MultinomialProbability ( n , k , log10P ) ) ;
2009-04-27 23:03:50 +08:00
}
2009-08-04 01:00:20 +08:00
2011-08-05 05:49:08 +08:00
/ * *
* calculate the Root Mean Square of an array of integers
2011-12-29 13:41:59 +08:00
*
* @param x an byte [ ] of numbers
* @return the RMS of the specified numbers .
* /
2011-08-05 05:49:08 +08:00
public static double rms ( byte [ ] x ) {
2011-12-29 13:41:59 +08:00
if ( x . length = = 0 )
2011-08-05 05:49:08 +08:00
return 0.0 ;
double rms = 0.0 ;
for ( int i : x )
rms + = i * i ;
rms / = x . length ;
return Math . sqrt ( rms ) ;
}
2009-08-04 01:00:20 +08:00
/ * *
* calculate the Root Mean Square of an array of integers
2011-12-29 13:41:59 +08:00
*
* @param x an int [ ] of numbers
* @return the RMS of the specified numbers .
* /
2009-08-04 01:00:20 +08:00
public static double rms ( int [ ] x ) {
2011-12-29 13:41:59 +08:00
if ( x . length = = 0 )
2009-08-04 01:00:20 +08:00
return 0.0 ;
double rms = 0.0 ;
for ( int i : x )
rms + = i * i ;
rms / = x . length ;
return Math . sqrt ( rms ) ;
}
Added binomProbabilityLog(int k, int n, double p) to MathUtils:
binomialProbabilityLog uses a log-space calculation of the
binomial pmf to avoid the coefficient blowing up and thus
returning Infinity or NaN (or in some very strange cases
-Infinity). The log calculation compares very well, it seems
with our current method. It's in MathUtils but could stand
testing against rigorous truth data before becoming standard.
Added median calculator functions to ListUtils
getQScoreMedian is a new utility I wrote that given reads and
offsets will find the median Q score. While I was at it, I wrote
a similar method, getMedian, which will return the median of any
list of Comparables, independent of initial order. These are in
ListUtils.
Added a new poolseq directory and three walkers
CoverageAndPowerWalker is built on top of the PrintCoverage walker
and prints out the power to detect a mutant allele in a pool of
2*(number of individuals in the pool) alleles. It can be flagged
either to do this by boostrapping, or by pure math with a
probability of error based on the median Q-score. This walker
compiles, runs, and gives quite reasonable outputs that compare
visually well to the power calculation computed by Syzygy.
ArtificialPoolWalker is designed to take multiple single-sample
.bam files and create a (random) artificial pool. The coverage of
that pool is a user-defined proportion of the total coverage over
all of the input files. The output is not only a new .bam file,
but also an auxiliary file that has for each locus, the genotype
of the individuals, the confidence of that call, and that person's
representation in the artificial pool .bam at that locus. This
walker compiles and, uhh, looks pretty. Needs some testing.
AnalyzePowerWalker extends CoverageAndPowerWalker so that it can read previous power
calcuations (e.g. from Syzygy) and print them to the output file as well for direct
downstream comparisons.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1460 348d0f76-0448-11de-a6fe-93d51630548a
2009-08-26 05:27:50 +08:00
2011-01-05 08:28:05 +08:00
/ * *
* calculate the Root Mean Square of an array of doubles
2011-12-29 13:41:59 +08:00
*
* @param x a double [ ] of numbers
* @return the RMS of the specified numbers .
* /
2011-01-05 08:28:05 +08:00
public static double rms ( Double [ ] x ) {
2011-12-29 13:41:59 +08:00
if ( x . length = = 0 )
2011-01-05 08:28:05 +08:00
return 0.0 ;
double rms = 0.0 ;
for ( Double i : x )
rms + = i * i ;
rms / = x . length ;
return Math . sqrt ( rms ) ;
}
2011-11-02 05:46:12 +08:00
public static double rms ( Collection < Integer > l ) {
2011-08-30 06:15:26 +08:00
if ( l . size ( ) = = 0 )
return 0.0 ;
double rms = 0.0 ;
2011-11-02 05:46:12 +08:00
for ( int i : l )
2011-12-29 13:41:59 +08:00
rms + = i * i ;
2011-08-30 06:15:26 +08:00
rms / = l . size ( ) ;
return Math . sqrt ( rms ) ;
}
2011-12-29 13:41:59 +08:00
public static double distanceSquared ( final double [ ] x , final double [ ] y ) {
2011-03-09 08:26:28 +08:00
double dist = 0.0 ;
2011-12-29 13:41:59 +08:00
for ( int iii = 0 ; iii < x . length ; iii + + ) {
2011-03-09 08:26:28 +08:00
dist + = ( x [ iii ] - y [ iii ] ) * ( x [ iii ] - y [ iii ] ) ;
}
return dist ;
}
2011-05-03 03:14:42 +08:00
public static double round ( double num , int digits ) {
2011-12-29 13:41:59 +08:00
double result = num * Math . pow ( 10.0 , ( double ) digits ) ;
2011-05-03 03:14:42 +08:00
result = Math . round ( result ) ;
2011-12-29 13:41:59 +08:00
result = result / Math . pow ( 10.0 , ( double ) digits ) ;
2011-05-03 03:14:42 +08:00
return result ;
}
2009-11-16 10:41:20 +08:00
/ * *
2011-01-23 11:36:09 +08:00
* normalizes the log10 - based array . ASSUMES THAT ALL ARRAY ENTRIES ARE < = 0 ( < = 1 IN REAL - SPACE ) .
2010-03-19 21:18:08 +08:00
*
2011-12-29 13:41:59 +08:00
* @param array the array to be normalized
2010-03-19 21:18:08 +08:00
* @param takeLog10OfOutput if true , the output will be transformed back into log10 units
* @return a newly allocated array corresponding the normalized values in array , maybe log10 transformed
2011-12-29 13:41:59 +08:00
* /
2010-03-19 21:18:08 +08:00
public static double [ ] normalizeFromLog10 ( double [ ] array , boolean takeLog10OfOutput ) {
2011-09-26 09:18:56 +08:00
return normalizeFromLog10 ( array , takeLog10OfOutput , false ) ;
}
public static double [ ] normalizeFromLog10 ( double [ ] array , boolean takeLog10OfOutput , boolean keepInLogSpace ) {
2009-11-16 10:41:20 +08:00
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = Utils . findMaxEntry ( array ) ;
2011-09-26 09:18:56 +08:00
// we may decide to just normalize in log space with converting to linear space
if ( keepInLogSpace ) {
for ( int i = 0 ; i < array . length ; i + + )
array [ i ] - = maxValue ;
return array ;
}
// default case: go to linear space
double [ ] normalized = new double [ array . length ] ;
2009-11-16 10:41:20 +08:00
for ( int i = 0 ; i < array . length ; i + + )
normalized [ i ] = Math . pow ( 10 , array [ i ] - maxValue ) ;
// normalize
double sum = 0.0 ;
for ( int i = 0 ; i < array . length ; i + + )
sum + = normalized [ i ] ;
2010-03-19 21:18:08 +08:00
for ( int i = 0 ; i < array . length ; i + + ) {
double x = normalized [ i ] / sum ;
2012-02-02 08:35:33 +08:00
if ( takeLog10OfOutput )
x = Math . log10 ( x ) ;
2010-03-19 21:18:08 +08:00
normalized [ i ] = x ;
}
2009-11-16 10:41:20 +08:00
return normalized ;
}
2010-03-19 21:18:08 +08:00
2011-03-09 08:26:28 +08:00
public static double [ ] normalizeFromLog10 ( List < Double > array , boolean takeLog10OfOutput ) {
double [ ] normalized = new double [ array . size ( ) ] ;
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
2011-12-29 13:41:59 +08:00
double maxValue = MathUtils . arrayMaxDouble ( array ) ;
2011-03-09 08:26:28 +08:00
for ( int i = 0 ; i < array . size ( ) ; i + + )
normalized [ i ] = Math . pow ( 10 , array . get ( i ) - maxValue ) ;
// normalize
double sum = 0.0 ;
for ( int i = 0 ; i < array . size ( ) ; i + + )
sum + = normalized [ i ] ;
for ( int i = 0 ; i < array . size ( ) ; i + + ) {
double x = normalized [ i ] / sum ;
2012-02-02 08:35:33 +08:00
if ( takeLog10OfOutput )
x = Math . log10 ( x ) ;
2011-03-09 08:26:28 +08:00
normalized [ i ] = x ;
}
return normalized ;
}
2011-12-29 13:41:59 +08:00
2011-01-23 11:36:09 +08:00
/ * *
* normalizes the log10 - based array . ASSUMES THAT ALL ARRAY ENTRIES ARE < = 0 ( < = 1 IN REAL - SPACE ) .
*
2011-12-29 13:41:59 +08:00
* @param array the array to be normalized
2011-01-23 11:36:09 +08:00
* @return a newly allocated array corresponding the normalized values in array
2011-12-29 13:41:59 +08:00
* /
2010-03-19 21:18:08 +08:00
public static double [ ] normalizeFromLog10 ( double [ ] array ) {
return normalizeFromLog10 ( array , false ) ;
}
2010-03-29 05:45:22 +08:00
2011-03-09 08:26:28 +08:00
public static double [ ] normalizeFromLog10 ( List < Double > array ) {
return normalizeFromLog10 ( array , false ) ;
}
2012-01-12 23:27:10 +08:00
public static int maxElementIndex ( final double [ ] array ) {
2012-02-02 08:35:33 +08:00
return maxElementIndex ( array , array . length ) ;
2012-01-12 23:27:10 +08:00
}
public static int maxElementIndex ( final double [ ] array , final int endIndex ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
2010-03-29 05:45:22 +08:00
int maxI = - 1 ;
2012-01-12 23:27:10 +08:00
for ( int i = 0 ; i < endIndex ; i + + ) {
2011-12-29 13:41:59 +08:00
if ( maxI = = - 1 | | array [ i ] > array [ maxI ] )
2010-03-29 05:45:22 +08:00
maxI = i ;
}
return maxI ;
}
2012-01-12 23:27:10 +08:00
public static int maxElementIndex ( final int [ ] array ) {
2012-02-02 08:35:33 +08:00
return maxElementIndex ( array , array . length ) ;
2012-01-12 23:27:10 +08:00
}
public static int maxElementIndex ( final int [ ] array , int endIndex ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
2011-07-29 06:58:36 +08:00
int maxI = - 1 ;
2012-01-12 23:27:10 +08:00
for ( int i = 0 ; i < endIndex ; i + + ) {
2011-12-29 13:41:59 +08:00
if ( maxI = = - 1 | | array [ i ] > array [ maxI ] )
2011-07-29 06:58:36 +08:00
maxI = i ;
}
return maxI ;
}
2010-03-29 05:45:22 +08:00
public static double arrayMax ( double [ ] array ) {
return array [ maxElementIndex ( array ) ] ;
}
public static double arrayMin ( double [ ] array ) {
return array [ minElementIndex ( array ) ] ;
}
2012-01-03 07:26:31 +08:00
public static int arrayMin ( int [ ] array ) {
return array [ minElementIndex ( array ) ] ;
}
2010-12-05 04:23:06 +08:00
public static byte arrayMin ( byte [ ] array ) {
return array [ minElementIndex ( array ) ] ;
}
2010-03-29 05:45:22 +08:00
public static int minElementIndex ( double [ ] array ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
2010-03-29 05:45:22 +08:00
int minI = - 1 ;
2011-12-29 13:41:59 +08:00
for ( int i = 0 ; i < array . length ; i + + ) {
if ( minI = = - 1 | | array [ i ] < array [ minI ] )
2010-03-29 05:45:22 +08:00
minI = i ;
}
return minI ;
}
2010-04-20 07:00:08 +08:00
2010-12-05 04:23:06 +08:00
public static int minElementIndex ( byte [ ] array ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
2010-12-05 04:23:06 +08:00
int minI = - 1 ;
2011-12-29 13:41:59 +08:00
for ( int i = 0 ; i < array . length ; i + + ) {
if ( minI = = - 1 | | array [ i ] < array [ minI ] )
2010-12-05 04:23:06 +08:00
minI = i ;
}
return minI ;
2011-12-29 13:41:59 +08:00
}
2010-12-05 04:23:06 +08:00
2012-01-03 07:26:31 +08:00
public static int minElementIndex ( int [ ] array ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
2012-01-03 07:26:31 +08:00
int minI = - 1 ;
for ( int i = 0 ; i < array . length ; i + + ) {
if ( minI = = - 1 | | array [ i ] < array [ minI ] )
minI = i ;
}
return minI ;
}
2011-08-09 01:34:04 +08:00
public static int arrayMaxInt ( List < Integer > array ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
if ( array . size ( ) = = 0 )
throw new IllegalArgumentException ( "Array size cannot be 0!" ) ;
2010-11-09 05:01:33 +08:00
int m = array . get ( 0 ) ;
2012-02-02 08:35:33 +08:00
for ( int e : array )
m = Math . max ( m , e ) ;
2010-11-09 05:01:33 +08:00
return m ;
}
2011-08-09 01:34:04 +08:00
public static double arrayMaxDouble ( List < Double > array ) {
2012-02-02 08:35:33 +08:00
if ( array = = null )
throw new IllegalArgumentException ( "Array cannot be null!" ) ;
if ( array . size ( ) = = 0 )
throw new IllegalArgumentException ( "Array size cannot be 0!" ) ;
2011-03-09 08:26:28 +08:00
double m = array . get ( 0 ) ;
2012-02-02 08:35:33 +08:00
for ( double e : array )
m = Math . max ( m , e ) ;
2011-03-09 08:26:28 +08:00
return m ;
}
2010-04-20 07:00:08 +08:00
public static double average ( List < Long > vals , int maxI ) {
long sum = 0 L ;
int i = 0 ;
for ( long x : vals ) {
if ( i > maxI )
break ;
sum + = x ;
i + + ;
//System.out.printf(" %d/%d", sum, i);
}
//System.out.printf("Sum = %d, n = %d, maxI = %d, avg = %f%n", sum, i, maxI, (1.0 * sum) / i);
return ( 1.0 * sum ) / i ;
}
public static double averageDouble ( List < Double > vals , int maxI ) {
double sum = 0.0 ;
int i = 0 ;
for ( double x : vals ) {
if ( i > maxI )
break ;
sum + = x ;
i + + ;
}
return ( 1.0 * sum ) / i ;
}
public static double average ( List < Long > vals ) {
return average ( vals , vals . size ( ) ) ;
}
2012-02-02 08:35:09 +08:00
public static double average ( int [ ] x ) {
int sum = 0 ;
for ( int v : x )
sum + = v ;
return ( double ) sum / x . length ;
}
2011-06-23 06:54:33 +08:00
public static byte average ( byte [ ] vals ) {
int sum = 0 ;
for ( byte v : vals ) {
sum + = v ;
}
2011-12-29 13:41:59 +08:00
return ( byte ) Math . floor ( sum / vals . length ) ;
2011-06-23 06:54:33 +08:00
}
2010-04-20 07:00:08 +08:00
public static double averageDouble ( List < Double > vals ) {
return averageDouble ( vals , vals . size ( ) ) ;
}
// Java Generics can't do primitive types, so I had to do this the simplistic way
public static Integer [ ] sortPermutation ( final int [ ] A ) {
class comparator implements Comparator < Integer > {
public int compare ( Integer a , Integer b ) {
if ( A [ a . intValue ( ) ] < A [ b . intValue ( ) ] ) {
return - 1 ;
}
if ( A [ a . intValue ( ) ] = = A [ b . intValue ( ) ] ) {
return 0 ;
}
if ( A [ a . intValue ( ) ] > A [ b . intValue ( ) ] ) {
return 1 ;
}
return 0 ;
}
}
Integer [ ] permutation = new Integer [ A . length ] ;
for ( int i = 0 ; i < A . length ; i + + ) {
permutation [ i ] = i ;
}
Arrays . sort ( permutation , new comparator ( ) ) ;
return permutation ;
}
public static Integer [ ] sortPermutation ( final double [ ] A ) {
class comparator implements Comparator < Integer > {
public int compare ( Integer a , Integer b ) {
if ( A [ a . intValue ( ) ] < A [ b . intValue ( ) ] ) {
return - 1 ;
}
if ( A [ a . intValue ( ) ] = = A [ b . intValue ( ) ] ) {
return 0 ;
}
if ( A [ a . intValue ( ) ] > A [ b . intValue ( ) ] ) {
return 1 ;
}
return 0 ;
}
}
Integer [ ] permutation = new Integer [ A . length ] ;
for ( int i = 0 ; i < A . length ; i + + ) {
permutation [ i ] = i ;
}
Arrays . sort ( permutation , new comparator ( ) ) ;
return permutation ;
}
public static < T extends Comparable > Integer [ ] sortPermutation ( List < T > A ) {
final Object [ ] data = A . toArray ( ) ;
class comparator implements Comparator < Integer > {
public int compare ( Integer a , Integer b ) {
return ( ( T ) data [ a ] ) . compareTo ( data [ b ] ) ;
}
}
Integer [ ] permutation = new Integer [ A . size ( ) ] ;
for ( int i = 0 ; i < A . size ( ) ; i + + ) {
permutation [ i ] = i ;
}
Arrays . sort ( permutation , new comparator ( ) ) ;
return permutation ;
}
public static int [ ] permuteArray ( int [ ] array , Integer [ ] permutation ) {
int [ ] output = new int [ array . length ] ;
for ( int i = 0 ; i < output . length ; i + + ) {
output [ i ] = array [ permutation [ i ] ] ;
}
return output ;
}
public static double [ ] permuteArray ( double [ ] array , Integer [ ] permutation ) {
double [ ] output = new double [ array . length ] ;
for ( int i = 0 ; i < output . length ; i + + ) {
output [ i ] = array [ permutation [ i ] ] ;
}
return output ;
}
public static Object [ ] permuteArray ( Object [ ] array , Integer [ ] permutation ) {
Object [ ] output = new Object [ array . length ] ;
for ( int i = 0 ; i < output . length ; i + + ) {
output [ i ] = array [ permutation [ i ] ] ;
}
return output ;
}
public static String [ ] permuteArray ( String [ ] array , Integer [ ] permutation ) {
String [ ] output = new String [ array . length ] ;
for ( int i = 0 ; i < output . length ; i + + ) {
output [ i ] = array [ permutation [ i ] ] ;
}
return output ;
}
public static < T > List < T > permuteList ( List < T > list , Integer [ ] permutation ) {
List < T > output = new ArrayList < T > ( ) ;
for ( int i = 0 ; i < permutation . length ; i + + ) {
output . add ( list . get ( permutation [ i ] ) ) ;
}
return output ;
}
2011-12-29 13:41:59 +08:00
/ * *
* Draw N random elements from list .
* /
2010-04-20 07:00:08 +08:00
public static < T > List < T > randomSubset ( List < T > list , int N ) {
if ( list . size ( ) < = N ) {
return list ;
}
int idx [ ] = new int [ list . size ( ) ] ;
for ( int i = 0 ; i < list . size ( ) ; i + + ) {
2011-04-08 01:03:48 +08:00
idx [ i ] = GenomeAnalysisEngine . getRandomGenerator ( ) . nextInt ( ) ;
2010-04-20 07:00:08 +08:00
}
Integer [ ] perm = sortPermutation ( idx ) ;
List < T > ans = new ArrayList < T > ( ) ;
for ( int i = 0 ; i < N ; i + + ) {
ans . add ( list . get ( perm [ i ] ) ) ;
}
return ans ;
}
2011-12-29 13:41:59 +08:00
/ * *
* Draw N random elements from an array .
*
* @param array your objects
* @param n number of elements to select at random from the list
* @return a new list with the N randomly chosen elements from list
* /
@Requires ( { "array != null" , "n>=0" } )
@Ensures ( { "result != null" , "result.length == Math.min(n, array.length)" } )
public static Object [ ] randomSubset ( final Object [ ] array , final int n ) {
if ( array . length < = n )
return array . clone ( ) ;
Object [ ] shuffledArray = arrayShuffle ( array ) ;
Object [ ] result = new Object [ n ] ;
System . arraycopy ( shuffledArray , 0 , result , 0 , n ) ;
return result ;
}
2010-04-20 07:00:08 +08:00
public static double percentage ( double x , double base ) {
return ( base > 0 ? ( x / base ) * 100.0 : 0 ) ;
}
public static double percentage ( int x , int base ) {
return ( base > 0 ? ( ( double ) x / ( double ) base ) * 100.0 : 0 ) ;
}
public static double percentage ( long x , long base ) {
return ( base > 0 ? ( ( double ) x / ( double ) base ) * 100.0 : 0 ) ;
}
public static int countOccurrences ( char c , String s ) {
int count = 0 ;
for ( int i = 0 ; i < s . length ( ) ; i + + ) {
count + = s . charAt ( i ) = = c ? 1 : 0 ;
}
return count ;
}
public static < T > int countOccurrences ( T x , List < T > l ) {
int count = 0 ;
for ( T y : l ) {
2012-02-02 08:35:33 +08:00
if ( x . equals ( y ) )
count + + ;
2010-04-20 07:00:08 +08:00
}
return count ;
}
2011-12-29 13:41:59 +08:00
public static int countOccurrences ( byte element , byte [ ] array ) {
2011-07-22 09:39:00 +08:00
int count = 0 ;
for ( byte y : array ) {
if ( element = = y )
count + + ;
}
return count ;
}
2011-07-29 06:58:36 +08:00
/ * *
* Returns the top ( larger ) N elements of the array . Naive n ^ 2 implementation ( Selection Sort ) .
* Better than sorting if N ( number of elements to return ) is small
*
* @param array the array
2011-12-29 13:41:59 +08:00
* @param n number of top elements to return
2011-07-29 06:58:36 +08:00
* @return the n larger elements of the array
* /
2011-12-29 13:41:59 +08:00
public static Collection < Double > getNMaxElements ( double [ ] array , int n ) {
2011-07-29 06:58:36 +08:00
ArrayList < Double > maxN = new ArrayList < Double > ( n ) ;
double lastMax = Double . MAX_VALUE ;
2011-12-29 13:41:59 +08:00
for ( int i = 0 ; i < n ; i + + ) {
2011-07-29 06:58:36 +08:00
double max = Double . MIN_VALUE ;
for ( double x : array ) {
max = Math . min ( lastMax , Math . max ( x , max ) ) ;
}
maxN . add ( max ) ;
lastMax = max ;
}
return maxN ;
}
2011-07-22 09:39:00 +08:00
2010-04-20 07:00:08 +08:00
/ * *
* Returns n random indices drawn with replacement from the range 0. . ( k - 1 )
*
* @param n the total number of indices sampled from
* @param k the number of random indices to draw ( with replacement )
* @return a list of k random indices ranging from 0 to ( n - 1 ) with possible duplicates
* /
static public ArrayList < Integer > sampleIndicesWithReplacement ( int n , int k ) {
2011-12-29 13:41:59 +08:00
ArrayList < Integer > chosen_balls = new ArrayList < Integer > ( k ) ;
for ( int i = 0 ; i < k ; i + + ) {
2010-04-20 07:00:08 +08:00
//Integer chosen_ball = balls[rand.nextInt(k)];
2011-04-08 01:03:48 +08:00
chosen_balls . add ( GenomeAnalysisEngine . getRandomGenerator ( ) . nextInt ( n ) ) ;
2010-04-20 07:00:08 +08:00
//balls.remove(chosen_ball);
}
return chosen_balls ;
}
/ * *
* Returns n random indices drawn without replacement from the range 0. . ( k - 1 )
*
* @param n the total number of indices sampled from
* @param k the number of random indices to draw ( without replacement )
* @return a list of k random indices ranging from 0 to ( n - 1 ) without duplicates
* /
static public ArrayList < Integer > sampleIndicesWithoutReplacement ( int n , int k ) {
ArrayList < Integer > chosen_balls = new ArrayList < Integer > ( k ) ;
for ( int i = 0 ; i < n ; i + + ) {
chosen_balls . add ( i ) ;
}
2011-04-08 01:03:48 +08:00
Collections . shuffle ( chosen_balls , GenomeAnalysisEngine . getRandomGenerator ( ) ) ;
2010-04-20 07:00:08 +08:00
//return (ArrayList<Integer>) chosen_balls.subList(0, k);
return new ArrayList < Integer > ( chosen_balls . subList ( 0 , k ) ) ;
}
/ * *
* Given a list of indices into a list , return those elements of the list with the possibility of drawing list elements multiple times
2011-12-29 13:41:59 +08:00
*
* @param indices the list of indices for elements to extract
* @param list the list from which the elements should be extracted
* @param < T > the template type of the ArrayList
* @return a new ArrayList consisting of the elements at the specified indices
2010-04-20 07:00:08 +08:00
* /
static public < T > ArrayList < T > sliceListByIndices ( List < Integer > indices , List < T > list ) {
ArrayList < T > subset = new ArrayList < T > ( ) ;
for ( int i : indices ) {
subset . add ( list . get ( i ) ) ;
}
return subset ;
}
public static Comparable orderStatisticSearch ( int orderStat , List < Comparable > list ) {
// this finds the order statistic of the list (kth largest element)
// the list is assumed *not* to be sorted
final Comparable x = list . get ( orderStat ) ;
ListIterator iterator = list . listIterator ( ) ;
ArrayList lessThanX = new ArrayList ( ) ;
ArrayList equalToX = new ArrayList ( ) ;
ArrayList greaterThanX = new ArrayList ( ) ;
2011-12-29 13:41:59 +08:00
for ( Comparable y : list ) {
if ( x . compareTo ( y ) > 0 ) {
2010-04-20 07:00:08 +08:00
lessThanX . add ( y ) ;
2012-02-02 08:35:33 +08:00
}
else if ( x . compareTo ( y ) < 0 ) {
2010-04-20 07:00:08 +08:00
greaterThanX . add ( y ) ;
2012-02-02 08:35:33 +08:00
}
else
2010-04-20 07:00:08 +08:00
equalToX . add ( y ) ;
}
2011-12-29 13:41:59 +08:00
if ( lessThanX . size ( ) > orderStat )
2010-04-20 07:00:08 +08:00
return orderStatisticSearch ( orderStat , lessThanX ) ;
2011-12-29 13:41:59 +08:00
else if ( lessThanX . size ( ) + equalToX . size ( ) > = orderStat )
2010-04-20 07:00:08 +08:00
return orderStat ;
else
return orderStatisticSearch ( orderStat - lessThanX . size ( ) - equalToX . size ( ) , greaterThanX ) ;
}
public static Object getMedian ( List < Comparable > list ) {
2011-12-29 13:41:59 +08:00
return orderStatisticSearch ( ( int ) Math . ceil ( list . size ( ) / 2 ) , list ) ;
2010-04-20 07:00:08 +08:00
}
public static byte getQScoreOrderStatistic ( List < SAMRecord > reads , List < Integer > offsets , int k ) {
// version of the order statistic calculator for SAMRecord/Integer lists, where the
// list index maps to a q-score only through the offset index
// returns the kth-largest q-score.
2011-12-29 13:41:59 +08:00
if ( reads . size ( ) = = 0 ) {
2010-04-20 07:00:08 +08:00
return 0 ;
}
ArrayList lessThanQReads = new ArrayList ( ) ;
ArrayList equalToQReads = new ArrayList ( ) ;
ArrayList greaterThanQReads = new ArrayList ( ) ;
ArrayList lessThanQOffsets = new ArrayList ( ) ;
ArrayList greaterThanQOffsets = new ArrayList ( ) ;
final byte qk = reads . get ( k ) . getBaseQualities ( ) [ offsets . get ( k ) ] ;
2011-12-29 13:41:59 +08:00
for ( int iter = 0 ; iter < reads . size ( ) ; iter + + ) {
2010-04-20 07:00:08 +08:00
SAMRecord read = reads . get ( iter ) ;
int offset = offsets . get ( iter ) ;
byte quality = read . getBaseQualities ( ) [ offset ] ;
2011-12-29 13:41:59 +08:00
if ( quality < qk ) {
2010-04-20 07:00:08 +08:00
lessThanQReads . add ( read ) ;
lessThanQOffsets . add ( offset ) ;
2012-02-02 08:35:33 +08:00
}
else if ( quality > qk ) {
2010-04-20 07:00:08 +08:00
greaterThanQReads . add ( read ) ;
greaterThanQOffsets . add ( offset ) ;
2012-02-02 08:35:33 +08:00
}
else {
2010-04-20 07:00:08 +08:00
equalToQReads . add ( reads . get ( iter ) ) ;
}
}
2011-12-29 13:41:59 +08:00
if ( lessThanQReads . size ( ) > k )
2010-04-20 07:00:08 +08:00
return getQScoreOrderStatistic ( lessThanQReads , lessThanQOffsets , k ) ;
2011-12-29 13:41:59 +08:00
else if ( equalToQReads . size ( ) + lessThanQReads . size ( ) > = k )
2010-04-20 07:00:08 +08:00
return qk ;
else
return getQScoreOrderStatistic ( greaterThanQReads , greaterThanQOffsets , k - lessThanQReads . size ( ) - equalToQReads . size ( ) ) ;
}
public static byte getQScoreMedian ( List < SAMRecord > reads , List < Integer > offsets ) {
2011-12-29 13:41:59 +08:00
return getQScoreOrderStatistic ( reads , offsets , ( int ) Math . floor ( reads . size ( ) / 2. ) ) ;
2010-04-20 07:00:08 +08:00
}
2010-05-08 05:34:18 +08:00
2012-02-02 08:35:09 +08:00
public static long sum ( Collection < Integer > x ) {
long sum = 0 ;
for ( int v : x )
2012-02-02 08:35:33 +08:00
sum + = v ;
2012-02-02 08:35:09 +08:00
return sum ;
}
2011-12-29 13:41:59 +08:00
/ * *
* A utility class that computes on the fly average and standard deviation for a stream of numbers .
2010-08-05 05:39:02 +08:00
* The number of observations does not have to be known in advance , and can be also very big ( so that
* it could overflow any naive summation - based scheme or cause loss of precision ) .
* Instead , adding a new number < code > observed < / code >
* to a sample with < code > add ( observed ) < / code > immediately updates the instance of this object so that
* it contains correct mean and standard deviation for all the numbers seen so far . Source : Knuth , vol .2
* ( see also e . g . http : //www.johndcook.com/standard_deviation.html for online reference).
* /
public static class RunningAverage {
private double mean = 0.0 ;
private double s = 0.0 ;
private long obs_count = 0 ;
public void add ( double obs ) {
obs_count + + ;
double oldMean = mean ;
2011-12-29 13:41:59 +08:00
mean + = ( obs - mean ) / obs_count ; // update mean
s + = ( obs - oldMean ) * ( obs - mean ) ;
2010-08-05 05:39:02 +08:00
}
2010-05-08 05:34:18 +08:00
2011-03-01 09:23:37 +08:00
public void addAll ( Collection < Number > col ) {
2011-12-29 13:41:59 +08:00
for ( Number o : col ) {
2011-03-01 09:23:37 +08:00
add ( o . doubleValue ( ) ) ;
}
}
2011-12-29 13:41:59 +08:00
public double mean ( ) {
return mean ;
}
public double stddev ( ) {
return Math . sqrt ( s / ( obs_count - 1 ) ) ;
}
public double var ( ) {
return s / ( obs_count - 1 ) ;
}
public long observationCount ( ) {
return obs_count ;
}
2011-03-02 08:35:23 +08:00
public RunningAverage clone ( ) {
RunningAverage ra = new RunningAverage ( ) ;
ra . mean = this . mean ;
ra . s = this . s ;
ra . obs_count = this . obs_count ;
return ra ;
}
public void merge ( RunningAverage other ) {
2011-12-29 13:41:59 +08:00
if ( this . obs_count > 0 | | other . obs_count > 0 ) { // if we have any observations at all
this . mean = ( this . mean * this . obs_count + other . mean * other . obs_count ) / ( this . obs_count + other . obs_count ) ;
2011-03-02 08:35:23 +08:00
this . s + = other . s ;
}
this . obs_count + = other . obs_count ;
}
2010-08-05 05:39:02 +08:00
}
2011-12-29 13:41:59 +08:00
2010-05-08 05:34:18 +08:00
//
// useful common utility routines
//
2011-12-29 13:41:59 +08:00
public static double rate ( long n , long d ) {
return n / ( 1.0 * Math . max ( d , 1 ) ) ;
}
public static double rate ( int n , int d ) {
return n / ( 1.0 * Math . max ( d , 1 ) ) ;
}
public static long inverseRate ( long n , long d ) {
return n = = 0 ? 0 : d / Math . max ( n , 1 ) ;
}
2010-05-08 05:34:18 +08:00
2011-12-29 13:41:59 +08:00
public static long inverseRate ( int n , int d ) {
return n = = 0 ? 0 : d / Math . max ( n , 1 ) ;
}
2010-05-08 05:34:18 +08:00
2011-12-29 13:41:59 +08:00
public static double ratio ( int num , int denom ) {
return ( ( double ) num ) / ( Math . max ( denom , 1 ) ) ;
}
public static double ratio ( long num , long denom ) {
return ( ( double ) num ) / ( Math . max ( denom , 1 ) ) ;
}
2011-03-07 23:31:58 +08:00
public static final double [ ] log10Cache ;
public static final double [ ] jacobianLogTable ;
public static final int JACOBIAN_LOG_TABLE_SIZE = 101 ;
public static final double JACOBIAN_LOG_TABLE_STEP = 0.1 ;
2011-12-29 13:41:59 +08:00
public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP ;
2011-03-07 23:31:58 +08:00
public static final double MAX_JACOBIAN_TOLERANCE = 10.0 ;
2011-12-14 12:36:10 +08:00
private static final int MAXN = 11000 ;
private static final int LOG10_CACHE_SIZE = 4 * MAXN ; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
2011-03-07 23:31:58 +08:00
static {
2011-12-14 12:36:10 +08:00
log10Cache = new double [ LOG10_CACHE_SIZE ] ;
2011-12-29 13:41:59 +08:00
jacobianLogTable = new double [ JACOBIAN_LOG_TABLE_SIZE ] ;
2011-03-07 23:31:58 +08:00
log10Cache [ 0 ] = Double . NEGATIVE_INFINITY ;
2011-12-29 13:41:59 +08:00
for ( int k = 1 ; k < LOG10_CACHE_SIZE ; k + + )
2011-03-07 23:31:58 +08:00
log10Cache [ k ] = Math . log10 ( k ) ;
2011-12-29 13:41:59 +08:00
for ( int k = 0 ; k < JACOBIAN_LOG_TABLE_SIZE ; k + + ) {
2012-02-02 08:35:33 +08:00
jacobianLogTable [ k ] = Math . log10 ( 1.0 + Math . pow ( 10.0 , - ( ( double ) k ) * JACOBIAN_LOG_TABLE_STEP ) ) ;
2011-03-07 23:31:58 +08:00
2011-12-29 13:41:59 +08:00
}
2011-03-07 23:31:58 +08:00
}
2011-04-21 02:57:34 +08:00
static public double softMax ( final double [ ] vec ) {
2011-03-07 23:31:58 +08:00
double acc = vec [ 0 ] ;
2011-12-29 13:41:59 +08:00
for ( int k = 1 ; k < vec . length ; k + + )
acc = softMax ( acc , vec [ k ] ) ;
2011-03-07 23:31:58 +08:00
return acc ;
2010-10-28 09:48:47 +08:00
}
2011-03-07 23:31:58 +08:00
static public double max ( double x0 , double x1 , double x2 ) {
2011-12-29 13:41:59 +08:00
double a = Math . max ( x0 , x1 ) ;
return Math . max ( a , x2 ) ;
2011-03-07 23:31:58 +08:00
}
2011-12-29 13:41:59 +08:00
2011-04-21 02:57:34 +08:00
static public double softMax ( final double x0 , final double x1 , final double x2 ) {
2011-12-29 13:41:59 +08:00
// compute naively log10(10^x[0] + 10^x[1]+...)
// return Math.log10(MathUtils.sumLog10(vec));
2011-03-07 23:31:58 +08:00
2011-12-29 13:41:59 +08:00
// better approximation: do Jacobian logarithm function on data pairs
double a = softMax ( x0 , x1 ) ;
return softMax ( a , x2 ) ;
2011-03-07 23:31:58 +08:00
}
2011-06-23 06:55:09 +08:00
static public double softMax ( final double x , final double y ) {
2011-09-10 06:00:23 +08:00
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
// slow exact version:
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
2011-12-29 13:41:59 +08:00
double diff = x - y ;
2011-09-10 06:00:23 +08:00
if ( diff > MAX_JACOBIAN_TOLERANCE )
return x ;
else if ( diff < - MAX_JACOBIAN_TOLERANCE )
return y ;
else if ( diff > = 0 ) {
2011-12-29 13:41:59 +08:00
int ind = ( int ) ( diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5 ) ;
2011-09-10 06:00:23 +08:00
return x + jacobianLogTable [ ind ] ;
2012-02-02 08:35:33 +08:00
}
else {
2011-12-29 13:41:59 +08:00
int ind = ( int ) ( - diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5 ) ;
2011-09-10 06:00:23 +08:00
return y + jacobianLogTable [ ind ] ;
}
}
2011-03-07 23:31:58 +08:00
2011-12-29 13:41:59 +08:00
public static double phredScaleToProbability ( byte q ) {
return Math . pow ( 10 , ( - q ) / 10.0 ) ;
2011-06-23 06:54:33 +08:00
}
2011-12-29 13:41:59 +08:00
public static double phredScaleToLog10Probability ( byte q ) {
return ( ( - q ) / 10.0 ) ;
2011-06-23 06:54:33 +08:00
}
2011-08-05 05:49:08 +08:00
/ * *
* Returns the phred scaled value of probability p
2011-12-29 13:41:59 +08:00
*
2011-08-05 05:49:08 +08:00
* @param p probability ( between 0 and 1 ) .
* @return phred scaled probability of p
* /
2011-12-29 13:41:59 +08:00
public static byte probabilityToPhredScale ( double p ) {
2011-06-23 06:54:33 +08:00
return ( byte ) ( ( - 10 ) * Math . log10 ( p ) ) ;
}
2011-12-29 13:41:59 +08:00
public static double log10ProbabilityToPhredScale ( double log10p ) {
2011-06-23 06:54:33 +08:00
return ( - 10 ) * log10p ;
}
2011-06-23 06:55:09 +08:00
/ * *
* Converts LN to LOG10
2011-12-29 13:41:59 +08:00
*
2011-06-23 06:55:09 +08:00
* @param ln log ( x )
* @return log10 ( x )
* /
2011-12-29 13:41:59 +08:00
public static double lnToLog10 ( double ln ) {
2011-06-23 06:55:09 +08:00
return ln * Math . log10 ( Math . exp ( 1 ) ) ;
}
/ * *
* Constants to simplify the log gamma function calculation .
* /
2012-02-02 08:35:33 +08:00
private static final double zero = 0.0 , one = 1.0 , half = .5 , a0 = 7.72156649015328655494e-02 , a1 = 3.22467033424113591611e-01 , a2 = 6.73523010531292681824e-02 , a3 = 2.05808084325167332806e-02 , a4 = 7.38555086081402883957e-03 , a5 = 2.89051383673415629091e-03 , a6 = 1.19270763183362067845e-03 , a7 = 5.10069792153511336608e-04 , a8 = 2.20862790713908385557e-04 , a9 = 1.08011567247583939954e-04 , a10 = 2.52144565451257326939e-05 , a11 = 4.48640949618915160150e-05 , tc = 1.46163214496836224576e+00 , tf = - 1.21486290535849611461e-01 , tt = - 3.63867699703950536541e-18 , t0 = 4.83836122723810047042e-01 , t1 = - 1.47587722994593911752e-01 , t2 = 6.46249402391333854778e-02 , t3 = - 3.27885410759859649565e-02 , t4 = 1.79706750811820387126e-02 , t5 = - 1.03142241298341437450e-02 , t6 = 6.10053870246291332635e-03 , t7 = - 3.68452016781138256760e-03 , t8 = 2.25964780900612472250e-03 , t9 = - 1.40346469989232843813e-03 , t10 = 8.81081882437654011382e-04 , t11 = - 5.38595305356740546715e-04 , t12 = 3.15632070903625950361e-04 , t13 = - 3.12754168375120860518e-04 , t14 = 3.35529192635519073543e-04 , u0 = - 7.72156649015328655494e-02 , u1 = 6.32827064025093366517e-01 , u2 = 1.45492250137234768737e+00 , u3 = 9.77717527963372745603e-01 , u4 = 2.28963728064692451092e-01 , u5 = 1.33810918536787660377e-02 , v1 = 2.45597793713041134822e+00 , v2 = 2.12848976379893395361e+00 , v3 = 7.69285150456672783825e-01 , v4 = 1.04222645593369134254e-01 , v5 = 3.21709242282423911810e-03 , s0 = - 7.72156649015328655494e-02 , s1 = 2.14982415960608852501e-01 , s2 = 3.25778796408930981787e-01 , s3 = 1.46350472652464452805e-01 , s4 = 2.66422703033638609560e-02 , s5 = 1.84028451407337715652e-03 , s6 = 3.19475326584100867617e-05 , r1 = 1.39200533467621045958e+00 , r2 = 7.21935547567138069525e-01 , r3 = 1.71933865632803078993e-01 , r4 = 1.86459191715652901344e-02 , r5 = 7.77942496381893596434e-04 , r6 = 7.32668430744625636189e-06 , w0 = 4.18938533204672725052e-01 , w1 = 8.33333333333329678849e-02 , w2 = - 2.77777777728775536470e-03 , w3 = 7.93650558643019558500e-04 , w4 = - 5.95187557450339963135e-04 , w5 = 8.36339918996282139126e-04 , w6 = - 1.63092934096575273989e-03 ;
2011-06-23 06:55:09 +08:00
/ * *
* Efficient rounding functions to simplify the log gamma function calculation
2011-12-29 13:41:59 +08:00
* double to long with 32 bit shift
2011-06-23 06:55:09 +08:00
* /
2011-12-29 13:41:59 +08:00
private static final int HI ( double x ) {
return ( int ) ( Double . doubleToLongBits ( x ) > > 32 ) ;
2011-06-23 06:55:09 +08:00
}
/ * *
* Efficient rounding functions to simplify the log gamma function calculation
2011-12-29 13:41:59 +08:00
* double to long without shift
2011-06-23 06:55:09 +08:00
* /
2011-12-29 13:41:59 +08:00
private static final int LO ( double x ) {
return ( int ) Double . doubleToLongBits ( x ) ;
2011-06-23 06:55:09 +08:00
}
/ * *
* Most efficent implementation of the lnGamma ( FDLIBM )
* Use via the log10Gamma wrapper method .
* /
2011-12-29 13:41:59 +08:00
private static double lnGamma ( double x ) {
double t , y , z , p , p1 , p2 , p3 , q , r , w ;
2011-06-23 06:55:09 +08:00
int i ;
int hx = HI ( x ) ;
int lx = LO ( x ) ;
/* purge off +-inf, NaN, +-0, and negative arguments */
2011-12-29 13:41:59 +08:00
int ix = hx & 0x7fffffff ;
2012-02-02 08:35:33 +08:00
if ( ix > = 0x7ff00000 )
return Double . POSITIVE_INFINITY ;
if ( ( ix | lx ) = = 0 | | hx < 0 )
return Double . NaN ;
2011-12-29 13:41:59 +08:00
if ( ix < 0x3b900000 ) { /* |x|<2**-70, return -log(|x|) */
2011-06-23 06:55:09 +08:00
return - Math . log ( x ) ;
}
/* purge off 1 and 2 */
2012-02-02 08:35:33 +08:00
if ( ( ( ( ix - 0x3ff00000 ) | lx ) = = 0 ) | | ( ( ( ix - 0x40000000 ) | lx ) = = 0 ) )
r = 0 ;
2011-12-29 13:41:59 +08:00
/* for x < 2.0 */
else if ( ix < 0x40000000 ) {
if ( ix < = 0x3feccccc ) { /* lgamma(x) = lgamma(x+1)-log(x) */
2011-06-23 06:55:09 +08:00
r = - Math . log ( x ) ;
2011-12-29 13:41:59 +08:00
if ( ix > = 0x3FE76944 ) {
y = one - x ;
i = 0 ;
2012-02-02 08:35:33 +08:00
}
else if ( ix > = 0x3FCDA661 ) {
2011-12-29 13:41:59 +08:00
y = x - ( tc - one ) ;
i = 1 ;
2012-02-02 08:35:33 +08:00
}
else {
2011-12-29 13:41:59 +08:00
y = x ;
i = 2 ;
}
2012-02-02 08:35:33 +08:00
}
else {
2011-06-23 06:55:09 +08:00
r = zero ;
2011-12-29 13:41:59 +08:00
if ( ix > = 0x3FFBB4C3 ) {
y = 2.0 - x ;
i = 0 ;
2012-02-02 08:35:33 +08:00
} /* [1.7316,2] */
else if ( ix > = 0x3FF3B4C4 ) {
2011-12-29 13:41:59 +08:00
y = x - tc ;
i = 1 ;
2012-02-02 08:35:33 +08:00
} /* [1.23,1.73] */
else {
2011-12-29 13:41:59 +08:00
y = x - one ;
i = 2 ;
}
2011-06-23 06:55:09 +08:00
}
2011-12-29 13:41:59 +08:00
switch ( i ) {
case 0 :
z = y * y ;
p1 = a0 + z * ( a2 + z * ( a4 + z * ( a6 + z * ( a8 + z * a10 ) ) ) ) ;
p2 = z * ( a1 + z * ( a3 + z * ( a5 + z * ( a7 + z * ( a9 + z * a11 ) ) ) ) ) ;
p = y * p1 + p2 ;
r + = ( p - 0.5 * y ) ;
break ;
case 1 :
z = y * y ;
w = z * y ;
p1 = t0 + w * ( t3 + w * ( t6 + w * ( t9 + w * t12 ) ) ) ; /* parallel comp */
p2 = t1 + w * ( t4 + w * ( t7 + w * ( t10 + w * t13 ) ) ) ;
p3 = t2 + w * ( t5 + w * ( t8 + w * ( t11 + w * t14 ) ) ) ;
p = z * p1 - ( tt - w * ( p2 + y * p3 ) ) ;
r + = ( tf + p ) ;
break ;
case 2 :
p1 = y * ( u0 + y * ( u1 + y * ( u2 + y * ( u3 + y * ( u4 + y * u5 ) ) ) ) ) ;
p2 = one + y * ( v1 + y * ( v2 + y * ( v3 + y * ( v4 + y * v5 ) ) ) ) ;
r + = ( - 0.5 * y + p1 / p2 ) ;
2011-06-23 06:55:09 +08:00
}
2012-02-02 08:35:33 +08:00
}
else if ( ix < 0x40200000 ) { /* x < 8.0 */
2011-12-29 13:41:59 +08:00
i = ( int ) x ;
2011-06-23 06:55:09 +08:00
t = zero ;
2011-12-29 13:41:59 +08:00
y = x - ( double ) i ;
p = y * ( s0 + y * ( s1 + y * ( s2 + y * ( s3 + y * ( s4 + y * ( s5 + y * s6 ) ) ) ) ) ) ;
q = one + y * ( r1 + y * ( r2 + y * ( r3 + y * ( r4 + y * ( r5 + y * r6 ) ) ) ) ) ;
r = half * y + p / q ;
z = one ; /* lgamma(1+s) = log(s) + lgamma(s) */
switch ( i ) {
case 7 :
z * = ( y + 6.0 ) ; /* FALLTHRU */
case 6 :
z * = ( y + 5.0 ) ; /* FALLTHRU */
case 5 :
z * = ( y + 4.0 ) ; /* FALLTHRU */
case 4 :
z * = ( y + 3.0 ) ; /* FALLTHRU */
case 3 :
z * = ( y + 2.0 ) ; /* FALLTHRU */
r + = Math . log ( z ) ;
break ;
2011-06-23 06:55:09 +08:00
}
/* 8.0 <= x < 2**58 */
2012-02-02 08:35:33 +08:00
}
else if ( ix < 0x43900000 ) {
2011-06-23 06:55:09 +08:00
t = Math . log ( x ) ;
2011-12-29 13:41:59 +08:00
z = one / x ;
y = z * z ;
w = w0 + z * ( w1 + y * ( w2 + y * ( w3 + y * ( w4 + y * ( w5 + y * w6 ) ) ) ) ) ;
r = ( x - half ) * ( t - one ) + w ;
2012-02-02 08:35:33 +08:00
}
else
2011-06-23 06:55:09 +08:00
/* 2**58 <= x <= inf */
2011-12-29 13:41:59 +08:00
r = x * ( Math . log ( x ) - one ) ;
2011-06-23 06:55:09 +08:00
return r ;
}
/ * *
* Calculates the log10 of the gamma function for x using the efficient FDLIBM
* implementation to avoid overflows and guarantees high accuracy even for large
* numbers .
*
* @param x the x parameter
* @return the log10 of the gamma function at x .
* /
2011-12-29 13:41:59 +08:00
public static double log10Gamma ( double x ) {
2011-06-23 06:55:09 +08:00
return lnToLog10 ( lnGamma ( x ) ) ;
}
/ * *
2011-06-23 06:55:15 +08:00
* Calculates the log10 of the binomial coefficient . Designed to prevent
* overflows even with very large numbers .
2011-06-23 06:55:09 +08:00
*
2011-06-23 06:55:15 +08:00
* @param n total number of trials
2011-06-23 06:55:09 +08:00
* @param k number of successes
* @return the log10 of the binomial coefficient
* /
2011-12-29 13:41:59 +08:00
public static double log10BinomialCoefficient ( int n , int k ) {
return log10Gamma ( n + 1 ) - log10Gamma ( k + 1 ) - log10Gamma ( n - k + 1 ) ;
2011-06-23 06:55:09 +08:00
}
2011-12-29 13:41:59 +08:00
public static double log10BinomialProbability ( int n , int k , double log10p ) {
double log10OneMinusP = Math . log10 ( 1 - Math . pow ( 10 , log10p ) ) ;
return log10BinomialCoefficient ( n , k ) + log10p * k + log10OneMinusP * ( n - k ) ;
2011-06-23 06:55:15 +08:00
}
2011-06-23 06:55:09 +08:00
/ * *
2011-06-23 06:55:15 +08:00
* Calculates the log10 of the multinomial coefficient . Designed to prevent
* overflows even with very large numbers .
2011-06-23 06:55:09 +08:00
*
2011-06-23 06:55:15 +08:00
* @param n total number of trials
2011-06-23 06:55:09 +08:00
* @param k array of any size with the number of successes for each grouping ( k1 , k2 , k3 , . . . , km )
* @return
* /
2011-12-29 13:41:59 +08:00
public static double log10MultinomialCoefficient ( int n , int [ ] k ) {
2011-06-23 06:55:09 +08:00
double denominator = 0.0 ;
2011-06-23 06:55:15 +08:00
for ( int x : k ) {
2011-12-29 13:41:59 +08:00
denominator + = log10Gamma ( x + 1 ) ;
2011-06-23 06:55:09 +08:00
}
2011-12-29 13:41:59 +08:00
return log10Gamma ( n + 1 ) - denominator ;
2011-06-23 06:55:09 +08:00
}
2011-06-23 06:55:15 +08:00
/ * *
* Computes the log10 of the multinomial distribution probability given a vector
* of log10 probabilities . Designed to prevent overflows even with very large numbers .
*
2011-12-29 13:41:59 +08:00
* @param n number of trials
* @param k array of number of successes for each possibility
2011-06-23 06:55:15 +08:00
* @param log10p array of log10 probabilities
* @return
* /
2011-12-29 13:41:59 +08:00
public static double log10MultinomialProbability ( int n , int [ ] k , double [ ] log10p ) {
2011-06-23 06:55:15 +08:00
if ( log10p . length ! = k . length )
throw new UserException . BadArgumentValue ( "p and k" , "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p . length + ", " + k . length ) ;
double log10Prod = 0.0 ;
2011-12-29 13:41:59 +08:00
for ( int i = 0 ; i < log10p . length ; i + + ) {
log10Prod + = log10p [ i ] * k [ i ] ;
2011-06-23 06:55:15 +08:00
}
return log10MultinomialCoefficient ( n , k ) + log10Prod ;
}
2011-12-29 13:41:59 +08:00
public static double factorial ( int x ) {
return Math . pow ( 10 , log10Gamma ( x + 1 ) ) ;
}
public static double log10Factorial ( int x ) {
return log10Gamma ( x + 1 ) ;
2011-06-23 06:55:20 +08:00
}
2011-12-29 13:41:59 +08:00
/ * *
* Adds two arrays together and returns a new array with the sum .
*
* @param a one array
* @param b another array
* @return a new array with the sum of a and b
* /
@Requires ( "a.length == b.length" )
@Ensures ( "result.length == a.length" )
public static int [ ] addArrays ( int [ ] a , int [ ] b ) {
int [ ] c = new int [ a . length ] ;
for ( int i = 0 ; i < a . length ; i + + )
c [ i ] = a [ i ] + b [ i ] ;
return c ;
}
/ * *
* Quick implementation of the Knuth - shuffle algorithm to generate a random
* permutation of the given array .
*
* @param array the original array
* @return a new array with the elements shuffled
* /
public static Object [ ] arrayShuffle ( Object [ ] array ) {
int n = array . length ;
Object [ ] shuffled = array . clone ( ) ;
for ( int i = 0 ; i < n ; i + + ) {
int j = i + GenomeAnalysisEngine . getRandomGenerator ( ) . nextInt ( n - i ) ;
Object tmp = shuffled [ i ] ;
shuffled [ i ] = shuffled [ j ] ;
shuffled [ j ] = tmp ;
}
return shuffled ;
2011-06-23 06:55:20 +08:00
}
2012-01-19 09:54:10 +08:00
/ * *
* Vector operations
2012-02-02 08:35:33 +08:00
*
2012-01-23 22:22:31 +08:00
* @param v1 first numerical array
* @param v2 second numerical array
2012-02-02 08:35:33 +08:00
* @return a new array with the elements added
2012-01-19 09:54:10 +08:00
* /
2012-01-23 22:22:31 +08:00
public static < E extends Number > Double [ ] vectorSum ( E v1 [ ] , E v2 [ ] ) {
2012-01-19 09:54:10 +08:00
if ( v1 . length ! = v2 . length )
throw new UserException ( "BUG: vectors v1, v2 of different size in vectorSum()" ) ;
2012-01-23 22:22:31 +08:00
Double [ ] result = new Double [ v1 . length ] ;
2012-02-02 08:35:33 +08:00
for ( int k = 0 ; k < v1 . length ; k + + )
result [ k ] = v1 [ k ] . doubleValue ( ) + v2 [ k ] . doubleValue ( ) ;
2012-01-19 09:54:10 +08:00
return result ;
}
2012-01-23 22:22:31 +08:00
public static < E extends Number > Double [ ] scalarTimesVector ( E a , E [ ] v1 ) {
2012-01-19 09:54:10 +08:00
2012-01-23 22:22:31 +08:00
Double result [ ] = new Double [ v1 . length ] ;
2012-02-02 08:35:33 +08:00
for ( int k = 0 ; k < v1 . length ; k + + )
result [ k ] = a . doubleValue ( ) * v1 [ k ] . doubleValue ( ) ;
2012-01-19 09:54:10 +08:00
return result ;
}
2012-02-02 08:35:33 +08:00
public static < E extends Number > Double dotProduct ( E [ ] v1 , E [ ] v2 ) {
2012-01-19 09:54:10 +08:00
if ( v1 . length ! = v2 . length )
throw new UserException ( "BUG: vectors v1, v2 of different size in vectorSum()" ) ;
2012-01-23 22:22:31 +08:00
Double result = 0.0 ;
2012-02-02 08:35:33 +08:00
for ( int k = 0 ; k < v1 . length ; k + + )
result + = v1 [ k ] . doubleValue ( ) * v2 [ k ] . doubleValue ( ) ;
2012-01-19 09:54:10 +08:00
return result ;
}
public static double [ ] vectorLog10 ( double v1 [ ] ) {
double result [ ] = new double [ v1 . length ] ;
2012-02-02 08:35:33 +08:00
for ( int k = 0 ; k < v1 . length ; k + + )
2012-01-19 09:54:10 +08:00
result [ k ] = Math . log10 ( v1 [ k ] ) ;
return result ;
}
2012-01-23 22:22:31 +08:00
// todo - silly overloading, just because Java can't unbox/box arrays of primitive types, and we can't do generics with primitive types!
public static Double [ ] vectorLog10 ( Double v1 [ ] ) {
Double result [ ] = new Double [ v1 . length ] ;
2012-02-02 08:35:33 +08:00
for ( int k = 0 ; k < v1 . length ; k + + )
2012-01-23 22:22:31 +08:00
result [ k ] = Math . log10 ( v1 [ k ] ) ;
return result ;
}
2011-08-09 01:34:04 +08:00
}