diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 759e1649d..737f4bb5f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -49,8 +50,11 @@ public class MathUtils { */ - /** Private constructor. No instantiating this class! */ - private MathUtils() {} + /** + * Private constructor. No instantiating this class! + */ + private MathUtils() { + } @Requires({"d > 0.0"}) public static int fastPositiveRound(double d) { @@ -58,21 +62,21 @@ public class MathUtils { } public static int fastRound(double d) { - if ( d > 0.0 ) { + if (d > 0.0) { return fastPositiveRound(d); } else { - return -1*fastPositiveRound(-1*d); + return -1 * fastPositiveRound(-1 * d); } } public static double sum(Collection numbers) { - return sum(numbers,false); + return sum(numbers, false); } - public static double sum( Collection numbers, boolean ignoreNan ) { + public static double sum(Collection numbers, boolean ignoreNan) { double sum = 0; - for ( Number n : numbers ) { - if ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) { + for (Number n : numbers) { + if (!ignoreNan || !Double.isNaN(n.doubleValue())) { sum += n.doubleValue(); } } @@ -82,66 +86,72 @@ public class MathUtils { public static int nonNanSize(Collection numbers) { int size = 0; - for ( Number n : numbers) { + for (Number n : numbers) { size += Double.isNaN(n.doubleValue()) ? 0 : 1; } return size; } - public static double average( Collection numbers, boolean ignoreNan) { - if ( ignoreNan ) { - return sum(numbers,true)/nonNanSize(numbers); + public static double average(Collection numbers, boolean ignoreNan) { + if (ignoreNan) { + return sum(numbers, true) / nonNanSize(numbers); } else { - return sum(numbers,false)/nonNanSize(numbers); + return sum(numbers, false) / nonNanSize(numbers); } } - public static double variance( Collection numbers, Number mean, boolean ignoreNan ) { + public static double variance(Collection numbers, Number mean, boolean ignoreNan) { double mn = mean.doubleValue(); double var = 0; - for ( Number n : numbers ) { var += ( ! ignoreNan || ! Double.isNaN(n.doubleValue())) ? (n.doubleValue()-mn)*(n.doubleValue()-mn) : 0; } - if ( ignoreNan ) { return var/(nonNanSize(numbers)-1); } - return var/(numbers.size()-1); + for (Number n : numbers) { + var += (!ignoreNan || !Double.isNaN(n.doubleValue())) ? (n.doubleValue() - mn) * (n.doubleValue() - mn) : 0; + } + if (ignoreNan) { + return var / (nonNanSize(numbers) - 1); + } + return var / (numbers.size() - 1); } public static double variance(Collection numbers, Number mean) { - return variance(numbers,mean,false); + return variance(numbers, mean, false); } public static double variance(Collection numbers, boolean ignoreNan) { - return variance(numbers,average(numbers,ignoreNan),ignoreNan); + return variance(numbers, average(numbers, ignoreNan), ignoreNan); } public static double variance(Collection numbers) { - return variance(numbers,average(numbers,false),false); + return variance(numbers, average(numbers, false), false); } public static double sum(double[] values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) s += v; return s; } /** * Calculates the log10 cumulative sum of an array with log10 probabilities + * * @param log10p the array with log10 probabilites - * @param upTo index in the array to calculate the cumsum up to + * @param upTo index in the array to calculate the cumsum up to * @return the log10 of the cumulative sum */ - public static double log10CumulativeSumLog10(double [] log10p, int upTo) { + public static double log10CumulativeSumLog10(double[] log10p, int upTo) { return log10sumLog10(log10p, 0, upTo); } /** * Converts a real space array of probabilities into a log10 array + * * @param prRealSpace * @return */ public static double[] toLog10(double[] prRealSpace) { double[] log10s = new double[prRealSpace.length]; - for ( int i = 0; i < prRealSpace.length; i++ ) + for (int i = 0; i < prRealSpace.length; i++) log10s[i] = Math.log10(prRealSpace[i]); return log10s; } @@ -154,7 +164,7 @@ public class MathUtils { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); - for ( int i = start; i < finish; i++ ) { + for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -163,13 +173,13 @@ public class MathUtils { public static double sumDoubles(List values) { double s = 0.0; - for ( double v : values) s += v; + for (double v : values) s += v; return s; } public static int sumIntegers(List values) { int s = 0; - for ( int v : values) s += v; + for (int v : values) s += v; return s; } @@ -185,11 +195,11 @@ public class MathUtils { } public static boolean wellFormedDouble(double val) { - return ! Double.isInfinite(val) && ! Double.isNaN(val); + return !Double.isInfinite(val) && !Double.isNaN(val); } public static double bound(double value, double minBoundary, double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); + return Math.max(Math.min(value, maxBoundary), minBoundary); } public static boolean isBounded(double val, double lower, double upper) { @@ -197,7 +207,7 @@ public class MathUtils { } public static boolean isPositive(double val) { - return ! isNegativeOrZero(val); + return !isNegativeOrZero(val); } public static boolean isPositiveOrZero(double val) { @@ -209,17 +219,19 @@ public class MathUtils { } public static boolean isNegative(double val) { - return ! isPositiveOrZero(val); + return !isPositiveOrZero(val); } /** * Compares double values for equality (within 1e-6), or inequality. * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b) { return compareDoubles(a, b, 1e-6); } + public static byte compareDoubles(double a, double b) { + return compareDoubles(a, b, 1e-6); + } /** * Compares double values for equality (within epsilon), or inequality. @@ -227,23 +239,28 @@ public class MathUtils { * @param a the first double value * @param b the second double value * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareDoubles(double a, double b, double epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareDoubles(double a, double b, double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } /** * Compares float values for equality (within 1e-6), or inequality. * - * @param a the first float value - * @param b the second float value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + * @param a the first float value + * @param b the second float value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b) { return compareFloats(a, b, 1e-6f); } + public static byte compareFloats(float a, float b) { + return compareFloats(a, b, 1e-6f); + } /** * Compares float values for equality (within epsilon), or inequality. @@ -251,47 +268,50 @@ public class MathUtils { * @param a the first float value * @param b the second float value * @param epsilon the precision within which two float values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. */ - public static byte compareFloats(float a, float b, float epsilon) - { - if (Math.abs(a - b) < epsilon) { return 0; } - if (a > b) { return -1; } + public static byte compareFloats(float a, float b, float epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } return 1; } - public static double NormalDistribution(double mean, double sd, double x) - { - double a = 1.0 / (sd*Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean,2.0)/(2.0 * sd * sd))); + public static double NormalDistribution(double mean, double sd, double x) { + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); return a * b; } - public static double binomialCoefficient (int n, int k) { + public static double binomialCoefficient(int n, int k) { return Math.pow(10, log10BinomialCoefficient(n, k)); } + /** * Computes a binomial probability. This is computed using the formula - * - * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - * + *

+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) + *

* where n is the number of trials, k is the number of successes, and p is the probability of success * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. */ - public static double binomialProbability (int n, int k, double p) { + public static double binomialProbability(int n, int k, double p) { return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); } /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) - * @param total - number of attempts for the number of hits + * + * @param start - start of the cumulant sum (over hits) + * @param end - end of the cumulant sum (over hits) + * @param total - number of attempts for the number of hits * @param probHit - probability of a successful hit * @return - returns the cumulative probability */ @@ -300,11 +320,11 @@ public class MathUtils { double prevProb; BigDecimal probCache = BigDecimal.ZERO; - for(int hits = start; hits < end; hits++) { + for (int hits = start; hits < end; hits++) { prevProb = cumProb; double probability = binomialProbability(total, hits, probHit); cumProb += probability; - if ( probability > 0 && cumProb - prevProb < probability/2 ) { // loss of precision + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision probCache = probCache.add(new BigDecimal(prevProb)); cumProb = 0.0; hits--; // repeat loop @@ -314,20 +334,20 @@ public class MathUtils { return probCache.add(new BigDecimal(cumProb)).doubleValue(); } - + /** * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - * + *

+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + *

* where xi represents the number of times outcome i was observed, n is the number of total observations. * In this implementation, the value of n is inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. */ - public static double multinomialCoefficient (int [] k) { + public static double multinomialCoefficient(int[] k) { int n = 0; for (int xi : k) { n += xi; @@ -339,37 +359,38 @@ public class MathUtils { /** * Computes a multinomial probability efficiently avoiding overflow even for large numbers. * This is computed using the formula: - * - * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - * + *

+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) + *

* where xi represents the number of times outcome i was observed, n is the number of total observations, and * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is * inferred as the sum over i of xi. * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. */ - public static double multinomialProbability (int[] k, double[] p) { + public static double multinomialProbability(int[] k, double[] p) { if (p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); int n = 0; - double [] log10P = new double[p.length]; - for (int i=0; i array[maxI] ) + for (int i = 0; i < array.length; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -532,11 +554,11 @@ public class MathUtils { } public static int maxElementIndex(int[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int maxI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( maxI == -1 || array[i] > array[maxI] ) + for (int i = 0; i < array.length; i++) { + if (maxI == -1 || array[i] > array[maxI]) maxI = i; } @@ -551,16 +573,20 @@ public class MathUtils { return array[minElementIndex(array)]; } + public static int arrayMin(int[] array) { + return array[minElementIndex(array)]; + } + public static byte arrayMin(byte[] array) { return array[minElementIndex(array)]; } public static int minElementIndex(double[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } @@ -568,32 +594,44 @@ public class MathUtils { } public static int minElementIndex(byte[] array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); int minI = -1; - for ( int i = 0; i < array.length; i++ ) { - if ( minI == -1 || array[i] < array[minI] ) + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) minI = i; } return minI; - } + } + + public static int minElementIndex(int[] array) { + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + + int minI = -1; + for (int i = 0; i < array.length; i++) { + if (minI == -1 || array[i] < array[minI]) + minI = i; + } + + return minI; + } public static int arrayMaxInt(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!"); int m = array.get(0); - for ( int e : array ) m = Math.max(m, e); + for (int e : array) m = Math.max(m, e); return m; } public static double arrayMaxDouble(List array) { - if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); - if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + if (array == null) throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!"); double m = array.get(0); - for ( double e : array ) m = Math.max(m, e); + for (double e : array) m = Math.max(m, e); return m; } @@ -636,7 +674,7 @@ public class MathUtils { for (byte v : vals) { sum += v; } - return (byte) Math.floor(sum/vals.length); + return (byte) Math.floor(sum / vals.length); } public static double averageDouble(List vals) { @@ -749,7 +787,9 @@ public class MathUtils { } - /** Draw N random elements from list. */ + /** + * Draw N random elements from list. + */ public static List randomSubset(List list, int N) { if (list.size() <= N) { return list; @@ -770,6 +810,25 @@ public class MathUtils { return ans; } + /** + * Draw N random elements from an array. + * + * @param array your objects + * @param n number of elements to select at random from the list + * @return a new list with the N randomly chosen elements from list + */ + @Requires({"array != null", "n>=0"}) + @Ensures({"result != null", "result.length == Math.min(n, array.length)"}) + public static Object[] randomSubset(final Object[] array, final int n) { + if (array.length <= n) + return array.clone(); + + Object[] shuffledArray = arrayShuffle(array); + Object[] result = new Object[n]; + System.arraycopy(shuffledArray, 0, result, 0, n); + return result; + } + public static double percentage(double x, double base) { return (base > 0 ? (x / base) * 100.0 : 0); } @@ -799,7 +858,7 @@ public class MathUtils { return count; } - public static int countOccurrences(byte element, byte [] array) { + public static int countOccurrences(byte element, byte[] array) { int count = 0; for (byte y : array) { if (element == y) @@ -814,13 +873,13 @@ public class MathUtils { * Better than sorting if N (number of elements to return) is small * * @param array the array - * @param n number of top elements to return + * @param n number of top elements to return * @return the n larger elements of the array */ - public static Collection getNMaxElements(double [] array, int n) { + public static Collection getNMaxElements(double[] array, int n) { ArrayList maxN = new ArrayList(n); double lastMax = Double.MAX_VALUE; - for (int i=0; i sampleIndicesWithReplacement(int n, int k) { - ArrayList chosen_balls = new ArrayList (k); - for (int i=0; i< k; i++) { + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { //Integer chosen_ball = balls[rand.nextInt(k)]; chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); //balls.remove(chosen_ball); @@ -872,11 +931,11 @@ public class MathUtils { /** * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices */ static public ArrayList sliceListByIndices(List indices, List list) { ArrayList subset = new ArrayList(); @@ -898,18 +957,18 @@ public class MathUtils { ArrayList equalToX = new ArrayList(); ArrayList greaterThanX = new ArrayList(); - for(Comparable y : list) { - if(x.compareTo(y) > 0) { + for (Comparable y : list) { + if (x.compareTo(y) > 0) { lessThanX.add(y); - } else if(x.compareTo(y) < 0) { + } else if (x.compareTo(y) < 0) { greaterThanX.add(y); } else equalToX.add(y); } - if(lessThanX.size() > orderStat) + if (lessThanX.size() > orderStat) return orderStatisticSearch(orderStat, lessThanX); - else if(lessThanX.size() + equalToX.size() >= orderStat) + else if (lessThanX.size() + equalToX.size() >= orderStat) return orderStat; else return orderStatisticSearch(orderStat - lessThanX.size() - equalToX.size(), greaterThanX); @@ -918,7 +977,7 @@ public class MathUtils { public static Object getMedian(List list) { - return orderStatisticSearch((int) Math.ceil(list.size()/2), list); + return orderStatisticSearch((int) Math.ceil(list.size() / 2), list); } public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { @@ -926,7 +985,7 @@ public class MathUtils { // list index maps to a q-score only through the offset index // returns the kth-largest q-score. - if( reads.size() == 0) { + if (reads.size() == 0) { return 0; } @@ -938,15 +997,15 @@ public class MathUtils { final byte qk = reads.get(k).getBaseQualities()[offsets.get(k)]; - for(int iter = 0; iter < reads.size(); iter ++) { + for (int iter = 0; iter < reads.size(); iter++) { SAMRecord read = reads.get(iter); int offset = offsets.get(iter); byte quality = read.getBaseQualities()[offset]; - if(quality < qk) { + if (quality < qk) { lessThanQReads.add(read); lessThanQOffsets.add(offset); - } else if(quality > qk) { + } else if (quality > qk) { greaterThanQReads.add(read); greaterThanQOffsets.add(offset); } else { @@ -954,9 +1013,9 @@ public class MathUtils { } } - if(lessThanQReads.size() > k) + if (lessThanQReads.size() > k) return getQScoreOrderStatistic(lessThanQReads, lessThanQOffsets, k); - else if(equalToQReads.size() + lessThanQReads.size() >= k) + else if (equalToQReads.size() + lessThanQReads.size() >= k) return qk; else return getQScoreOrderStatistic(greaterThanQReads, greaterThanQOffsets, k - lessThanQReads.size() - equalToQReads.size()); @@ -964,10 +1023,11 @@ public class MathUtils { } public static byte getQScoreMedian(List reads, List offsets) { - return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); + return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } - /** A utility class that computes on the fly average and standard deviation for a stream of numbers. + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that * it could overflow any naive summation-based scheme or cause loss of precision). * Instead, adding a new number observed @@ -983,20 +1043,31 @@ public class MathUtils { public void add(double obs) { obs_count++; double oldMean = mean; - mean += ( obs - mean ) / obs_count; // update mean - s += ( obs - oldMean ) * ( obs - mean ); + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); } public void addAll(Collection col) { - for ( Number o : col ) { + for (Number o : col) { add(o.doubleValue()); } } - public double mean() { return mean; } - public double stddev() { return Math.sqrt(s/(obs_count - 1)); } - public double var() { return s/(obs_count - 1); } - public long observationCount() { return obs_count; } + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } public RunningAverage clone() { RunningAverage ra = new RunningAverage(); @@ -1007,71 +1078,86 @@ public class MathUtils { } public void merge(RunningAverage other) { - if ( this.obs_count > 0 || other.obs_count > 0 ) { // if we have any observations at all - this.mean = ( this.mean * this.obs_count + other.mean * other.obs_count ) / ( this.obs_count + other.obs_count ); + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); this.s += other.s; } this.obs_count += other.obs_count; } } - + // // useful common utility routines // - public static double rate(long n, long d) { return n / (1.0 * Math.max(d, 1)); } - public static double rate(int n, int d) { return n / (1.0 * Math.max(d, 1)); } + public static double rate(long n, long d) { + return n / (1.0 * Math.max(d, 1)); + } - public static long inverseRate(long n, long d) { return n == 0 ? 0 : d / Math.max(n, 1); } - public static long inverseRate(int n, int d) { return n == 0 ? 0 : d / Math.max(n, 1); } + public static double rate(int n, int d) { + return n / (1.0 * Math.max(d, 1)); + } - public static double ratio(int num, int denom) { return ((double)num) / (Math.max(denom, 1)); } - public static double ratio(long num, long denom) { return ((double)num) / (Math.max(denom, 1)); } + public static long inverseRate(long n, long d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + public static long inverseRate(int n, int d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + public static double ratio(int num, int denom) { + return ((double) num) / (Math.max(denom, 1)); + } + + public static double ratio(long num, long denom) { + return ((double) num) / (Math.max(denom, 1)); + } public static final double[] log10Cache; public static final double[] jacobianLogTable; public static final int JACOBIAN_LOG_TABLE_SIZE = 101; public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; - public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0/JACOBIAN_LOG_TABLE_STEP; + public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; public static final double MAX_JACOBIAN_TOLERANCE = 10.0; private static final int MAXN = 11000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients static { log10Cache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k=1; k < LOG10_CACHE_SIZE; k++) + for (int k = 1; k < LOG10_CACHE_SIZE; k++) log10Cache[k] = Math.log10(k); - for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) - * JACOBIAN_LOG_TABLE_STEP)); + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) + * JACOBIAN_LOG_TABLE_STEP)); - } + } } static public double softMax(final double[] vec) { double acc = vec[0]; - for (int k=1; k < vec.length; k++) - acc = softMax(acc,vec[k]); + for (int k = 1; k < vec.length; k++) + acc = softMax(acc, vec[k]); return acc; } static public double max(double x0, double x1, double x2) { - double a = Math.max(x0,x1); - return Math.max(a,x2); + double a = Math.max(x0, x1); + return Math.max(a, x2); } - - static public double softMax(final double x0, final double x1, final double x2) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - // better approximation: do Jacobian logarithm function on data pairs - double a = softMax(x0,x1); - return softMax(a,x2); + static public double softMax(final double x0, final double x1, final double x2) { + // compute naively log10(10^x[0] + 10^x[1]+...) + // return Math.log10(MathUtils.sumLog10(vec)); + + // better approximation: do Jacobian logarithm function on data pairs + double a = softMax(x0, x1); + return softMax(a, x2); } static public double softMax(final double x, final double y) { @@ -1084,49 +1170,50 @@ public class MathUtils { // slow exact version: // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - double diff = x-y; + double diff = x - y; if (diff > MAX_JACOBIAN_TOLERANCE) return x; else if (diff < -MAX_JACOBIAN_TOLERANCE) return y; else if (diff >= 0) { - int ind = (int)(diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); return x + jacobianLogTable[ind]; - } - else { - int ind = (int)(-diff*INV_JACOBIAN_LOG_TABLE_STEP+0.5); + } else { + int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); return y + jacobianLogTable[ind]; } } - public static double phredScaleToProbability (byte q) { - return Math.pow(10,(-q)/10.0); + public static double phredScaleToProbability(byte q) { + return Math.pow(10, (-q) / 10.0); } - public static double phredScaleToLog10Probability (byte q) { - return ((-q)/10.0); + public static double phredScaleToLog10Probability(byte q) { + return ((-q) / 10.0); } /** * Returns the phred scaled value of probability p + * * @param p probability (between 0 and 1). * @return phred scaled probability of p */ - public static byte probabilityToPhredScale (double p) { + public static byte probabilityToPhredScale(double p) { return (byte) ((-10) * Math.log10(p)); } - public static double log10ProbabilityToPhredScale (double log10p) { + public static double log10ProbabilityToPhredScale(double log10p) { return (-10) * log10p; } /** * Converts LN to LOG10 + * * @param ln log(x) * @return log10(x) */ - public static double lnToLog10 (double ln) { + public static double lnToLog10(double ln) { return ln * Math.log10(Math.exp(1)); } @@ -1134,169 +1221,190 @@ public class MathUtils { * Constants to simplify the log gamma function calculation. */ private static final double - zero = 0.0, - one = 1.0, - half = .5, - a0 = 7.72156649015328655494e-02, - a1 = 3.22467033424113591611e-01, - a2 = 6.73523010531292681824e-02, - a3 = 2.05808084325167332806e-02, - a4 = 7.38555086081402883957e-03, - a5 = 2.89051383673415629091e-03, - a6 = 1.19270763183362067845e-03, - a7 = 5.10069792153511336608e-04, - a8 = 2.20862790713908385557e-04, - a9 = 1.08011567247583939954e-04, - a10 = 2.52144565451257326939e-05, - a11 = 4.48640949618915160150e-05, - tc = 1.46163214496836224576e+00, - tf = -1.21486290535849611461e-01, - tt = -3.63867699703950536541e-18, - t0 = 4.83836122723810047042e-01, - t1 = -1.47587722994593911752e-01, - t2 = 6.46249402391333854778e-02, - t3 = -3.27885410759859649565e-02, - t4 = 1.79706750811820387126e-02, - t5 = -1.03142241298341437450e-02, - t6 = 6.10053870246291332635e-03, - t7 = -3.68452016781138256760e-03, - t8 = 2.25964780900612472250e-03, - t9 = -1.40346469989232843813e-03, - t10 = 8.81081882437654011382e-04, - t11 = -5.38595305356740546715e-04, - t12 = 3.15632070903625950361e-04, - t13 = -3.12754168375120860518e-04, - t14 = 3.35529192635519073543e-04, - u0 = -7.72156649015328655494e-02, - u1 = 6.32827064025093366517e-01, - u2 = 1.45492250137234768737e+00, - u3 = 9.77717527963372745603e-01, - u4 = 2.28963728064692451092e-01, - u5 = 1.33810918536787660377e-02, - v1 = 2.45597793713041134822e+00, - v2 = 2.12848976379893395361e+00, - v3 = 7.69285150456672783825e-01, - v4 = 1.04222645593369134254e-01, - v5 = 3.21709242282423911810e-03, - s0 = -7.72156649015328655494e-02, - s1 = 2.14982415960608852501e-01, - s2 = 3.25778796408930981787e-01, - s3 = 1.46350472652464452805e-01, - s4 = 2.66422703033638609560e-02, - s5 = 1.84028451407337715652e-03, - s6 = 3.19475326584100867617e-05, - r1 = 1.39200533467621045958e+00, - r2 = 7.21935547567138069525e-01, - r3 = 1.71933865632803078993e-01, - r4 = 1.86459191715652901344e-02, - r5 = 7.77942496381893596434e-04, - r6 = 7.32668430744625636189e-06, - w0 = 4.18938533204672725052e-01, - w1 = 8.33333333333329678849e-02, - w2 = -2.77777777728775536470e-03, - w3 = 7.93650558643019558500e-04, - w4 = -5.95187557450339963135e-04, - w5 = 8.36339918996282139126e-04, - w6 = -1.63092934096575273989e-03; + zero = 0.0, + one = 1.0, + half = .5, + a0 = 7.72156649015328655494e-02, + a1 = 3.22467033424113591611e-01, + a2 = 6.73523010531292681824e-02, + a3 = 2.05808084325167332806e-02, + a4 = 7.38555086081402883957e-03, + a5 = 2.89051383673415629091e-03, + a6 = 1.19270763183362067845e-03, + a7 = 5.10069792153511336608e-04, + a8 = 2.20862790713908385557e-04, + a9 = 1.08011567247583939954e-04, + a10 = 2.52144565451257326939e-05, + a11 = 4.48640949618915160150e-05, + tc = 1.46163214496836224576e+00, + tf = -1.21486290535849611461e-01, + tt = -3.63867699703950536541e-18, + t0 = 4.83836122723810047042e-01, + t1 = -1.47587722994593911752e-01, + t2 = 6.46249402391333854778e-02, + t3 = -3.27885410759859649565e-02, + t4 = 1.79706750811820387126e-02, + t5 = -1.03142241298341437450e-02, + t6 = 6.10053870246291332635e-03, + t7 = -3.68452016781138256760e-03, + t8 = 2.25964780900612472250e-03, + t9 = -1.40346469989232843813e-03, + t10 = 8.81081882437654011382e-04, + t11 = -5.38595305356740546715e-04, + t12 = 3.15632070903625950361e-04, + t13 = -3.12754168375120860518e-04, + t14 = 3.35529192635519073543e-04, + u0 = -7.72156649015328655494e-02, + u1 = 6.32827064025093366517e-01, + u2 = 1.45492250137234768737e+00, + u3 = 9.77717527963372745603e-01, + u4 = 2.28963728064692451092e-01, + u5 = 1.33810918536787660377e-02, + v1 = 2.45597793713041134822e+00, + v2 = 2.12848976379893395361e+00, + v3 = 7.69285150456672783825e-01, + v4 = 1.04222645593369134254e-01, + v5 = 3.21709242282423911810e-03, + s0 = -7.72156649015328655494e-02, + s1 = 2.14982415960608852501e-01, + s2 = 3.25778796408930981787e-01, + s3 = 1.46350472652464452805e-01, + s4 = 2.66422703033638609560e-02, + s5 = 1.84028451407337715652e-03, + s6 = 3.19475326584100867617e-05, + r1 = 1.39200533467621045958e+00, + r2 = 7.21935547567138069525e-01, + r3 = 1.71933865632803078993e-01, + r4 = 1.86459191715652901344e-02, + r5 = 7.77942496381893596434e-04, + r6 = 7.32668430744625636189e-06, + w0 = 4.18938533204672725052e-01, + w1 = 8.33333333333329678849e-02, + w2 = -2.77777777728775536470e-03, + w3 = 7.93650558643019558500e-04, + w4 = -5.95187557450339963135e-04, + w5 = 8.36339918996282139126e-04, + w6 = -1.63092934096575273989e-03; /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift + * double to long with 32 bit shift */ - private static final int HI (double x) { - return (int)(Double.doubleToLongBits(x) >> 32); + private static final int HI(double x) { + return (int) (Double.doubleToLongBits(x) >> 32); } /** * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift + * double to long without shift */ - private static final int LO (double x) { - return (int)Double.doubleToLongBits(x); + private static final int LO(double x) { + return (int) Double.doubleToLongBits(x); } /** * Most efficent implementation of the lnGamma (FDLIBM) * Use via the log10Gamma wrapper method. */ - private static double lnGamma (double x) { - double t,y,z,p,p1,p2,p3,q,r,w; + private static double lnGamma(double x) { + double t, y, z, p, p1, p2, p3, q, r, w; int i; int hx = HI(x); int lx = LO(x); /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx&0x7fffffff; + int ix = hx & 0x7fffffff; if (ix >= 0x7ff00000) return Double.POSITIVE_INFINITY; - if ((ix|lx)==0 || hx < 0) return Double.NaN; - if (ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + if ((ix | lx) == 0 || hx < 0) return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ return -Math.log(x); } /* purge off 1 and 2 */ - if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0; - /* for x < 2.0 */ - else if(ix<0x40000000) { - if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ r = -Math.log(x); - if(ix>=0x3FE76944) {y = one-x; i= 0;} - else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;} - else {y = x; i=2;} + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } else { + y = x; + i = 2; + } } else { r = zero; - if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */ - else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */ - else {y=x-one;i=2;} + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ else { + y = x - one; + i = 2; + } } - switch(i) { - case 0: - z = y*y; - p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); - p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); - p = y*p1+p2; - r += (p-0.5*y); break; - case 1: - z = y*y; - w = z*y; - p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ - p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); - p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); - p = z*p1-(tt-w*(p2+y*p3)); - r += (tf + p); break; - case 2: - p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); - p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); - r += (-0.5*y + p1/p2); + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); } - } - else if(ix<0x40200000) { /* x < 8.0 */ - i = (int)x; + } else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; t = zero; - y = x-(double)i; - p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); - q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); - r = half*y+p/q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch(i) { - case 7: z *= (y+6.0); /* FALLTHRU */ - case 6: z *= (y+5.0); /* FALLTHRU */ - case 5: z *= (y+4.0); /* FALLTHRU */ - case 4: z *= (y+3.0); /* FALLTHRU */ - case 3: z *= (y+2.0); /* FALLTHRU */ - r += Math.log(z); break; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; } /* 8.0 <= x < 2**58 */ } else if (ix < 0x43900000) { t = Math.log(x); - z = one/x; - y = z*z; - w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); - r = (x-half)*(t-one)+w; + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; } else /* 2**58 <= x <= inf */ - r = x*(Math.log(x)-one); + r = x * (Math.log(x) - one); return r; } @@ -1308,7 +1416,7 @@ public class MathUtils { * @param x the x parameter * @return the log10 of the gamma function at x. */ - public static double log10Gamma (double x) { + public static double log10Gamma(double x) { return lnToLog10(lnGamma(x)); } @@ -1320,13 +1428,13 @@ public class MathUtils { * @param k number of successes * @return the log10 of the binomial coefficient */ - public static double log10BinomialCoefficient (int n, int k) { - return log10Gamma(n+1) - log10Gamma(k+1) - log10Gamma(n-k+1); + public static double log10BinomialCoefficient(int n, int k) { + return log10Gamma(n + 1) - log10Gamma(k + 1) - log10Gamma(n - k + 1); } - public static double log10BinomialProbability (int n, int k, double log10p) { - double log10OneMinusP = Math.log10(1-Math.pow(10,log10p)); - return log10BinomialCoefficient(n, k) + log10p*k + log10OneMinusP*(n-k); + public static double log10BinomialProbability(int n, int k, double log10p) { + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); } @@ -1338,38 +1446,74 @@ public class MathUtils { * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) * @return */ - public static double log10MultinomialCoefficient (int n, int [] k) { + public static double log10MultinomialCoefficient(int n, int[] k) { double denominator = 0.0; for (int x : k) { - denominator += log10Gamma(x+1); + denominator += log10Gamma(x + 1); } - return log10Gamma(n+1) - denominator; + return log10Gamma(n + 1) - denominator; } /** * Computes the log10 of the multinomial distribution probability given a vector * of log10 probabilities. Designed to prevent overflows even with very large numbers. * - * @param n number of trials - * @param k array of number of successes for each possibility + * @param n number of trials + * @param k array of number of successes for each possibility * @param log10p array of log10 probabilities * @return */ - public static double log10MultinomialProbability (int n, int [] k, double [] log10p) { + public static double log10MultinomialProbability(int n, int[] k, double[] log10p) { if (log10p.length != k.length) throw new UserException.BadArgumentValue("p and k", "Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); double log10Prod = 0.0; - for (int i=0; i filesInPath = new ArrayList(); FilenameFilter filter = new OrFilenameFilter(new DirectoryFilter(), - new ExtensionFilter(extension)); + new ExtensionFilter(extension)); File[] contents = basePath.listFiles( filter ); for (File content : contents) { String relativeFileName = relativePrefix.trim().length() != 0 ? @@ -118,4 +124,47 @@ public class PathUtils { } } -} + + /** + * Walk over the GATK released directories to find the most recent JAR files corresponding + * to the version prefix. For example, providing input "1.2" will + * return the full path to the most recent GenomeAnalysisTK.jar in the GATK_RELEASE_DIR + * in directories that match gatkReleaseDir/GenomeAnalysisTK-1.2* + * + * @param gatkReleaseDir Path to directory containing GATK release binaries (e.g., /humgen/gsa-hpprojects/GATK/bin/) + * @param releaseVersionNumber Desired GATK version number (e.g., 1.2) + * @return A file pointing to the most recent GATK file in the release directory with GATK release number + */ + public static File findMostRecentGATKVersion(final File gatkReleaseDir, final String releaseVersionNumber) { + final String versionString = "GenomeAnalysisTK-" + releaseVersionNumber; + + final List gatkJars = new ArrayList(); + for ( final String path : gatkReleaseDir.list(new isGATKVersion(versionString)) ) { + gatkJars.add(new File(gatkReleaseDir.getAbsolutePath() + "/" + path + "/GenomeAnalysisTK.jar")); + } + + if ( gatkJars.isEmpty() ) + return null; + else { + Collections.sort(gatkJars, LastModifiedFileComparator.LASTMODIFIED_REVERSE); + //for ( File jar : gatkJars ) logger.info(String.format("%s => %d", jar, jar.lastModified())); + final File last = gatkJars.get(0); + logger.debug(String.format("findMostRecentGATKVersion: Found %d jars for %s, keeping last one %s", + gatkJars.size(), releaseVersionNumber, last)); + return last; + } + } + + private final static class isGATKVersion implements FilenameFilter { + private final String versionString; + + private isGATKVersion(final String versionString) { + this.versionString = versionString; + } + + @Override + public boolean accept(final File file, final String s) { + return s.contains(versionString); + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 921a0a599..fb133d902 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -70,27 +70,27 @@ public class ClippingOp { break; case SOFTCLIP_BASES: - if ( read.getReadUnmappedFlag() ) { + if (read.getReadUnmappedFlag()) { // we can't process unmapped reads throw new UserException("Read Clipper cannot soft clip unmapped reads"); } //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); int myStop = stop; - if ( (stop + 1 - start) == read.getReadLength() ) { + if ((stop + 1 - start) == read.getReadLength()) { // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); //break; myStop--; // just decrement stop } - if ( start > 0 && myStop != read.getReadLength() - 1 ) + if (start > 0 && myStop != read.getReadLength() - 1) throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); Cigar oldCigar = read.getCigar(); int scLeft = 0, scRight = read.getReadLength(); - if ( start == 0 ) + if (start == 0) scLeft = myStop + 1; else scRight = start; @@ -134,8 +134,7 @@ public class ClippingOp { unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); matchesCount = 0; unclippedCigar.add(element); - } - else + } else unclippedCigar.add(element); } if (matchesCount > 0) @@ -284,10 +283,9 @@ public class ClippingOp { } @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) - private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { + private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { if (start == 0 && stop == read.getReadLength() - 1) return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); // If the read is unmapped there is no Cigar string and neither should we create a new cigar string @@ -296,8 +294,8 @@ public class ClippingOp { // the cigar may force a shift left or right (or both) in case we are left with insertions // starting or ending the read after applying the hard clip on start/stop. int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; - byte [] newBases = new byte[newLength]; - byte [] newQuals = new byte[newLength]; + byte[] newBases = new byte[newLength]; + byte[] newQuals = new byte[newLength]; int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); @@ -321,11 +319,11 @@ public class ClippingOp { } @Requires({"!cigar.isEmpty()"}) - private CigarShift hardClipCigar (Cigar cigar, int start, int stop) { + private CigarShift hardClipCigar(Cigar cigar, int start, int stop) { Cigar newCigar = new Cigar(); int index = 0; int totalHardClipCount = stop - start + 1; - int alignmentShift = 0; // caused by hard clipping insertions or deletions + int alignmentShift = 0; // caused by hard clipping deletions // hard clip the beginning of the cigar string if (start == 0) { @@ -353,7 +351,7 @@ public class ClippingOp { // element goes beyond what we need to clip else if (index + shift > stop + 1) { int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); - alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop-index+1); + alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1); newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); } @@ -388,7 +386,7 @@ public class ClippingOp { if (index + shift < start) newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); - // element goes beyond our clip starting position + // element goes beyond our clip starting position else { int elementLengthAfterChopping = start - index; alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength() - (start - index)); @@ -396,7 +394,7 @@ public class ClippingOp { // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClipCount += elementLengthAfterChopping; - // otherwise, maintain what's left of this last operator + // otherwise, maintain what's left of this last operator else newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); } @@ -408,7 +406,7 @@ public class ClippingOp { } // check if we are hard clipping indels - while(cigarElementIterator.hasNext()) { + while (cigarElementIterator.hasNext()) { cigarElement = cigarElementIterator.next(); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); @@ -444,34 +442,30 @@ public class ClippingOp { boolean readHasStarted = false; boolean addedHardClips = false; - while(!cigarStack.empty()) { + while (!cigarStack.empty()) { CigarElement cigarElement = cigarStack.pop(); - if ( !readHasStarted && - cigarElement.getOperator() != CigarOperator.INSERTION && + if (!readHasStarted && +// cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) readHasStarted = true; - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) totalHardClip += cigarElement.getLength(); - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION) - shift += cigarElement.getLength(); - - else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) + else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION) totalHardClip += cigarElement.getLength(); if (readHasStarted) { - if (i==1) { + if (i == 1) { if (!addedHardClips) { if (totalHardClip > 0) inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); addedHardClips = true; } inverseCigarStack.push(cigarElement); - } - else { + } else { if (!addedHardClips) { if (totalHardClip > 0) cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); @@ -498,7 +492,7 @@ public class ClippingOp { int newShift = 0; int oldShift = 0; - boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift + boolean readHasStarted = false; // if the new cigar is composed of S and H only, we have to traverse the entire old cigar to calculate the shift for (CigarElement cigarElement : newCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) newShift += cigarElement.getLength(); @@ -509,7 +503,7 @@ public class ClippingOp { } for (CigarElement cigarElement : oldCigar.getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) oldShift += cigarElement.getLength(); else if (readHasStarted) break; @@ -522,7 +516,7 @@ public class ClippingOp { if (cigarElement.getOperator() == CigarOperator.INSERTION) return -clippedLength; - // Deletions should be added to the total hard clip count + // Deletions should be added to the total hard clip count else if (cigarElement.getOperator() == CigarOperator.DELETION) return cigarElement.getLength(); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index afe7fa975..7a664bd61 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -374,24 +374,43 @@ public class ReadClipper { * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. * + * Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the + * left of right tail) by specifying either refStart < 0 or refStop < 0. + * * @param refStart first base to clip (inclusive) * @param refStop last base to clip (inclusive) * @return a new read, without the clipped bases */ - @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + @Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); - int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + if (read.isEmpty()) + return read; - if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) - return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); + int start; + int stop; + + // Determine the read coordinate to start and stop hard clipping + if (refStart < 0) { + if (refStop < 0) + throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")"); + start = 0; + stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + } + else { + if (refStop >= 0) + throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")"); + start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); + stop = read.getReadLength() - 1; + } + +// if ((start == 0 && stop == read.getReadLength() - 1)) +// return GATKSAMRecord.emptyRead(read); if (start < 0 || stop > read.getReadLength() - 1) throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); if ( start > stop ) - throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); + throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop)); if ( start > 0 && stop < read.getReadLength() - 1) throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 18051ce92..586b86490 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -526,6 +526,42 @@ public abstract class AbstractReadBackedPileup rgSet) { + if(pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for(final String sample: tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroups(rgSet); + if(pileup != null) + filteredTracker.addElements(sample,pileup.pileupElementTracker); + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for(PE p: pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if(rgSet != null && !rgSet.isEmpty()) { + if(read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } + else { + if(read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; + } + } + @Override public RBP getPileupForLane(String laneID) { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 02767df7c..ccd9d509f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; +import java.util.HashSet; import java.util.List; /** @@ -129,6 +130,13 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForReadGroup(String readGroupId); + /** + * Gets all the reads associated with a given read groups. + * @param rgSet Set of identifiers for the read group. + * @return A pileup containing only the reads in the given read groups. + */ + public ReadBackedPileup getPileupForReadGroups(final HashSet rgSet); + /** * Gets all reads in a given lane id. (Lane ID is the read group * id stripped of the last .XX sample identifier added by the GATK). diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index cedd56bdf..542adea77 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -238,7 +238,7 @@ public class ArtificialSAMUtils { */ public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 96713edc2..913548ecc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.utils.sam; -import com.google.java.contract.Ensures; import net.sf.samtools.*; import org.broadinstitute.sting.utils.NGSPlatform; @@ -184,6 +183,12 @@ public class GATKSAMRecord extends BAMRecord { return getReducedReadCounts() != null; } + /** + * The number of bases corresponding the i'th base of the reduced read. + * + * @param i the read based coordinate inside the read + * @return the number of bases corresponding to the i'th base of the reduced read + */ public final byte getReducedCount(final int i) { byte firstCount = getReducedReadCounts()[0]; byte offsetCount = getReducedReadCounts()[i]; @@ -277,7 +282,6 @@ public class GATKSAMRecord extends BAMRecord { * * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ - @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) public int getSoftStart() { int start = this.getUnclippedStart(); for (CigarElement cigarElement : this.getCigar().getCigarElements()) { @@ -286,17 +290,17 @@ public class GATKSAMRecord extends BAMRecord { else break; } + return start; } /** * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. * - * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. * * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ - @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) public int getSoftEnd() { int stop = this.getUnclippedStart(); @@ -313,6 +317,7 @@ public class GATKSAMRecord extends BAMRecord { else shift = 0; } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f2e54713f..7fa2f6230 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -58,7 +59,7 @@ public class ReadUtils { /** * A HashMap of the SAM spec read flag names - *

+ * * Note: This is not being used right now, but can be useful in the future */ private static final Map readFlagNames = new HashMap(); @@ -79,49 +80,47 @@ public class ReadUtils { /** * This enum represents all the different ways in which a read can overlap an interval. - *

+ * * NO_OVERLAP_CONTIG: * read and interval are in different contigs. - *

+ * * NO_OVERLAP_LEFT: * the read does not overlap the interval. - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * NO_OVERLAP_RIGHT: * the read does not overlap the interval. - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it - *

- * |----------------| (interval) - * <----------------> (read) - *

+ * + * |----------------| (interval) + * <----------------> (read) + * * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval - *

- * |-----------| (interval) - * <-------------------> (read) - *

+ * + * |-----------| (interval) + * <-------------------> (read) + * * OVERLAP_CONTAINED: * the read starts and ends inside the interval - *

- * |----------------| (interval) - * <--------> (read) + * + * |----------------| (interval) + * <--------> (read) */ - public enum ReadAndIntervalOverlap { - NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED - } + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular @@ -141,15 +140,15 @@ public class ReadUtils { /** * is this base inside the adaptor of the read? - *

+ * * There are two cases to treat here: - *

+ * * 1) Read is in the negative strand => Adaptor boundary is on the left tail * 2) Read is in the positive strand => Adaptor boundary is on the right tail - *

+ * * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) * - * @param read the read to test + * @param read the read to test * @param basePos base position in REFERENCE coordinates (not read coordinates) * @return whether or not the base is in the adaptor */ @@ -166,22 +165,22 @@ public class ReadUtils { * the read boundary. If the read is in the positive strand, this is the first base after the end of the * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the * beginning of the fragment. - *

+ * * There are two cases we need to treat here: - *

+ * * 1) Our read is in the reverse strand : - *

- * <----------------------| * - * |---------------------> - *

- * in these cases, the adaptor boundary is at the mate start (minus one) - *

+ * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * * 2) Our read is in the forward strand : - *

- * |----------------------> * - * <----------------------| - *

- * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * * @param read the read being tested for the adaptor boundary * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the mate is mapped to another contig. @@ -264,7 +263,7 @@ public class ReadUtils { /** * If a read starts in INSERTION, returns the first element length. - *

+ * * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. * * @param read @@ -272,7 +271,7 @@ public class ReadUtils { */ public final static int getFirstInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(0); - if (e.getOperator() == CigarOperator.I) + if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else return 0; @@ -280,7 +279,7 @@ public class ReadUtils { /** * If a read ends in INSERTION, returns the last element length. - *

+ * * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. * * @param read @@ -288,7 +287,7 @@ public class ReadUtils { */ public final static int getLastInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); - if (e.getOperator() == CigarOperator.I) + if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else return 0; @@ -297,8 +296,7 @@ public class ReadUtils { /** * Determines what is the position of the read in relation to the interval. * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. - * - * @param read the read + * @param read the read * @param interval the interval * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) */ @@ -309,30 +307,30 @@ public class ReadUtils { int uStart = read.getUnclippedStart(); int uStop = read.getUnclippedEnd(); - if (!read.getReferenceName().equals(interval.getContig())) + if ( !read.getReferenceName().equals(interval.getContig()) ) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - else if (uStop < interval.getStart()) + else if ( uStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - else if (uStart > interval.getStop()) + else if ( uStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - else if (sStop < interval.getStart()) + else if ( sStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; - else if (sStart > interval.getStop()) + else if ( sStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; - else if ((sStart >= interval.getStart()) && - (sStop <= interval.getStop())) + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - else if ((sStart < interval.getStart()) && - (sStop > interval.getStop())) + else if ( (sStart < interval.getStart()) && + (sStop > interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - else if ((sStart < interval.getStart())) + else if ( (sStart < interval.getStart()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT; else @@ -340,36 +338,52 @@ public class ReadUtils { } /** - * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in - * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns - * the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the - * deletion. + * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of + * two corner cases: + * + * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside + * a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it + * doesn't matter because it already returns the previous base by default. + * + * 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the + * read starts with an insertion, and you're requesting the first read based coordinate, it will skip + * the leading insertion (because it has the same reference coordinate as the following base). * * @param read * @param refCoord * @param tail * @return the read coordinate corresponding to the requested reference coordinate for clipping. */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { Pair result = getReadCoordinateForReferenceCoordinate(read, refCoord); int readCoord = result.getFirst(); + // Corner case one: clipping the right tail and falls on deletion, move to the next + // read coordinate. It is not a problem for the left tail because the default answer + // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) readCoord++; + // clipping the left tail and first base is insertion, go to the next read coordinate + // with the same reference coordinate. Advance to the next cigar element, or to the + // end of the read if there is no next element. + Pair firstElementIsInsertion = readStartsWithInsertion(read); + if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst()) + readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1); + return readCoord; } /** * Returns the read coordinate corresponding to the requested reference coordinate. - *

+ * * WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function * will return the last read base before the deletion. This function returns a * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with * a deletion. - *

+ * * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a * pre-processed result according to normal clipping needs. Or you can use this function and tailor the * behavior to your needs. @@ -421,7 +435,7 @@ public class ReadUtils { if (endsWithinCigar) fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. else { nextCigarElement = cigarElementIterator.next(); @@ -442,13 +456,13 @@ public class ReadUtils { if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) readBases += shift; - // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (see warning in function contracts) + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) else if (fallsInsideDeletion && !endsWithinCigar) readBases += shift - 1; - // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion else if (fallsInsideDeletion && endsWithinCigar) readBases--; } @@ -457,7 +471,6 @@ public class ReadUtils { if (!goalReached) throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - return new Pair(readBases, fallsInsideDeletion); } @@ -465,12 +478,11 @@ public class ReadUtils { * Compares two SAMRecords only the basis on alignment start. Note that * comparisons are performed ONLY on the basis of alignment start; any * two SAM records with the same alignment start will be considered equal. - *

+ * * Unmapped alignments will all be considered equal. */ @Requires({"read1 != null", "read2 != null"}) - @Ensures("result == 0 || result == 1 || result == -1") public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { AlignmentStartComparator comp = new AlignmentStartComparator(); return comp.compare(read1, read2); @@ -479,7 +491,7 @@ public class ReadUtils { /** * Is a base inside a read? * - * @param read the read to evaluate + * @param read the read to evaluate * @param referenceCoordinate the reference coordinate of the base to test * @return true if it is inside the read, false otherwise. */ @@ -502,4 +514,134 @@ public class ReadUtils { } + /** + * Checks if a read starts with an insertion. It looks beyond Hard and Soft clips + * if there are any. + * + * @param read + * @return A pair with the answer (true/false) and the element or null if it doesn't exist + */ + public static Pair readStartsWithInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.INSERTION) + return new Pair(true, cigarElement); + + else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP) + break; + } + return new Pair(false, null); + } + + /** + * Returns the coverage distribution of a list of reads within the desired region. + * + * See getCoverageDistributionOfRead for information on how the coverage is calculated. + * + * @param list the list of reads covering the region + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { + int [] totalCoverage = new int[stopLocation - startLocation + 1]; + + for (GATKSAMRecord read : list) { + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); + } + + return totalCoverage; + } + + /** + * Returns the coverage distribution of a single read within the desired region. + * + * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample + * reads for variant regions, and deletions count as variants) + * + * @param read the read to get the coverage distribution of + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { + int [] coverage = new int[stopLocation - startLocation + 1]; + int refLocation = read.getSoftStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + switch (cigarElement.getOperator()) { + case S: + case M: + case EQ: + case N: + case X: + case D: + for (int i = 0; i < cigarElement.getLength(); i++) { + if (refLocation >= startLocation && refLocation <= stopLocation) { + int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; + coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases + } + refLocation++; + } + break; + + case P: + case I: + case H: + break; + } + + if (refLocation > stopLocation) + break; + } + return coverage; + } + + /** + * Makes association maps for the reads and loci coverage as described below : + * + * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. + * Note: Locus is in reference coordinates. + * Example: Locus => {read1, read2, ..., readN} + * + * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage. + * Example: Read => {true, true, false, ... false} + * + * @param readList the list of reads to generate the association mappings + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return the two hashmaps described above + */ + public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { + int arraySize = stopLocation - startLocation + 1; + + HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); + HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); + + + for (int i = startLocation; i <= stopLocation; i++) + locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists + + for (GATKSAMRecord read : readList) { + readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays + + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + + for (int i=0; i 0) { + // Update the hash for this locus + HashSet readSet = locusToReadMap.get(refLocation); + readSet.add(read); + + // Add this locus to the read hash + readToLocusMap.get(read)[refLocation - startLocation] = true; + } + else + // Update the boolean array with a 'no coverage' from this read to this locus + readToLocusMap.get(read)[refLocation-startLocation] = false; + } + } + return new Pair>, HashMap>(locusToReadMap, readToLocusMap); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index d45e663b0..3976231ef 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -202,7 +202,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "613fb2bbe01af8cbe6a188a10c1582ca" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -308,7 +308,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "13c83656567cee9e93bda9874ee80234" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "991f093a0e610df235d28ada418ebf33" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 251c105c3..049bdce3e 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -26,22 +26,20 @@ package org.broadinstitute.sting.utils; +import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import org.broadinstitute.sting.BaseTest; - -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; /** * Basic unit test for MathUtils */ public class MathUtilsUnitTest extends BaseTest { @BeforeClass - public void init() { } + public void init() { + } /** * Tests that we get the right values from the binomial distribution @@ -66,20 +64,20 @@ public class MathUtilsUnitTest extends BaseTest { public void testMultinomialProbability() { logger.warn("Executing testMultinomialProbability"); - int[] counts0 = { 2, 0, 1 }; - double[] probs0 = { 0.33, 0.33, 0.34 }; + int[] counts0 = {2, 0, 1}; + double[] probs0 = {0.33, 0.33, 0.34}; Assert.assertEquals(MathUtils.multinomialProbability(counts0, probs0), 0.111078, 1e-6); - int[] counts1 = { 10, 20, 30 }; - double[] probs1 = { 0.25, 0.25, 0.50 }; + int[] counts1 = {10, 20, 30}; + double[] probs1 = {0.25, 0.25, 0.50}; Assert.assertEquals(MathUtils.multinomialProbability(counts1, probs1), 0.002870301, 1e-9); - int[] counts2 = { 38, 82, 50, 36 }; - double[] probs2 = { 0.25, 0.25, 0.25, 0.25 }; + int[] counts2 = {38, 82, 50, 36}; + double[] probs2 = {0.25, 0.25, 0.25, 0.25}; Assert.assertEquals(MathUtils.multinomialProbability(counts2, probs2), 1.88221e-09, 1e-10); - int[] counts3 = { 1, 600, 1 }; - double[] probs3 = { 0.33, 0.33, 0.34 }; + int[] counts3 = {1, 600, 1}; + double[] probs3 = {0.33, 0.33, 0.34}; Assert.assertEquals(MathUtils.multinomialProbability(counts3, probs3), 5.20988e-285, 1e-286); } @@ -123,19 +121,21 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertTrue(FiveAlpha.containsAll(BigFiveAlpha)); } - /** Tests that we correctly compute mean and standard deviation from a stream of numbers */ + /** + * Tests that we correctly compute mean and standard deviation from a stream of numbers + */ @Test public void testRunningAverage() { logger.warn("Executing testRunningAverage"); - int [] numbers = {1,2,4,5,3,128,25678,-24}; + int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - for ( int i = 0 ; i < numbers.length ; i++ ) r.add((double)numbers[i]); + for (int i = 0; i < numbers.length; i++) r.add((double) numbers[i]); - Assert.assertEquals((long)numbers.length, r.observationCount()); - Assert.assertTrue(r.mean()- 3224.625 < 2e-10 ); - Assert.assertTrue(r.stddev()-9072.6515881128 < 2e-10); + Assert.assertEquals((long) numbers.length, r.observationCount()); + Assert.assertTrue(r.mean() - 3224.625 < 2e-10); + Assert.assertTrue(r.stddev() - 9072.6515881128 < 2e-10); } @Test @@ -174,4 +174,56 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.log10Factorial(12342), 45138.26, 1e-1); } + @Test(enabled = true) + public void testRandomSubset() { + Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + Assert.assertEquals(MathUtils.randomSubset(x, 0).length, 0); + Assert.assertEquals(MathUtils.randomSubset(x, 1).length, 1); + Assert.assertEquals(MathUtils.randomSubset(x, 2).length, 2); + Assert.assertEquals(MathUtils.randomSubset(x, 3).length, 3); + Assert.assertEquals(MathUtils.randomSubset(x, 4).length, 4); + Assert.assertEquals(MathUtils.randomSubset(x, 5).length, 5); + Assert.assertEquals(MathUtils.randomSubset(x, 6).length, 6); + Assert.assertEquals(MathUtils.randomSubset(x, 7).length, 7); + Assert.assertEquals(MathUtils.randomSubset(x, 8).length, 8); + Assert.assertEquals(MathUtils.randomSubset(x, 9).length, 9); + Assert.assertEquals(MathUtils.randomSubset(x, 10).length, 10); + Assert.assertEquals(MathUtils.randomSubset(x, 11).length, 10); + + for (int i = 0; i < 25; i++) + Assert.assertTrue(hasUniqueElements(MathUtils.randomSubset(x, 5))); + + } + + @Test(enabled = true) + public void testArrayShuffle() { + Integer[] x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + for (int i = 0; i < 25; i++) { + Object[] t = MathUtils.arrayShuffle(x); + Assert.assertTrue(hasUniqueElements(t)); + Assert.assertTrue(hasAllElements(x, t)); + } + } + + private boolean hasUniqueElements(Object[] x) { + for (int i = 0; i < x.length; i++) + for (int j = i + 1; j < x.length; j++) + if (x[i].equals(x[j]) || x[i] == x[j]) + return false; + return true; + } + + private boolean hasAllElements(final Object[] expected, final Object[] actual) { + HashSet set = new HashSet(); + set.addAll(Arrays.asList(expected)); + set.removeAll(Arrays.asList(actual)); + return set.isEmpty(); + } + + + private void p (Object []x) { + for (Object v: x) + System.out.print((Integer) v + " "); + System.out.println(); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index 18108e0a1..16b141bc3 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -112,8 +112,9 @@ public class ReadClipperTestUtils { } } - if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) - return true; // we don't accept reads starting or ending in deletions (add any other constraint here) +// if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) } return false; @@ -190,4 +191,18 @@ public class ReadClipperTestUtils { return invertedCigar; } + /** + * Checks whether or not the read has any cigar element that is not H or S + * + * @param read + * @return true if it has any M, I or D, false otherwise + */ + public static boolean readHasNonClippedBases(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index 4dad68dc5..bc918c0a4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -30,12 +30,12 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.util.HashMap; import java.util.List; /** @@ -59,10 +59,11 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); int readLength = alnStart - alnEnd; - for (int i=0; i= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); } } } @@ -72,12 +73,14 @@ public class ReadClipperUnitTest extends BaseTest { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int readLength = read.getReadLength(); - for (int i=0; i %s", i, read.getCigarString(), clipLeft.getCigarString())); + Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); - GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength-1); + GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1); Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); } } } @@ -86,19 +89,27 @@ public class ReadClipperUnitTest extends BaseTest { public void testHardClipByReferenceCoordinates() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - for (int i=alnStart; i<=alnEnd; i++) { - if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); - if (!clipLeft.isEmpty()) - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + int start = read.getSoftStart(); + int stop = read.getSoftEnd(); + +// System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop)); + +// if (ReadUtils.readIsEntirelyInsertion(read)) +// System.out.println("debug"); + + for (int i = start; i <= stop; i++) { + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i); + if (!clipLeft.isEmpty()) { +// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString())); + Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); } - if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. - Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. +// System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString())); + Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); } } } @@ -111,10 +122,14 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side - for (int i=alnStart; i<=alnEnd; i++) { - GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); - if (!clipLeft.isEmpty()) - Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + for (int i = alnStart; i <= alnEnd; i++) { + GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); + + if (!clipLeft.isEmpty()) { +// System.out.println(String.format("Left Tail [%d]: %s (%d,%d,%d : %d,%d,%d) -> %s (%d,%d,%d : %d,%d,%d)", i, cigar.toString(), read.getUnclippedStart(), read.getSoftStart(), read.getAlignmentStart(), read.getAlignmentEnd(), read.getSoftEnd(), read.getUnclippedEnd(), clipLeft.getCigarString(), clipLeft.getUnclippedStart(), clipLeft.getSoftStart(), clipLeft.getAlignmentStart(), clipLeft.getAlignmentEnd(), clipLeft.getSoftEnd(), clipLeft.getUnclippedEnd())); + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + assertUnclippedLimits(read, clipLeft); + } } } } @@ -127,10 +142,12 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side - for (int i=alnStart; i<=alnEnd; i++) { + for (int i = alnStart; i <= alnEnd; i++) { GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + assertUnclippedLimits(read, clipRight); + } } } } @@ -145,43 +162,36 @@ public class ReadClipperUnitTest extends BaseTest { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int readLength = read.getReadLength(); - byte [] quals = new byte[readLength]; + byte[] quals = new byte[readLength]; for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { - - // create a read with nLowQualBases in the left tail - Utils.fillArrayWithByte(quals, HIGH_QUAL); + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) quals[addLeft] = LOW_QUAL; read.setBaseQualities(quals); GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests + assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed + assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); - // Make sure the low qualities are gone - assertNoLowQualBases(clipLeft, LOW_QUAL); - // Can't run this test with the current contract of no hanging insertions -// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); - - // create a read with nLowQualBases in the right tail - Utils.fillArrayWithByte(quals, HIGH_QUAL); + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail for (int addRight = 0; addRight < nLowQualBases; addRight++) quals[readLength - addRight - 1] = LOW_QUAL; read.setBaseQualities(quals); GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests +// System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString())); - // Make sure the low qualities are gone - assertNoLowQualBases(clipRight, LOW_QUAL); + assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed + assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); - // Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions - //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); - - // create a read with nLowQualBases in the both tails - if (nLowQualBases <= readLength/2) { - Utils.fillArrayWithByte(quals, HIGH_QUAL); + if (nLowQualBases <= readLength / 2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { quals[addBoth] = LOW_QUAL; quals[readLength - addBoth - 1] = LOW_QUAL; @@ -189,83 +199,25 @@ public class ReadClipperUnitTest extends BaseTest { read.setBaseQualities(quals); GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); - // Tests - - // Make sure the low qualities are gone - assertNoLowQualBases(clipBoth, LOW_QUAL); - - // Can't run this test with the current contract of no hanging insertions - //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); + assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed + assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone + Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped + String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); } } -// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString())); } - - // ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug ) - final byte[] BASES = {'A','C','G','T','A','C','G','T'}; - final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; - final String CIGAR = "1S1M5I1S"; - - final byte[] CLIPPED_BASES = {}; - final byte[] CLIPPED_QUALS = {}; - final String CLIPPED_CIGAR = ""; - - - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); - GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR); - - ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected); } @Test(enabled = true) public void testHardClipSoftClippedBases() { - - // Generate a list of cigars to test for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); + CigarCounter original = new CigarCounter(read); + CigarCounter clipped = new CigarCounter(clippedRead); - int sumHardClips = 0; - int sumMatches = 0; - - boolean tail = true; - for (CigarElement element : read.getCigar().getCigarElements()) { - // Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right) - if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP) - tail = true; - - // Adds all H, S and D's (next to hard/soft clips). - // All these should be hard clips after clipping. - if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION)) - sumHardClips += element.getLength(); - - // this means we're no longer on the tail (insertions can still potentially be the tail because - // of the current contract of clipping out hanging insertions - else if (element.getOperator() != CigarOperator.INSERTION) - tail = false; - - // Adds all matches to verify that they remain the same after clipping - if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - sumMatches += element.getLength(); - } - - for (CigarElement element : clippedRead.getCigar().getCigarElements()) { - // Test if clipped read has Soft Clips (shouldn't have any!) - Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString())); - - // Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for - if (element.getOperator() == CigarOperator.HARD_CLIP) - sumHardClips -= element.getLength(); - - // Make sure all matches are still there - if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) - sumMatches -= element.getLength(); - } - Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips)); - Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - - -// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS } } @@ -276,38 +228,39 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); + assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); - if (! clippedRead.isEmpty()) { + if (!clippedRead.isEmpty()) { Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone - } - else + } else Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped } } } @Test(enabled = true) - public void testRevertSoftClippedBases() - { - for (Cigar cigar: cigarList) { + public void testRevertSoftClippedBases() { + for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); - if ( leadingSoftClips > 0 || tailSoftClips > 0) { + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + + if (leadingSoftClips > 0 || tailSoftClips > 0) { final int expectedStart = read.getAlignmentStart() - leadingSoftClips; final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); - } - else + } else Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); } } @@ -315,12 +268,25 @@ public class ReadClipperUnitTest extends BaseTest { private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { if (!read.isEmpty()) { - byte [] quals = read.getBaseQualities(); - for (int i=0; i 0; } @@ -335,10 +301,46 @@ public class ReadClipperUnitTest extends BaseTest { return 0; } - private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) { + private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) { for (CigarElement cigarElement : cigar.getCigarElements()) if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) return true; return false; } + + private class CigarCounter { + private HashMap counter; + + public Integer getCounterForOp(CigarOperator operator) { + return counter.get(operator); + } + + public CigarCounter(GATKSAMRecord read) { + CigarOperator[] operators = CigarOperator.values(); + counter = new HashMap(operators.length); + + for (CigarOperator op : operators) + counter.put(op, 0); + + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength()); + } + + public boolean assertHardClippingSoftClips(CigarCounter clipped) { + for (CigarOperator op : counter.keySet()) { + if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) { + int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP); + int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP); + int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP); + + Assert.assertEquals(counterTotal, clippedHard); + Assert.assertTrue(clippedSoft == 0); + } else + Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op)); + } + return true; + } + + } + } \ No newline at end of file