From 8d7ef1bb519d2525ff82812f02ba5f3f18e3066f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 21 Jul 2011 21:39:00 -0400 Subject: [PATCH 01/10] Complete refactor of the ReplicationValidation framework, plus the following new functionality: * merges all pools in a lane. * merges all lanes in a site. --- .../src/org/broadinstitute/sting/utils/MathUtils.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 36ed506aa..f823e6d42 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -727,6 +727,17 @@ public class MathUtils { return count; } + public static int countOccurrences(byte element, byte [] array) { + int count = 0; + for (byte y : array) { + if (element == y) + count++; + } + + return count; + } + + /** * Returns n random indices drawn with replacement from the range 0..(k-1) * From a58ddab93b460378c80ee239c3b2a8fb488d994f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 28 Jul 2011 18:58:36 -0400 Subject: [PATCH 06/10] minQual and minPower filters added. VCF output added. Calls are now made based on the likelihood AC model. Two filters are applied: minQual and minPower. Output is now a VCF file with the variant context. It's now called the gatk's PoolCaller, no longer Replication Validation framework. Lots of testing ensue.... --- .../broadinstitute/sting/utils/BaseUtils.java | 33 +++++++++--- .../broadinstitute/sting/utils/MathUtils.java | 52 ++++++++++++++++++- 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 491e4e25e..de8c81031 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -419,6 +419,32 @@ public class BaseUtils { return new String(simpleComplement(bases.getBytes())); } + /** + * Returns the index of the most common base in the basecounts array. To be used with + * pileup.getBaseCounts. + * + * @param baseCounts counts of a,c,g,t in order. + * @return the index of the most common base + */ + static public int mostFrequentBaseIndex(int[] baseCounts) { + int mostFrequentBaseIndex = 0; + for (int baseIndex = 1; baseIndex < 4; baseIndex++) { + if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) { + mostFrequentBaseIndex = baseIndex; + } + } + return mostFrequentBaseIndex; + } + + /** + * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. + * + * @param baseCounts counts of a,c,g,t in order. + * @return the most common base + */ + static public byte mostFrequentSimpleBase(int[] baseCounts) { + return baseIndexToSimpleBase(mostFrequentBaseIndex(baseCounts)); + } /** * For the most frequent base in the sequence, return the percentage of the read it constitutes. @@ -437,12 +463,7 @@ public class BaseUtils { } } - int mostFrequentBaseIndex = 0; - for (int baseIndex = 1; baseIndex < 4; baseIndex++) { - if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) { - mostFrequentBaseIndex = baseIndex; - } - } + int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts); return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length); } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index f823e6d42..9bd3f603c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; +import org.apache.lucene.messages.NLS; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -124,6 +125,16 @@ public class MathUtils { } + /** + * Calculates the log10 cumulative sum of an array with log10 probabilities + * @param log10p the array with log10 probabilites + * @param upTo index in the array to calculate the cumsum up to + * @return the log10 of the cumulative sum + */ + public static double log10CumulativeSumLog10(double [] log10p, int upTo) { + return log10sumLog10(log10p, 0, upTo); + } + /** * Converts a real space array of probabilities into a log10 array * @param prRealSpace @@ -137,10 +148,14 @@ public class MathUtils { } public static double log10sumLog10(double[] log10p, int start) { + return log10sumLog10(log10p, start, log10p.length); + } + + public static double log10sumLog10(double[] log10p, int start, int finish) { double sum = 0.0; double maxValue = Utils.findMaxEntry(log10p); - for ( int i = start; i < log10p.length; i++ ) { + for ( int i = start; i < finish; i++ ) { sum += Math.pow(10.0, log10p[i] - maxValue); } @@ -471,6 +486,18 @@ public class MathUtils { return maxI; } + public static int maxElementIndex(int[] array) { + if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = -1; + for ( int i = 0; i < array.length; i++ ) { + if ( maxI == -1 || array[i] > array[maxI] ) + maxI = i; + } + + return maxI; + } + public static double arrayMax(double[] array) { return array[maxElementIndex(array)]; } @@ -727,7 +754,7 @@ public class MathUtils { return count; } - public static int countOccurrences(byte element, byte [] array) { + public static int countOccurrences(byte element, byte [] array) { int count = 0; for (byte y : array) { if (element == y) @@ -737,6 +764,27 @@ public class MathUtils { return count; } + /** + * Returns the top (larger) N elements of the array. Naive n^2 implementation (Selection Sort). + * Better than sorting if N (number of elements to return) is small + * + * @param array the array + * @param n number of top elements to return + * @return the n larger elements of the array + */ + public static Collection getNMaxElements(double [] array, int n) { + ArrayList maxN = new ArrayList(n); + double lastMax = Double.MAX_VALUE; + for (int i=0; i Date: Thu, 4 Aug 2011 17:49:08 -0400 Subject: [PATCH 07/10] Functional VCF output. It is outputting a VCF with the 'second best guess' for the alternate allele correctly. Annotations are added at the pool level, but may get overwritten at the lane and site level. Still need to implement the merging of the the annotations at higher levels. --- .../broadinstitute/sting/utils/BaseUtils.java | 12 ++++++++++ .../broadinstitute/sting/utils/MathUtils.java | 22 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index de8c81031..673b1524d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -436,6 +436,18 @@ public class BaseUtils { return mostFrequentBaseIndex; } + static public int mostFrequentBaseIndexNotRef(int[] baseCounts, int refBaseIndex) { + int tmp = baseCounts[refBaseIndex]; + baseCounts[refBaseIndex] = -1; + int result = mostFrequentBaseIndex(baseCounts); + baseCounts[refBaseIndex] = tmp; + return result; + } + + static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBase) { + return mostFrequentBaseIndexNotRef(baseCounts, simpleBaseToBaseIndex(refSimpleBase)); + } + /** * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. * diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 9bd3f603c..a0b970dbc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -360,6 +360,23 @@ public class MathUtils { return Math.pow(10,log10MultinomialProbability(n, k, log10P)); } + /** + * calculate the Root Mean Square of an array of integers + * @param x an byte[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(byte[] x) { + if ( x.length == 0 ) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** * calculate the Root Mean Square of an array of integers * @param x an int[] of numbers @@ -1074,6 +1091,11 @@ public class MathUtils { return ((-q)/10.0); } + /** + * Returns the phred scaled value of probability p + * @param p probability (between 0 and 1). + * @return phred scaled probability of p + */ public static byte probabilityToPhredScale (double p) { return (byte) ((-10) * Math.log10(p)); }