From d6871967aea416f87b26dee1ce173155214e029d Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Mon, 5 Mar 2012 08:28:42 -0500
Subject: [PATCH 01/25] Adding more unit tests and contracts to PairHMM util
class. Updating HaplotypeCaller to use the new PairHMM util class. Now that
the HMM result isn't dependent on the length of the haplotype there is no
reason to ensure all haplotypes have the same length, which simplifies the
code considerably.
---
.../broadinstitute/sting/utils/Haplotype.java | 36 +++++++++----------
.../broadinstitute/sting/MedianUnitTest.java | 7 ----
2 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
index def2fc349..41b73d157 100755
--- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
@@ -113,24 +113,27 @@ public class Haplotype {
return isReference;
}
- public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final byte[] paddedRef, final int refStart,
- final Cigar haplotypeCigar, final int numBasesAddedToStartOfHaplotype, final int refHaplotypeLength ) {
+ public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStart, final Cigar haplotypeCigar ) {
if( refAllele.length() != altAllele.length() ) { refInsertLocation++; }
- int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(refStart + numBasesAddedToStartOfHaplotype, haplotypeCigar, refInsertLocation);
+ int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStart, haplotypeCigar, refInsertLocation);
if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype
- return getBases().clone();
+ return bases.clone();
}
- haplotypeInsertLocation += numBasesAddedToStartOfHaplotype;
- final byte[] newHaplotype = getBases().clone();
+ byte[] newHaplotype;
try {
if( refAllele.length() == altAllele.length() ) { // SNP or MNP
+ newHaplotype = bases.clone();
for( int iii = 0; iii < altAllele.length(); iii++ ) {
newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii];
}
- } else if( refAllele.length() < altAllele.length() ) { // insertion
+ } else if( refAllele.length() < altAllele.length() ) { // insertion
final int altAlleleLength = altAllele.length();
+ newHaplotype = new byte[bases.length + altAlleleLength];
+ for( int iii = 0; iii < bases.length; iii++ ) {
+ newHaplotype[iii] = bases[iii];
+ }
for( int iii = newHaplotype.length - 1; iii > haplotypeInsertLocation + altAlleleLength - 1; iii-- ) {
newHaplotype[iii] = newHaplotype[iii-altAlleleLength];
}
@@ -138,24 +141,17 @@ public class Haplotype {
newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii];
}
} else { // deletion
- int refHaplotypeOffset = 0;
- for( final CigarElement ce : haplotypeCigar.getCigarElements()) {
- if(ce.getOperator() == CigarOperator.D) { refHaplotypeOffset += ce.getLength(); }
- else if(ce.getOperator() == CigarOperator.I) { refHaplotypeOffset -= ce.getLength(); }
- }
- for( int iii = 0; iii < altAllele.length(); iii++ ) {
- newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii];
- }
final int shift = refAllele.length() - altAllele.length();
- for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length - shift; iii++ ) {
- newHaplotype[iii] = newHaplotype[iii+shift];
+ newHaplotype = new byte[bases.length - shift];
+ for( int iii = 0; iii < haplotypeInsertLocation + altAllele.length(); iii++ ) {
+ newHaplotype[iii] = bases[iii];
}
- for( int iii = 0; iii < shift; iii++ ) {
- newHaplotype[iii+newHaplotype.length-shift] = paddedRef[refStart+refHaplotypeLength+refHaplotypeOffset+iii];
+ for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length; iii++ ) {
+ newHaplotype[iii] = bases[iii+shift];
}
}
} catch (Exception e) { // event already on haplotype is too large/complex to insert another allele, most likely because of not enough reference padding
- return getBases().clone();
+ return bases.clone();
}
return newHaplotype;
diff --git a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java
index c12db9b77..db89aee78 100644
--- a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java
@@ -29,7 +29,6 @@ package org.broadinstitute.sting;
// the imports for unit testing.
-import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine;
import org.broadinstitute.sting.utils.Median;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
@@ -42,12 +41,6 @@ import java.util.List;
public class MedianUnitTest extends BaseTest {
- LikelihoodCalculationEngine engine;
-
- @BeforeSuite
- public void before() {
- engine = new LikelihoodCalculationEngine(0, 0, false);
- }
// --------------------------------------------------------------------------------
//
From e9ad382e749773827d78f90a9c5d6595c77aca72 Mon Sep 17 00:00:00 2001
From: Mauricio Carneiro
Date: Sat, 3 Mar 2012 18:01:55 -0500
Subject: [PATCH 04/25] unifying the BQSR argument collection
---
.../bqsr/RecalibrationArgumentCollection.java | 67 ++++++++++++++++++-
1 file changed, 65 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
index 38e7051e4..cc6f67cc9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -25,8 +25,14 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Hidden;
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
/**
* Created by IntelliJ IDEA.
@@ -39,6 +45,63 @@ import org.broadinstitute.sting.commandline.Hidden;
public class RecalibrationArgumentCollection {
+ /**
+ * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
+ * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
+ * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
+ * Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
+ */
+ @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
+ protected List<RodBinding<Feature>> knownSites = Collections.emptyList();
+
+ /**
+ * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+ * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+ * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+ * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ */
+ @Gather(CountCovariatesGatherer.class)
+ @Output
+ protected PrintStream RECAL_FILE;
+
+ /**
+ * List all implemented covariates.
+ */
+ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
+ protected boolean LIST_ONLY = false;
+
+ /**
+ * Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
+ */
+ @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
+ protected String[] COVARIATES = null;
+
+ /*
+ * Use the standard set of covariates in addition to the ones listed using the -cov argument
+ */
+ @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
+ protected boolean USE_STANDARD_COVARIATES = true;
+
+ /////////////////////////////
+ // Debugging-only Arguments
+ /////////////////////////////
+ /**
+ * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
+ */
+ @Hidden
+ @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
+ protected boolean RUN_WITHOUT_DBSNP = false;
+
+ /////////////////////////////
+ // protected Member Variables
+ /////////////////////////////
+ protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
+ protected final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>();// A list to hold the covariate objects that were requested
+
+ protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
+ protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
+
+
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
From 14a77b1e717d699efbc63796d699b5317fd6a2ae Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Mon, 5 Mar 2012 12:28:32 -0500
Subject: [PATCH 05/25] Getting rid of redundant methods in MathUtils. Adding
unit tests for approximateLog10SumLog10 and normalizeFromLog10. Increasing
the precision of the Jacobian approximation used by approximateLog10SumLog
which changes the UG+HC integration tests ever so slightly.
---
.../indels/HaplotypeIndelErrorModel.java | 3 +-
.../indels/PairHMMIndelErrorModel.java | 17 +--
.../GaussianMixtureModel.java | 11 +-
.../broadinstitute/sting/utils/MathUtils.java | 128 ++++--------------
.../UnifiedGenotyperIntegrationTest.java | 32 ++---
.../sting/utils/MathUtilsUnitTest.java | 101 +++++++++++---
6 files changed, 135 insertions(+), 157 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
index 200a250f2..26023bd2f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
@@ -454,8 +454,7 @@ public class HaplotypeIndelErrorModel {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
// Second term is a constant added to both likelihoods so will be ignored
- haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0],
- readLikelihood[1]);
+ haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index 6410d619d..64993b43a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -166,18 +166,17 @@ public class PairHMMIndelErrorModel {
final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
- matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead,
- YMetricArray[im1][jm1] + pBaseRead);
+ matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]});
final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
- XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
+ XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
// update Y array
final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
- YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
+ YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
}
}
@@ -316,9 +315,7 @@ public class PairHMMIndelErrorModel {
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
- final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
- XMetricArray[bestI][bestJ],
- YMetricArray[bestI][bestJ]);
+ final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] });
/*
if (DEBUG) {
@@ -651,7 +648,7 @@ public class PairHMMIndelErrorModel {
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
- // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix
+ // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix
for (int i=0; i < numHaplotypes; i++) {
for (int j=i; j < numHaplotypes; j++){
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
@@ -665,7 +662,7 @@ public class PairHMMIndelErrorModel {
final double li = readLikelihoods[readIdx][i];
final double lj = readLikelihoods[readIdx][j];
final int readCount = readCounts[readIdx];
- haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF);
+ haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF);
}
}
}
@@ -678,7 +675,7 @@ public class PairHMMIndelErrorModel {
}
}
- // renormalize so that max element is zero.
+ // renormalize so that max element is zero.
return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
index 82776ca2e..6f0ae7c25 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
@@ -140,15 +140,16 @@ public class GaussianMixtureModel {
}
for( final VariantDatum datum : data ) {
- final ArrayList pVarInGaussianLog10 = new ArrayList( gaussians.size() );
+ final double[] pVarInGaussianLog10 = new double[gaussians.size()];
+ int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
final double pVarLog10 = gaussian.evaluateDatumLog10( datum );
- pVarInGaussianLog10.add( pVarLog10 );
+ pVarInGaussianLog10[gaussianIndex++] = pVarLog10;
}
- final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10 );
- int iii = 0;
+ final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10, false );
+ gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
- gaussian.assignPVarInGaussian( pVarInGaussianNormalized[iii++] ); //BUGBUG: to clean up
+ gaussian.assignPVarInGaussian( pVarInGaussianNormalized[gaussianIndex++] );
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
index 6f2db67ee..a96cbffc5 100644
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@@ -41,14 +41,6 @@ import java.util.*;
* @author Kiran Garimella
*/
public class MathUtils {
- /** Public constants - used for the Lanczos approximation to the factorial function
- * (for the calculation of the binomial/multinomial probability in logspace)
- * @param LANC_SEQ[] - an array holding the constants which correspond to the product
- * of Chebyshev Polynomial coefficients, and points on the Gamma function (for interpolation)
- * [see A Precision Approximation of the Gamma Function J. SIAM Numer. Anal. Ser. B, Vol. 1 1964. pp. 86-96]
- * @param LANC_G - a value for the Lanczos approximation to the gamma function that works to
- * high precision
- */
/**
* Private constructor. No instantiating this class!
@@ -56,6 +48,28 @@ public class MathUtils {
private MathUtils() {
}
+ public static final double[] log10Cache;
+ private static final double[] jacobianLogTable;
+ private static final double JACOBIAN_LOG_TABLE_STEP = 0.001;
+ private static final double MAX_JACOBIAN_TOLERANCE = 10.0;
+ private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1;
+ private static final int MAXN = 11000;
+ private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
+
+ static {
+ log10Cache = new double[LOG10_CACHE_SIZE];
+ jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
+
+ log10Cache[0] = Double.NEGATIVE_INFINITY;
+ for (int k = 1; k < LOG10_CACHE_SIZE; k++)
+ log10Cache[k] = Math.log10(k);
+
+ for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
+ jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP));
+
+ }
+ }
+
// A fast implementation of the Math.round() method. This method does not perform
// under/overflow checking, so this shouldn't be used in the general case (but is fine
// if one is already make those checks before calling in to the rounding).
@@ -536,7 +550,7 @@ public class MathUtils {
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = Utils.findMaxEntry(array);
- // we may decide to just normalize in log space with converting to linear space
+ // we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
for (int i = 0; i < array.length; i++)
array[i] -= maxValue;
@@ -563,29 +577,6 @@ public class MathUtils {
return normalized;
}
- public static double[] normalizeFromLog10(List array, boolean takeLog10OfOutput) {
- double[] normalized = new double[array.size()];
-
- // for precision purposes, we need to add (or really subtract, since they're
- // all negative) the largest value; also, we need to convert to normal-space.
- double maxValue = MathUtils.arrayMaxDouble(array);
- for (int i = 0; i < array.size(); i++)
- normalized[i] = Math.pow(10, array.get(i) - maxValue);
-
- // normalize
- double sum = 0.0;
- for (int i = 0; i < array.size(); i++)
- sum += normalized[i];
- for (int i = 0; i < array.size(); i++) {
- double x = normalized[i] / sum;
- if (takeLog10OfOutput)
- x = Math.log10(x);
- normalized[i] = x;
- }
-
- return normalized;
- }
-
/**
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
*
@@ -596,10 +587,6 @@ public class MathUtils {
return normalizeFromLog10(array, false);
}
- public static double[] normalizeFromLog10(List array) {
- return normalizeFromLog10(array, false);
- }
-
public static int maxElementIndex(final double[] array) {
return maxElementIndex(array, array.length);
}
@@ -1207,78 +1194,11 @@ public class MathUtils {
return ((double) num) / (Math.max(denom, 1));
}
- public static final double[] log10Cache;
- public static final double[] jacobianLogTable;
- public static final int JACOBIAN_LOG_TABLE_SIZE = 101;
- public static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
- public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP;
- public static final double MAX_JACOBIAN_TOLERANCE = 10.0;
- private static final int MAXN = 11000;
- private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
-
- static {
- log10Cache = new double[LOG10_CACHE_SIZE];
- jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
-
- log10Cache[0] = Double.NEGATIVE_INFINITY;
- for (int k = 1; k < LOG10_CACHE_SIZE; k++)
- log10Cache[k] = Math.log10(k);
-
- for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
- jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP));
-
- }
- }
-
- static public double softMax(final double[] vec) {
- double acc = vec[0];
- for (int k = 1; k < vec.length; k++)
- acc = softMax(acc, vec[k]);
-
- return acc;
-
- }
-
static public double max(double x0, double x1, double x2) {
double a = Math.max(x0, x1);
return Math.max(a, x2);
}
- static public double softMax(final double x0, final double x1, final double x2) {
- // compute naively log10(10^x[0] + 10^x[1]+...)
- // return Math.log10(MathUtils.sumLog10(vec));
-
- // better approximation: do Jacobian logarithm function on data pairs
- double a = softMax(x0, x1);
- return softMax(a, x2);
- }
-
- static public double softMax(final double x, final double y) {
- // we need to compute log10(10^x + 10^y)
- // By Jacobian logarithm identity, this is equal to
- // max(x,y) + log10(1+10^-abs(x-y))
- // we compute the second term as a table lookup
- // with integer quantization
-
- // slow exact version:
- // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
-
- double diff = x - y;
-
- if (diff > MAX_JACOBIAN_TOLERANCE)
- return x;
- else if (diff < -MAX_JACOBIAN_TOLERANCE)
- return y;
- else if (diff >= 0) {
- int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
- return x + jacobianLogTable[ind];
- }
- else {
- int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
- return y + jacobianLogTable[ind];
- }
- }
-
public static double phredScaleToProbability(byte q) {
return Math.pow(10, (-q) / 10.0);
}
@@ -1734,6 +1654,4 @@ public class MathUtils {
return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
-
-
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index 823eeeeb9..cfb0d11a1 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
- Arrays.asList("202b337ebbea3def1be8495eb363dfa8"));
+ Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232"));
executeTest("test MultiSample Pilot1", spec);
}
@@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
- Arrays.asList("ae29b9c9aacce8046dc780430540cd62"));
+ Arrays.asList("c5b53231f4f6d9524bc4ec8115f44f5c"));
executeTest("test SingleSample Pilot2", spec);
}
@@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
- Arrays.asList("10027d13befaa07b7900a7af0ae0791c"));
+ Arrays.asList("5af005255240a2186f04cb50851b8b6f"));
executeTest("test Multiple SNP alleles", spec);
}
@@ -70,7 +70,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
- private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65";
+ private final static String COMPRESSED_OUTPUT_MD5 = "a08df9aea2b3df09cf90ff8e6e3be3ea";
@Test
public void testCompressedOutput() {
@@ -91,7 +91,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
- String md5 = "32a34362dff51d8b73a3335048516d82";
+ String md5 = "6358934c1c26345013a38261b8c45aa4";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@@ -179,8 +179,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
- e.put( 0.01, "2cb2544739e01f6c08fd820112914317" );
- e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" );
+ e.put( 0.01, "926b58038dd4989bf7eda697a847eea9" );
+ e.put( 1.0 / 1850, "93f44105b43b65730a3b821e27b0fa16" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
- Arrays.asList("f0fbe472f155baf594b1eeb58166edef"));
+ Arrays.asList("a1b75a7e12b160b0be823228c958573f"));
executeTest(String.format("test multiple technologies"), spec);
}
@@ -223,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
- Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272"));
+ Arrays.asList("3bda1279cd6dcb47885f3e19466f11b9"));
executeTest(String.format("test calling with BAQ"), spec);
}
@@ -242,7 +242,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
- Arrays.asList("bd9d3d50a1f49605d7cd592a0f446899"));
+ Arrays.asList("d9fc3ba94a0d46029778c7b457e7292a"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@@ -257,7 +257,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
- Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6"));
+ Arrays.asList("b2e30ae3e5ffa6108f9f6178b1d2e679"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
- Arrays.asList("91cd6d2e3972b0b8e4064bb35a33241f"));
+ Arrays.asList("2cd182a84613fa91a6020466d2d327e2"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
- Arrays.asList("c60a44ba94a80a0cb1fba8b6f90a13cd"));
+ Arrays.asList("9cd08dc412a007933381e9c76c073899"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
}
@@ -290,7 +290,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
- Arrays.asList("320f61c87253aba77d6dc782242cba8b"));
+ Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}
@@ -300,7 +300,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
- Arrays.asList("c9897b80615c53a4ea10a4b193d56d9c"));
+ Arrays.asList("2609675a356f2dfc86f8a1d911210978"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}
@@ -309,7 +309,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
- Arrays.asList("5282fdb1711a532d726c13507bf80a21"));
+ Arrays.asList("4fdd8da77167881b71b3547da5c13f94"));
executeTest("test MultiSample Phase1 indels with complicated records", spec4);
}
diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java
index 75fc44873..1ba6c74d4 100755
--- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java
@@ -205,6 +205,24 @@ public class MathUtilsUnitTest extends BaseTest {
}
}
+ /**
+ * Private functions used by testArrayShuffle()
+ */
+ private boolean hasUniqueElements(Object[] x) {
+ for (int i = 0; i < x.length; i++)
+ for (int j = i + 1; j < x.length; j++)
+ if (x[i].equals(x[j]) || x[i] == x[j])
+ return false;
+ return true;
+ }
+
+ private boolean hasAllElements(final Object[] expected, final Object[] actual) {
+ HashSet
+ *
+ * @author Kiran Garimella, Mark DePristo
+ */
+public class ErrorRatePerCycle extends LocusWalker {
+ @Output PrintStream out;
+ @Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling", required=false)
+ public Integer MIN_BASE_QUAL = 0;
+ @Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling", required=false)
+ public Integer MIN_MAPPING_QUAL = 20;
+
+ private GATKReport report;
+ private GATKReportTable table;
+ private final static String reportName = "ErrorRatePerCycle";
+ private final static String reportDescription = "The error rate per sequenced position in the reads";
+
+ /**
+ * Allows us to use multiple records for the key (read group x cycle)
+ */
+ private static class TableKey implements Comparable {
+ final String readGroup;
+ final int cycle;
+
+ private TableKey(final String readGroup, final int cycle) {
+ this.readGroup = readGroup;
+ this.cycle = cycle;
+ }
+
+ @Override
+ public int compareTo(final TableKey tableKey) {
+ final int scmp = readGroup.compareTo(tableKey.readGroup);
+ if ( scmp == 0 )
+ return Integer.valueOf(cycle).compareTo(tableKey.cycle);
+ else
+ return scmp;
+ }
+ }
+
+ public void initialize() {
+ report = new GATKReport();
+ report.addTable(reportName, reportDescription);
+ table = report.getTable(reportName);
+ table.addPrimaryKey("key", false);
+ table.addColumn("readgroup", 0);
+ table.addColumn("cycle", 0);
+ table.addColumn("mismatches", 0);
+ table.addColumn("counts", 0);
+ table.addColumn("qual", 0);
+ table.addColumn("errorrate", 0.0f, "%.2e");
+ }
+
+ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ for ( final PileupElement p : context.getBasePileup() ) {
+ final GATKSAMRecord read = p.getRead();
+ final int offset = p.getOffset();
+ final boolean firstOfPair = ! read.getReadPairedFlag() || read.getFirstOfPairFlag();
+
+ if ( firstOfPair && read.getMappingQuality() >= MIN_MAPPING_QUAL && p.getQual() >= MIN_BASE_QUAL ) {
+ final byte readBase = p.getBase();
+ final byte refBase = ref.getBase();
+ final int cycle = offset;
+
+ if ( BaseUtils.isRegularBase(readBase) && BaseUtils.isRegularBase(refBase) ) {
+ final TableKey key = new TableKey(read.getReadGroup().getReadGroupId(), cycle);
+
+ if ( ! table.containsKey(key) ) {
+ table.set(key, "cycle", cycle);
+ table.set(key, "readgroup", read.getReadGroup().getReadGroupId());
+ }
+
+ table.increment(key, "counts");
+ if (readBase != refBase)
+ table.increment(key, "mismatches");
+ }
+ }
+ }
+
+ return null;
+ }
+
+ public Integer reduceInit() { return null; }
+
+ public Integer reduce(Integer value, Integer sum) { return null; }
+
+ public void onTraversalDone(Integer sum) {
+ for ( final Object key : table.getPrimaryKeys() ) {
+ final int mismatches = (Integer)table.get(key, "mismatches");
+ final int count = (Integer)table.get(key, "counts");
+ final double errorRate = (mismatches + 1) / (1.0*(count + 1));
+ final int qual = QualityUtils.probToQual(1-errorRate, 0.0);
+ table.set(key, "qual", qual);
+ table.set(key, "errorrate", errorRate);
+ }
+
+ report.print(out);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
index d7a48d321..14985907d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
@@ -94,9 +94,6 @@ import java.util.Map;
*
* @author Mark DePristo
*/
-
-
-
public class ReadGroupProperties extends ReadWalker {
@Output
public PrintStream out;
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java
new file mode 100644
index 000000000..accb9c0cf
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+
+public class ErrorRatePerCycleIntegrationTest extends WalkerTest {
+ @Test
+ public void basicTest() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T ErrorRatePerCycle -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-10,100,000 -o %s",
+ 1,
+ Arrays.asList("0cc212ecb6df300e321784039ff29f13"));
+ executeTest("ErrorRatePerCycle:", spec);
+ }
+}
\ No newline at end of file
From fbd2f04a04c63726024516ddeadf711afea7e8f5 Mon Sep 17 00:00:00 2001
From: Andrey Sivachenko
Date: Wed, 7 Mar 2012 17:29:42 -0500
Subject: [PATCH 23/25] JEXL support added; intermediate commit, not yet
functional
---
.../indels/SomaticIndelDetectorWalker.java | 66 ++++++++++++++++++-
1 file changed, 64 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index aa9ae1517..4247ab7a2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -26,6 +26,10 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -178,6 +182,10 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
"GENOMIC/UTR/INTRON/CODING and with the gene name", required=false)
String RefseqFileName = null;
+
+ @Argument(shortName="filter", doc="One or more criteria to use when selecting the data", required=false)
+ public ArrayList FILTER_EXPRESSIONS = new ArrayList();
+
//@Argument(fullName="blacklistedLanes", shortName="BL",
// doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+
// "by this application, so they will not contribute indels to consider and will not be counted.", required=false)
@@ -221,7 +229,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
private Writer verboseWriter = null;
- private static String annGenomic = "GENOMIC";
+ private static String annGenomic = "GENOMIC\t";
private static String annIntron = "INTRON";
private static String annUTR = "UTR";
private static String annCoding = "CODING";
@@ -245,6 +253,32 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed;
// can be 1 base before lastGenotyped start
+ private JexlEngine jexlEngine = new JexlEngine();
+ private ArrayList jexlExpressions = new ArrayList();
+
+ // the following arrays store indel source-specific (normal/tumor) metric names
+ // for fast access when populating JEXL expression contexts (see IndelPrecall.fillContext())
+ private final static String[] normalMetricsCassette = new String[4];
+ private final static String[] tumorMetricsCassette = new String[4];
+ private final static String[] singleMetricsCassette = new String[4];
+ private final static int C_COV=0;
+ private final static int C_CONS_CNT=1;
+ private final static int C_INDEL_F=2;
+ private final static int C_INDEL_CF=3;
+ static {
+ normalMetricsCassette[C_COV] = "N_COV";
+ tumorMetricsCassette[C_COV] = "T_COV";
+ singleMetricsCassette[C_COV] = "COV";
+ normalMetricsCassette[C_CONS_CNT] = "N_CONS_CNT";
+ tumorMetricsCassette[C_CONS_CNT] = "T_CONS_CNT";
+ singleMetricsCassette[C_CONS_CNT] = "CONS_CNT";
+ normalMetricsCassette[C_INDEL_F] = "N_INDEL_F";
+ tumorMetricsCassette[C_INDEL_F] = "T_INDEL_F";
+ singleMetricsCassette[C_INDEL_F] = "INDEL_F";
+ normalMetricsCassette[C_INDEL_CF] = "N_INDEL_CF";
+ tumorMetricsCassette[C_INDEL_CF] = "T_INDEL_CF";
+ singleMetricsCassette[C_INDEL_CF] = "INDEL_CF";
+ }
// "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"
@@ -389,6 +423,17 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ;
refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
+
+ // Now initialize JEXL expressions:
+ for ( String s : FILTER_EXPRESSIONS ) {
+ try {
+ Expression e = jexlEngine.createExpression(s);
+ jexlExpressions.add(e);
+ } catch (Exception e) {
+ throw new UserException.BadArgumentValue("Filter expression", "Invalid expression used (" + s + "). Please see the JEXL docs for correct syntax.") ;
+ }
+
+ }
}
@@ -829,6 +874,15 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH);
IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
+ JexlContext jc = new MapContext();
+ tumorCall.fillContext(jc,tumorMetricsCassette);
+ normalCall.fillContext(jc,normalMetricsCassette);
+ boolean result = false;
+
+ for ( Expression e : jexlExpressions ) {
+ if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { result=true; break; }
+ }
+
if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
if ( DEBUG ) {
System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
@@ -1602,6 +1656,13 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
public IndelVariant getVariant() { return consensus_indel; }
+ public void fillContext(JexlContext context,String[] cassette) {
+ context.set(cassette[C_INDEL_F],((double)consensus_indel_count)/total_coverage);
+ context.set(cassette[C_INDEL_CF],((double)consensus_indel_count/all_indel_count));
+ context.set(cassette[C_COV],total_coverage);
+ context.set(cassette[C_CONS_CNT],consensus_indel_count);
+ }
+
public boolean isCall() {
boolean ret = ( consensus_indel_count >= minIndelCount &&
(double)consensus_indel_count > minFraction * total_coverage &&
@@ -1610,8 +1671,9 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
" total_count="+all_indel_count+" cov="+total_coverage+
" minConsensusF="+((double)consensus_indel_count)/all_indel_count+
" minF="+((double)consensus_indel_count)/total_coverage);
- return ret;
+ return ret;
+// return true;
}
/** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed
From 497a1b059ef906d2f085b77a88732ff9eeca03d3 Mon Sep 17 00:00:00 2001
From: Andrey Sivachenko
Date: Wed, 7 Mar 2012 18:34:11 -0500
Subject: [PATCH 24/25] transition to JEXL completed, old parameters setting
individual cutoffs now deprecated
---
.../indels/SomaticIndelDetectorWalker.java | 155 ++++++++++--------
1 file changed, 90 insertions(+), 65 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index 4247ab7a2..733d32e3d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -151,30 +151,44 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false)
java.io.File bedOutput = null;
+ @Deprecated
@Argument(fullName="minCoverage", shortName="minCoverage",
doc="indel calls will be made only at sites with tumor coverage of minCoverage or more reads; "+
- "with --unpaired (single sample) option, this value is used for minimum sample coverage", required=false)
+ "with --unpaired (single sample) option, this value is used for minimum sample coverage. "+
+ "INSTEAD USE: T_COV {
String RefseqFileName = null;
- @Argument(shortName="filter", doc="One or more criteria to use when selecting the data", required=false)
+ @Argument(shortName="filter", doc="One or more logical expressions. If any of the expressions is TRUE, " +
+ "putative indel will be discarded and nothing will be printed into the output (unless genotyping "+
+ "at the specific position is explicitly requested, see -genotype). "+
+ "Default: T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7", required=false)
public ArrayList FILTER_EXPRESSIONS = new ArrayList();
//@Argument(fullName="blacklistedLanes", shortName="BL",
@@ -425,6 +442,13 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
// Now initialize JEXL expressions:
+ if ( FILTER_EXPRESSIONS.size() == 0 ) {
+ if ( call_unpaired ) {
+ FILTER_EXPRESSIONS.add("COV<6||INDEL_F<0.3||INDEL_CF<0.7");
+ } else {
+ FILTER_EXPRESSIONS.add("T_COV<6||N_COV<4||T_INDEL_F<0.3||T_INDEL_CF<0.7");
+ }
+ }
for ( String s : FILTER_EXPRESSIONS ) {
try {
Expression e = jexlEngine.createExpression(s);
@@ -706,14 +730,26 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue;
IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH);
+ JexlContext jc = new MapContext();
+ normalCall.fillContext(jc,singleMetricsCassette);
+ boolean discard_event = false;
- if ( normalCall.getCoverage() < minCoverage && ! genotype ) {
- if ( DEBUG ) {
- System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
- }
- continue; // low coverage
+ for ( Expression e : jexlExpressions ) {
+ if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
}
+ if ( discard_event && ! genotype ) {
+ normal_context.indelsAt(pos).clear();
+ continue; //*
+ }
+
+// if ( normalCall.getCoverage() < minCoverage && ! genotype ) {
+// if ( DEBUG ) {
+// System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
+// }
+// continue; // low coverage
+// }
+
if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos);
long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() );
@@ -742,24 +778,16 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos);
- boolean haveCall = normalCall.isCall(); // cache the value
-
- if ( haveCall || genotype) {
- if ( haveCall ) normalCallsMade++;
- printVCFLine(vcf_writer,normalCall);
- if ( bedWriter != null ) normalCall.printBedLine(bedWriter);
- if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall);
- lastGenotypedPosition = pos;
- }
+ if ( ! discard_event ) normalCallsMade++;
+ printVCFLine(vcf_writer,normalCall, discard_event);
+ if ( bedWriter != null ) normalCall.printBedLine(bedWriter);
+ if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, discard_event);
+ lastGenotypedPosition = pos;
normal_context.indelsAt(pos).clear();
// we dealt with this indel; don't want to see it again
// (we might otherwise in the case when 1) there is another indel that follows
// within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel)
-
-// for ( IndelVariant var : variants ) {
-// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
-// }
}
if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")");
@@ -877,24 +905,29 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
JexlContext jc = new MapContext();
tumorCall.fillContext(jc,tumorMetricsCassette);
normalCall.fillContext(jc,normalMetricsCassette);
- boolean result = false;
+ boolean discard_event = false;
for ( Expression e : jexlExpressions ) {
- if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { result=true; break; }
+ if ( ((Boolean)e.evaluate(jc)).booleanValue() ) { discard_event=true; break; }
}
- if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
- if ( DEBUG ) {
- System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
- }
- continue; // low coverage
- }
- if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) {
- if ( DEBUG ) {
- System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
- }
- continue; // low coverage
+ if ( discard_event && ! genotype ) {
+ tumor_context.indelsAt(pos).clear();
+ normal_context.indelsAt(pos).clear();
+ continue; //*
}
+// if ( tumorCall.getCoverage() < minCoverage && ! genotype ) {
+// if ( DEBUG ) {
+// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)");
+// }
+// continue; // low coverage
+// }
+// if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) {
+// if ( DEBUG ) {
+// System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)");
+// }
+// continue; // low coverage
+// }
if ( DEBUG ) {
System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, ");
@@ -922,32 +955,24 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right
-// location = getToolkit().getGenomeLocParser().setStart(location,pos);
-// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data
-
location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data
- boolean haveCall = tumorCall.isCall(); // cache the value
+// boolean haveCall = tumorCall.isCall(); // cache the value
- if ( haveCall || genotype ) {
- if ( haveCall ) tumorCallsMade++;
+ if ( ! discard_event ) tumorCallsMade++;
- printVCFLine(vcf_writer,normalCall,tumorCall);
+ printVCFLine(vcf_writer,normalCall,tumorCall,discard_event);
- if ( bedWriter != null ) tumorCall.printBedLine(bedWriter);
+ if ( bedWriter != null ) tumorCall.printBedLine(bedWriter);
+
+ if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall, discard_event );
+ lastGenotypedPosition = pos;
- if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall );
- lastGenotypedPosition = pos;
- }
tumor_context.indelsAt(pos).clear();
normal_context.indelsAt(pos).clear();
// we dealt with this indel; don't want to see it again
// (we might otherwise in the case when 1) there is another indel that follows
// within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel)
-
-// for ( IndelVariant var : variants ) {
-// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
-// }
}
if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")");
@@ -1001,14 +1026,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
}
- public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) {
+ public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, boolean discard_event) {
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location));
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
StringBuilder fullRecord = new StringBuilder();
fullRecord.append(makeFullRecord(normalCall));
fullRecord.append(annotationString);
- if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
+ if ( discard_event && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
try {
verboseWriter.write(fullRecord.toString());
verboseWriter.write('\n');
@@ -1019,7 +1044,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
}
- public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) {
+ public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall, boolean discard_event) {
RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location));
String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList));
@@ -1067,7 +1092,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
fullRecord.append('\t');
fullRecord.append(annotationString);
- if ( ! tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
+ if ( discard_event && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL");
try {
verboseWriter.write(fullRecord.toString());
@@ -1077,7 +1102,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
}
}
- public void printVCFLine(VCFWriter vcf, IndelPrecall call) {
+ public void printVCFLine(VCFWriter vcf, IndelPrecall call, boolean discard_event) {
long start = call.getPosition()-1;
// If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed.
@@ -1114,14 +1139,14 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
Map attrs = call.makeStatsAttributes(null);
- if ( call.isCall() ) // we made a call - put actual het genotype here:
+ if ( ! discard_event ) // we made a call - put actual het genotype here:
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
}
Set filters = null;
- if ( call.getVariant() != null && ! call.isCall() ) {
+ if ( call.getVariant() != null && discard_event ) {
filters = new HashSet();
filters.add("NoCall");
}
@@ -1149,7 +1174,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
}
}
- public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) {
+ public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall, boolean discard_event) {
long start = tCall.getPosition()-1;
long stop = start;
@@ -1166,7 +1191,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
Map attrs = new HashMap();
boolean isSomatic = false;
- if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) {
+ if ( nCall.getVariant() == null && tCall.getVariant() != null ) {
isSomatic = true;
attrs.put(VCFConstants.SOMATIC_KEY,true);
}
@@ -1209,7 +1234,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
}
Set filters = null;
- if ( tCall.getVariant() != null && ! tCall.isCall() ) {
+ if ( tCall.getVariant() != null && discard_event ) {
filters = new HashSet();
filters.add("NoCall");
}
@@ -1662,7 +1687,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
context.set(cassette[C_COV],total_coverage);
context.set(cassette[C_CONS_CNT],consensus_indel_count);
}
-
+/*
public boolean isCall() {
boolean ret = ( consensus_indel_count >= minIndelCount &&
(double)consensus_indel_count > minFraction * total_coverage &&
@@ -1675,7 +1700,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
return ret;
// return true;
}
-
+*/
/** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed
* variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus)
* @param variants
From 56f074b520e83e13ead8bb12d57541f12491498a Mon Sep 17 00:00:00 2001
From: Andrey Sivachenko
Date: Wed, 7 Mar 2012 18:47:15 -0500
Subject: [PATCH 25/25] docs updated
---
.../walkers/indels/SomaticIndelDetectorWalker.java | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
index 733d32e3d..59a7bd01a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java
@@ -75,7 +75,7 @@ import java.util.*;
*
* This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing
* data. Supported output formats are: BED format, extended verbose output (tab separated), and VCF. The latter two outputs
- * include additional statistics such as mismtaches and base qualitites around the calls, read strandness (how many
+ * include additional statistics such as mismatches and base qualities around the calls, read strandedness (how many
* forward/reverse reads support ref and indel alleles) etc. It is highly recommended to use these additional
* statistics to perform post-filtering of the calls as the tool is tuned for sensitivity (in other words it will
* attempt to "call" anything remotely reasonable based only on read counts and will generate all the additional
@@ -92,6 +92,16 @@ import java.util.*;
* bam tagging is not required in this case, and tags are completely ignored if still used: all input bams will be merged
* on the fly and assumed to represent a single sample - this tool does not check for sample id in the read groups).
*
+ * Which (putative) calls will make it into the output file(s) is controlled by an expression/list of expressions passed with -filter
+ * flag: if any of the expressions evaluate to TRUE, the site will be discarded. Otherwise the putative call and all the
+ * associated statistics will be printed into the output. Expressions recognize the following variables (in paired-sample
+ * somatic mode variables are prefixed with T_ and N_ for Tumor and Normal, e.g. N_COV and T_COV are defined instead of COV):
+ * COV for coverage at the site, INDEL_F for fraction of reads supporting consensus indel at the site (wrt total coverage),
+ * INDEL_CF for fraction of reads with consensus indel wrt all reads with an indel at the site, CONS_CNT for the count of
+ * reads supporting the consensus indel at the site. Conventional arithmetic and logical operations are supported. For instance,
+ * N_COV<4||T_COV<6||T_INDEL_F<0.3||T_INDEL_CF<0.7 instructs the tool to only output indel calls with at least 30% observed
+ * allelic fraction and with consensus indel making at least 70% of all indel observations at the site, and only at the sites
+ * where tumor coverage and normal coverage are at least 6 and 4, respectively.
*
Input
*
* Tumor and normal bam files (or single sample bam file(s) in --unpaired mode).