Logistic transform of MQ + jitter to capped MQ in VariantDataManager

2015-02-12 12:34:16 -05:00 · 2015-02-12 12:34:16 -05:00 · eae4c875a9
parent cd9db60ce2
commit eae4c875a9
3 changed files with 49 additions and 8 deletions
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java
@ -84,6 +84,8 @@ public class VariantDataManager {
    private final VariantRecalibratorArgumentCollection VRAC;
    protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
    protected final List<TrainingSet> trainingSets;
+    private static final double SAFETY_OFFSET = 0.01;     //To use for example as 1/(X + SAFETY_OFFSET) to protect against dividing or taking log of X=0.
+    private static final double PRECISION = 0.01;         //To use mainly with MathUrils.compareDoubles(a,b,PRECISON)

    public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
        this.data = Collections.emptyList();
@ -334,15 +336,19 @@ public class VariantDataManager {
        int iii = 0;
        for( final String key : annotationKeys ) {
            isNull[iii] = false;
-            annotations[iii] = decodeAnnotation( key, vc, jitter );
+            annotations[iii] = decodeAnnotation( key, vc, jitter, VRAC );
            if( Double.isNaN(annotations[iii]) ) { isNull[iii] = true; }
            iii++;
        }
        datum.annotations = annotations;
        datum.isNull = isNull;
    }
+    /** Transforms an interval [xmin, xmax] to (-inf, +inf) **/
+    private static double logitTransform( final double x, final double xmin, final double xmax) {
+        return Math.log((x - xmin)/(xmax - x));
+    }

-    private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) {
+    private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter, final VariantRecalibratorArgumentCollection vrac ) {
        double value;

        final double LOG_OF_TWO = 0.6931472;
@ -350,10 +356,20 @@ public class VariantDataManager {
        try {
            value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
            if( Double.isInfinite(value) ) { value = Double.NaN; }
-            if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
-            if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
-            if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
-            if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }   //min SOR is 2.0, then we take ln
+            if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
+            if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
+            if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
+            if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }   //min SOR is 2.0, then we take ln
+            if( jitter && annotationKey.equalsIgnoreCase("MQ")) {
+                if( vrac.MQ_CAP > 0) {
+                    value = logitTransform(value, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET);
+                    if (MathUtils.compareDoubles(value, logitTransform(vrac.MQ_CAP, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET), PRECISION) == 0 ) {
+                        value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian();
+                    }
+                } else if( MathUtils.compareDoubles(value, vrac.MQ_CAP, PRECISION) == 0 ) {
+                    value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian();
+                }
+            }
        } catch( Exception e ) {
            value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
        }
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java
@ -77,7 +77,8 @@ public class VariantRecalibratorArgumentCollection {
        throw new ReviewedGATKException("VariantRecalibrator mode string is unrecognized, input = " + input);
    }
    /**
-     * Use either SNP for recalibrating only SNPs (emitting indels untouched in the output VCF) or INDEL for indels (emitting SNPs untouched in the output VCF). There is also a BOTH option for recalibrating both SNPs and indels simultaneously, but this is meant for testing purposes only and should not be used in actual analyses.
+     * Use either SNP for recalibrating only SNPs (emitting indels untouched in the output VCF) or INDEL for indels (emitting SNPs untouched in the output VCF).
+     * There is also a BOTH option for recalibrating both SNPs and indels simultaneously, but this is meant for testing purposes only and should not be used in actual analyses.
     */
    @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ", required = true)
    public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;
@ -158,6 +159,30 @@ public class VariantRecalibratorArgumentCollection {
    @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="LOD score cutoff for selecting bad variants", required=false)
    public double BAD_LOD_CUTOFF = -5.0;

+    @Advanced
+    @Argument(fullName="MQCapForLogitJitterTransform", shortName = "MQCap", doc="MQ is capped at a \"max\" value (60 for bwa-mem) when the alignment is considered perfect." +
+            "Since often a huge proportion of reads are perfectly mapped, this yields a distribution with a blob < max and a huge peak at max" +
+            "This is not good for the mixture of Gaussian VQSR model and has been observed to yield a ROC curve with a jump." +
+            "Using MQCap = X has 2 effects:  (1) MQs are transformed by a scaled logit on [0,X] (+ epsilon to avoid division by zero)" +
+            "to make the blob more Gaussian-like and (2) The transformed MQ=X are jittered to break the peak into a narrow Gaussian." +
+            "Beware that IndelRealigner, if used, adds 10 to MQ for successfully realigned indels." +
+            "We recommend to use --read-filter ReassignOriginalMQAfterIndelRealignment with HaplotypeCaller, but if not, use a MQCap=max+10 to take that into account." +
+            "If this option is not used, or if MQCap is set to 0, MQ will not be transformed.")
+    public int MQ_CAP = 0;
+    /**
+     * The following 2 arguments are hidden because they are only for testing different jitter amounts with and without logit transform.
+     * Once this will have been tested, and the correct jitter amount chosen (perhaps as a function of the logit range [0,max]) they can be removed.
+     */
+    @Hidden
+    @Advanced
+    @Argument(fullName = "no_MQ_logit", shortName = "NoMQLogit", doc="MQ is by default transformed to log[(MQ_cap + epsilon - MQ)/(MQ + epsilon)] to make it more Gaussian-like.  Use this flag to not do that.", required = false)
+    public boolean NO_MQ_LOGIT = false;
+
+    @Hidden
+    @Advanced
+    @Argument(fullName="MQ_jitter", shortName="MQJitt", doc="Amount of jitter (as a factor to a Normal(0,1) noise) to add to the MQ capped values", required = false)
+    public double MQ_JITTER = 0.05;
+
    /////////////////////////////
    // Deprecated Arguments
    // Keeping them here is meant to provide users with error messages that are more informative than "arg not defined" when they use an argument that has been put out of service
--- a/protected/gatk-tools-protected/src/main/resources/org.broadinstitute.gatk/tools.walkers/variantrecalibration/plot_Tranches.R
+++ b/protected/gatk-tools-protected/src/main/resources/org.broadinstitute.gatk/tools.walkers/variantrecalibration/plot_Tranches.R
@ -41,7 +41,7 @@ leftShift <- function(x, leftValue = 0) {
 # Tranches plot
 # -----------------------------------------------------------------------------------------------
 data2 = read.table(tranchesFile,sep=",",head=T)
-data2 = data2[order(data2$targetTruthSensitivity, decreasing=T),]
+data2 = data2[order(data2$novelTiTv, decreasing=F),]
 #data2 = data2[order(data2$FDRtranche, decreasing=T),]
 cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
 density=c(20, -1, -1, 20)