diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java index a2c49df4d..cac206083 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java @@ -84,6 +84,8 @@ public class VariantDataManager { private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); protected final List trainingSets; + private static final double SAFETY_OFFSET = 0.01; //To use for example as 1/(X + SAFETY_OFFSET) to protect against dividing or taking log of X=0. + private static final double PRECISION = 0.01; //To use mainly with MathUrils.compareDoubles(a,b,PRECISON) public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = Collections.emptyList(); @@ -334,15 +336,19 @@ public class VariantDataManager { int iii = 0; for( final String key : annotationKeys ) { isNull[iii] = false; - annotations[iii] = decodeAnnotation( key, vc, jitter ); + annotations[iii] = decodeAnnotation( key, vc, jitter, VRAC ); if( Double.isNaN(annotations[iii]) ) { isNull[iii] = true; } iii++; } datum.annotations = annotations; datum.isNull = isNull; } + /** Transforms an interval [xmin, xmax] to (-inf, +inf) **/ + private static double logitTransform( final double x, final double xmin, final double xmax) { + return Math.log((x - xmin)/(xmax - x)); + } - private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) { + private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter, final VariantRecalibratorArgumentCollection vrac ) { double value; final double LOG_OF_TWO = 0.6931472; @@ -350,10 +356,20 @@ public class VariantDataManager { try { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln + if( jitter && annotationKey.equalsIgnoreCase("MQ")) { + if( vrac.MQ_CAP > 0) { + value = logitTransform(value, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET); + if (MathUtils.compareDoubles(value, logitTransform(vrac.MQ_CAP, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET), PRECISION) == 0 ) { + value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian(); + } + } else if( MathUtils.compareDoubles(value, vrac.MQ_CAP, PRECISION) == 0 ) { + value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian(); + } + } } catch( Exception e ) { value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java index 1547bbf2a..01970d9d0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java @@ -77,7 +77,8 @@ public class VariantRecalibratorArgumentCollection { throw new ReviewedGATKException("VariantRecalibrator mode string is unrecognized, input = " + input); } /** - * Use either SNP for recalibrating only SNPs (emitting indels untouched in the output VCF) or INDEL for indels (emitting SNPs untouched in the output VCF). There is also a BOTH option for recalibrating both SNPs and indels simultaneously, but this is meant for testing purposes only and should not be used in actual analyses. + * Use either SNP for recalibrating only SNPs (emitting indels untouched in the output VCF) or INDEL for indels (emitting SNPs untouched in the output VCF). + * There is also a BOTH option for recalibrating both SNPs and indels simultaneously, but this is meant for testing purposes only and should not be used in actual analyses. */ @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ", required = true) public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; @@ -158,6 +159,30 @@ public class VariantRecalibratorArgumentCollection { @Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="LOD score cutoff for selecting bad variants", required=false) public double BAD_LOD_CUTOFF = -5.0; + @Advanced + @Argument(fullName="MQCapForLogitJitterTransform", shortName = "MQCap", doc="MQ is capped at a \"max\" value (60 for bwa-mem) when the alignment is considered perfect." + + "Since often a huge proportion of reads are perfectly mapped, this yields a distribution with a blob < max and a huge peak at max" + + "This is not good for the mixture of Gaussian VQSR model and has been observed to yield a ROC curve with a jump." + + "Using MQCap = X has 2 effects: (1) MQs are transformed by a scaled logit on [0,X] (+ epsilon to avoid division by zero)" + + "to make the blob more Gaussian-like and (2) The transformed MQ=X are jittered to break the peak into a narrow Gaussian." + + "Beware that IndelRealigner, if used, adds 10 to MQ for successfully realigned indels." + + "We recommend to use --read-filter ReassignOriginalMQAfterIndelRealignment with HaplotypeCaller, but if not, use a MQCap=max+10 to take that into account." + + "If this option is not used, or if MQCap is set to 0, MQ will not be transformed.") + public int MQ_CAP = 0; + /** + * The following 2 arguments are hidden because they are only for testing different jitter amounts with and without logit transform. + * Once this will have been tested, and the correct jitter amount chosen (perhaps as a function of the logit range [0,max]) they can be removed. + */ + @Hidden + @Advanced + @Argument(fullName = "no_MQ_logit", shortName = "NoMQLogit", doc="MQ is by default transformed to log[(MQ_cap + epsilon - MQ)/(MQ + epsilon)] to make it more Gaussian-like. Use this flag to not do that.", required = false) + public boolean NO_MQ_LOGIT = false; + + @Hidden + @Advanced + @Argument(fullName="MQ_jitter", shortName="MQJitt", doc="Amount of jitter (as a factor to a Normal(0,1) noise) to add to the MQ capped values", required = false) + public double MQ_JITTER = 0.05; + ///////////////////////////// // Deprecated Arguments // Keeping them here is meant to provide users with error messages that are more informative than "arg not defined" when they use an argument that has been put out of service diff --git a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/tools/walkers/variantrecalibration/plot_Tranches.R b/protected/gatk-tools-protected/src/main/resources/org.broadinstitute.gatk/tools.walkers/variantrecalibration/plot_Tranches.R old mode 100755 new mode 100644 similarity index 98% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/tools/walkers/variantrecalibration/plot_Tranches.R rename to protected/gatk-tools-protected/src/main/resources/org.broadinstitute.gatk/tools.walkers/variantrecalibration/plot_Tranches.R index 4fe59083c..d96add768 --- a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/tools/walkers/variantrecalibration/plot_Tranches.R +++ b/protected/gatk-tools-protected/src/main/resources/org.broadinstitute.gatk/tools.walkers/variantrecalibration/plot_Tranches.R @@ -41,7 +41,7 @@ leftShift <- function(x, leftValue = 0) { # Tranches plot # ----------------------------------------------------------------------------------------------- data2 = read.table(tranchesFile,sep=",",head=T) -data2 = data2[order(data2$targetTruthSensitivity, decreasing=T),] +data2 = data2[order(data2$novelTiTv, decreasing=F),] #data2 = data2[order(data2$FDRtranche, decreasing=T),] cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange") density=c(20, -1, -1, 20)