From 907d1d6160cac43298c364c0128c960450df3b6e Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 12 Mar 2014 10:25:12 -0400 Subject: [PATCH] Fix for non-determinism in the VQSR with very large data sets --- .../variantrecalibration/VariantDataManager.java | 10 +++++----- .../sting/gatk/walkers/annotator/RankSumUnitTest.java | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 1f355359d..f16399e62 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -247,7 +247,7 @@ public class VariantDataManager { logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) { logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." ); - Collections.shuffle(trainingData); + Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator()); return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA); } return trainingData; @@ -295,13 +295,13 @@ public class VariantDataManager { public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { final List returnData = new ExpandingArrayList<>(); - Collections.shuffle(trainingData); - Collections.shuffle(antiTrainingData); - Collections.shuffle(evaluationData); + Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(antiTrainingData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(evaluationData, GenomeAnalysisEngine.getRandomGenerator()); returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size()))); returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size()))); returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size()))); - Collections.shuffle(returnData); + Collections.shuffle(returnData, GenomeAnalysisEngine.getRandomGenerator()); return returnData; } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java index b1c280748..0ec1dd996 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MannWhitneyU; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -75,9 +76,9 @@ public class RankSumUnitTest { makeDistribution(distribution20_40, 40, skew, observations/2); // shuffle the observations - Collections.shuffle(distribution20); - Collections.shuffle(distribution30); - Collections.shuffle(distribution20_40); + Collections.shuffle(distribution20, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(distribution30, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(distribution20_40, GenomeAnalysisEngine.getRandomGenerator()); } private static void makeDistribution(final List result, final int target, final int skew, final int numObservations) {