Merge pull request #558 from broadinstitute/rp_vqsr_nondeterminism_fix

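Fix for non-determinism in the VQSR with very large data sets: every Collections.shuffle call now passes GenomeAnalysisEngine.getRandomGenerator() instead of relying on the default source of randomness.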
This commit is contained in:
Eric Banks 2014-03-12 14:35:51 -04:00
commit 7c7ff90266
2 changed files with 9 additions and 8 deletions

View File

@ -247,7 +247,7 @@ public class VariantDataManager {
logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." );
} else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) {
logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." );
Collections.shuffle(trainingData);
Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator());
return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA);
}
return trainingData;
@ -295,13 +295,13 @@ public class VariantDataManager {
public List<VariantDatum> getRandomDataForPlotting( final int numToAdd, final List<VariantDatum> trainingData, final List<VariantDatum> antiTrainingData, final List<VariantDatum> evaluationData ) {
final List<VariantDatum> returnData = new ExpandingArrayList<>();
Collections.shuffle(trainingData);
Collections.shuffle(antiTrainingData);
Collections.shuffle(evaluationData);
Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator());
Collections.shuffle(antiTrainingData, GenomeAnalysisEngine.getRandomGenerator());
Collections.shuffle(evaluationData, GenomeAnalysisEngine.getRandomGenerator());
returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size())));
returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size())));
returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size())));
Collections.shuffle(returnData);
Collections.shuffle(returnData, GenomeAnalysisEngine.getRandomGenerator());
return returnData;
}

View File

@ -46,6 +46,7 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
@ -75,9 +76,9 @@ public class RankSumUnitTest {
makeDistribution(distribution20_40, 40, skew, observations/2);
// shuffle the observations
Collections.shuffle(distribution20);
Collections.shuffle(distribution30);
Collections.shuffle(distribution20_40);
Collections.shuffle(distribution20, GenomeAnalysisEngine.getRandomGenerator());
Collections.shuffle(distribution30, GenomeAnalysisEngine.getRandomGenerator());
Collections.shuffle(distribution20_40, GenomeAnalysisEngine.getRandomGenerator());
}
private static void makeDistribution(final List<Integer> result, final int target, final int skew, final int numObservations) {