From dac1309dbd2d84c23d399caf3984edb0f31d441f Mon Sep 17 00:00:00 2001 From: carneiro Date: Thu, 31 Mar 2011 21:12:40 +0000 Subject: [PATCH] Added two modes for selecting variants at random (random sampling). -number N -- generates a VCF with exactly N randomly chosen variants with equal probability. -fraction F -- generates a VCF with approximately F (between 0-1) randomly chosen variants with equal probability. (Similar behavior to RandomlySplitVariants walker). The reason for two modes is that the first one may need a lot of memory if your sample size is too large. The wiki is being updated with this information now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5545 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantutils/SelectVariants.java | 120 +++++++++++++++--- 1 file changed, 104 insertions(+), 16 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index ccf5ac29a..3c28deef7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -82,6 +82,30 @@ public class SelectVariants extends RodWalker { @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; + @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track. Variants are kept in memory to guarantee that n variants will be output, so use it only for a reasonable number of variants. Use select_random_fraction for larger numbers of variants", required=false) + private int numRandom = 0; + + @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (0,1) of variants at random from the variant track. Routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions", required=false) + private double fractionRandom = 0; + + + /* Private class used to store the intermediate variants in the integer random selection process */ + private class RandomVariantStructure { + private VariantContext vc; + private byte refBase; + + RandomVariantStructure(VariantContext vcP, byte refBaseP) { + vc = vcP; + refBase = refBaseP; + } + + public void set (VariantContext vcP, byte refBaseP) { + vc = vcP; + refBase = refBaseP; + } + + } + private ArrayList selectNames = new ArrayList(); private List jexls = null; @@ -97,6 +121,19 @@ public class SelectVariants extends RodWalker { private final String variantRodName = "variant"; + /* variables used by the SELECT RANDOM modules */ + private boolean SELECT_RANDOM_NUMBER = false; + private boolean SELECT_RANDOM_FRACTION = false; + private int variantNumber = 0; + private int nVariantsAdded = 0; + private int positionToAdd = 0; + private Random generator; + private RandomVariantStructure [] variantArray; + + + + + /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ @@ -113,6 +150,20 @@ public class SelectVariants extends RodWalker { logger.info("Including sample '" + sample + "'"); } + // Initialize VCF header + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + headerLines.add(new VCFHeaderLine("source", "SelectVariants")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + + for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { + // It's not necessary that the user supply select names for the JEXL expressions, since those + // expressions will only be needed for omitting records. Make up the select names here. + selectNames.add(String.format("select-%d", i)); + } + + jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); + + // Look at the parameters to decide which analysis to perform DISCORDANCE_ONLY = discordanceRodName.length() > 0; if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName); @@ -126,18 +177,16 @@ public class SelectVariants extends RodWalker { MENDELIAN_VIOLATIONS = true; } - // Initialize VCF header - Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); - headerLines.add(new VCFHeaderLine("source", "SelectVariants")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); - - for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { - // It's not necessary that the user supply select names for the JEXL expressions, since those - // expressions will only be needed for omitting records. Make up the select names here. - selectNames.add(String.format("select-%d", i)); + SELECT_RANDOM_NUMBER = numRandom > 0; + if (SELECT_RANDOM_NUMBER) { + logger.info("Selecting " + numRandom + " variants at random from the variant track"); + variantArray = new RandomVariantStructure[numRandom]; } - jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); + SELECT_RANDOM_FRACTION = fractionRandom > 0; + if (SELECT_RANDOM_FRACTION) logger.info("Selecting " + fractionRandom + " variants at random from the variant track"); + + generator = new Random(); } /** @@ -201,7 +250,12 @@ public class SelectVariants extends RodWalker { return 0; } } - vcfWriter.add(sub, ref.getBase()); + if (SELECT_RANDOM_NUMBER) { + randomlyAddVariant(++variantNumber, sub, ref.getBase()); + } + else if (!SELECT_RANDOM_FRACTION || generator.nextDouble() < fractionRandom) { + vcfWriter.add(sub, ref.getBase()); + } } } } @@ -209,6 +263,27 @@ public class SelectVariants extends RodWalker { return 1; } + + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + public void onTraversalDone(Integer result) { + logger.info(result + " records processed."); + + if (SELECT_RANDOM_NUMBER) { + int positionToPrint = positionToAdd; + for (int i=0; i { return sub; } - @Override - public Integer reduceInit() { return 0; } + private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { + if (nVariantsAdded < numRandom) + variantArray[nVariantsAdded++] = new RandomVariantStructure(vc, refBase); - @Override - public Integer reduce(Integer value, Integer sum) { return value + sum; } + else { + double v = generator.nextDouble(); + double t = (1.0/(rank-numRandom+1)); + if ( v < t) { + variantArray[positionToAdd].set(vc, refBase); + nVariantsAdded++; + positionToAdd = nextCircularPosition(positionToAdd, numRandom); + } + } + } - public void onTraversalDone(Integer result) { logger.info(result + " records processed."); } + private int nextCircularPosition(int cur, int size) { + if ((cur + 1) == size) + return 0; + return cur + 1; + } }