From 8907e42007c34b43dd59b47fb665527ff9df81dc Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 27 Oct 2011 20:53:48 -0400 Subject: [PATCH 3/3] First fully functional implementation of ValidationSiteSelectorWalker. User gives a) a set of input variants, b) a desired number of output variants, b) Optionally, a set of samples which will restrict sites to be polymorphic in those samples, c) a frequency selection mode: either uniform (no AF matching), or matching AF so that output sites mirror the input AF spectrum as closely as possible. More testing is needed and docs need improving but so far all functionality seems up and running --- .../walkers/variantutils/SelectVariants.java | 86 +------------------ 1 file changed, 2 insertions(+), 84 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 9b625fb04..2e17d68d8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -255,14 +255,6 @@ public class SelectVariants extends RodWalker { @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; - @Hidden - @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) - private boolean KEEP_AF_SPECTRUM = false; - - @Hidden - @Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false) - private File AF_FILE = new File(""); - @Hidden @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false) private File FAMILY_STRUCTURE_FILE = null; @@ -442,7 +434,7 @@ public class SelectVariants extends RodWalker { mvSet.add(mv); } } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(AF_FILE, e); + throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); } if (outMVFile != null) try { @@ -469,31 +461,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); - if (KEEP_AF_SPECTRUM) { - try { - afBreakpoints = new ArrayList(); - afBoosts = new ArrayList(); - logger.info("Reading in AF boost table..."); - boolean firstLine = false; - for ( final String line : new XReadLines( AF_FILE ) ) { - if (!firstLine) { - firstLine = true; - continue; - } - final String[] vals = line.split(" "); - double bkp = Double.valueOf(vals[0]); - double afb = Double.valueOf(vals[1]); - afBreakpoints.add(bkp); - afBoosts.add(afb); - - } - bkDelta = afBreakpoints.get(0); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(AF_FILE, e); - } - - } } /** @@ -566,61 +534,11 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_NUMBER) { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } - else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { + else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); } - else { - if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { - // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. - // We then pick a variant with probablity AF*desiredFraction - if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { - String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null); - - double af; - double afBoost = 1.0; - if (afo.contains(",")) { - String[] afs = afo.split(","); - afs[0] = afs[0].substring(1,afs[0].length()); - afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); - - double[] afd = new double[afs.length]; - - for (int k=0; k < afd.length; k++) - afd[k] = Double.valueOf(afs[k]); - - af = MathUtils.arrayMax(afd); - //af = Double.valueOf(afs[0]); - - } - else - af = Double.valueOf(afo); - - // now boost af by table read from file if desired - //double bkpt = 0.0; - int bkidx = 0; - if (!afBreakpoints.isEmpty()) { - for ( Double bkpt : afBreakpoints) { - if (af < bkpt + bkDelta) - break; - else bkidx++; - } - if (bkidx >=afBoosts.size()) - bkidx = afBoosts.size()-1; - afBoost = afBoosts.get(bkidx); - //System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost); - - } - - //System.out.format("%s .. %4.4f\n",afo.toString(), af); - if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost) - vcfWriter.add(sub); - } - - - } - } } }