From e9201b81d11a6a42b5e08b68c18added2e21a0b5 Mon Sep 17 00:00:00 2001 From: kiran Date: Thu, 6 Jan 2011 14:54:56 +0000 Subject: [PATCH] A more general method for specifying samples to act on from the command-line. Supports samples specified individually on the console, a file of samples, or regular expressions to select multiple samples. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4945 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantutils/SelectVariants.java | 68 +------------------ .../sting/utils/SampleUtils.java | 56 ++++++++++++++- 2 files changed, 58 insertions(+), 66 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index db8583cfd..90cb66b36 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -75,87 +75,25 @@ public class SelectVariants extends RodWalker { private List jexls = null; private Set samples = new HashSet(); - private Set possibleSampleRegexs = new HashSet(); - private Set sampleExpressionsThatDidNotWork = new HashSet(); /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ public void initialize() { + // Get list of samples to include in the output ArrayList rodNames = new ArrayList(); rodNames.add("variant"); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - if (SAMPLE_EXPRESSIONS != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // sample list set, and treat the entries as if they had been specified on the command line. - Set samplesFromFiles = new HashSet(); - for (String SAMPLE_EXPRESSION : SAMPLE_EXPRESSIONS) { - File sampleFile = new File(SAMPLE_EXPRESSION); - - try { - XReadLines reader = new XReadLines(sampleFile); - - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line); - } - } catch (FileNotFoundException e) { - // ignore exception - } - } - - SAMPLE_EXPRESSIONS.addAll(samplesFromFiles); - - // Let's now assume that the values in SAMPLE_EXPRESSIONS are literal sample names and not regular - // expressions. Extract those samples specifically so we don't make the mistake of selecting more - // than what the user really wants. - for (String SAMPLE_EXPRESSION : SAMPLE_EXPRESSIONS) { - if (!(new File(SAMPLE_EXPRESSION).exists())) { - if (vcfSamples.contains(SAMPLE_EXPRESSION)) { - samples.add(SAMPLE_EXPRESSION); - } else { - possibleSampleRegexs.add(SAMPLE_EXPRESSION); - } - } - } - - // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions - for (String sampleRegex : possibleSampleRegexs) { - Pattern p = Pattern.compile(sampleRegex); - - boolean patternWorked = false; - - for (String vcfSample : vcfSamples) { - Matcher m = p.matcher(vcfSample); - if (m.find()) { - samples.add(vcfSample); - - patternWorked = true; - } - } - - if (!patternWorked) { - sampleExpressionsThatDidNotWork.add(sampleRegex); - } - } - - // Finally, warn the user about any leftover sample expressions that had no effect - if (sampleExpressionsThatDidNotWork.size() > 0) { - for (String exp : sampleExpressionsThatDidNotWork) { - logger.warn("The sample expression '" + exp + "' had no effect (no matching sample or pattern match found). Skipping."); - } - } - } else { - samples.addAll(vcfSamples); - } + samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS); for (String sample : samples) { logger.info("Including sample '" + sample + "'"); } + // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); headerLines.add(new VCFHeaderLine("source", "SelectVariants")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index de0d7e411..88187e525 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -215,5 +215,59 @@ public class SampleUtils { return new ArrayList(); } - + public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { + Set samples = new HashSet(); + + if (sampleExpressions != null) { + // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our + // sample list set, and treat the entries as if they had been specified on the command line. + Set samplesFromFiles = new HashSet(); + for (String sampleExpression : sampleExpressions) { + File sampleFile = new File(sampleExpression); + + try { + XReadLines reader = new XReadLines(sampleFile); + + List lines = reader.readLines(); + for (String line : lines) { + samplesFromFiles.add(line); + } + } catch (FileNotFoundException e) { + // ignore exception + } + } + + sampleExpressions.addAll(samplesFromFiles); + + // Let's now assume that the values in sampleExpressions are literal sample names and not regular + // expressions. Extract those samples specifically so we don't make the mistake of selecting more + // than what the user really wants. + Set possibleSampleRegexs = new HashSet(); + for (String sampleExpression : sampleExpressions) { + if (!(new File(sampleExpression).exists())) { + if (vcfSamples.contains(sampleExpression)) { + samples.add(sampleExpression); + } else { + possibleSampleRegexs.add(sampleExpression); + } + } + } + + // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions + for (String sampleRegex : possibleSampleRegexs) { + Pattern p = Pattern.compile(sampleRegex); + + for (String vcfSample : vcfSamples) { + Matcher m = p.matcher(vcfSample); + if (m.find()) { + samples.add(vcfSample); + } + } + } + } else { + samples.addAll(vcfSamples); + } + + return samples; + } } \ No newline at end of file