From 6e389059cf906d33ea60f6c516fdf154c4f63fe3 Mon Sep 17 00:00:00 2001 From: kiran Date: Sun, 8 Aug 2010 02:57:06 +0000 Subject: [PATCH] An improved version of VariantSubset and VariantSelect, meant to replace those walkers. Takes in a VCF and creates a subsetted VCF by sample(s), JEXL expressions, or both. When subsetting by sample, the -SN argument is treated as a literal sample name and, if no match is found, as a regular expression. This allows for a large number of samples to be selected at once (useful when, for instance, cases are given one sample name prefix and controls are given another). After the subsetting procedure, the INFO-field annotations AC, AN, AF, and DP are all recalculated to properly reflect the new contents of the VCF. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3968 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantutils/SelectVariants.java | 224 ++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java new file mode 100755 index 000000000..80b6e8895 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -0,0 +1,224 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFHeaderLineType; +import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria, + * recompute the value of certain annotations based on the new sample set, and output a new VCF with the results. + */ +@Requires(value={},referenceMetaData=@RMD(name="variant", type=ReferenceOrderedDatum.class)) +public class SelectVariants extends RodWalker { + @Argument(fullName="sample", shortName="SN", doc="Sample(s) to include. Can be a single sample, specified multiple times for many samples, or a regular expression to select many samples", required=false) + public Set SAMPLE_EXPRESSIONS; + + @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false) + public ArrayList SELECT_EXPRESSIONS = new ArrayList(); + + @Argument(fullName="excludeNonVariants", shortName="ENV", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) + private boolean EXCLUDE_NON_VARIANTS = false; + + @Argument(fullName="excludeFiltered", shortName="EF", doc="Don't include filtered loci", required=false) + private boolean EXCLUDE_FILTERED = false; + + private ArrayList selectNames = new ArrayList(); + private List jexls = null; + + private Set samples = new HashSet(); + private Set possibleSampleRegexs = new HashSet(); + private Set sampleExpressionsThatDidNotWork = new HashSet(); + + private VCFWriter vcfWriter = null; + + /** + * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher + */ + public void initialize() { + vcfWriter = new VCFWriter(out); + + ArrayList rodNames = new ArrayList(); + rodNames.add("variant"); + + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + + if (SAMPLE_EXPRESSIONS != null) { + // Let's first assume that the values in SAMPLE_EXPRESSIONS are literal sample names and not regular + // expressions. Extract those samples specifically so we don't make the mistake of selecting more + // than what the user really wants. + for (String SAMPLE_EXPRESSION : SAMPLE_EXPRESSIONS) { + if (vcfSamples.contains(SAMPLE_EXPRESSION)) { + samples.add(SAMPLE_EXPRESSION); + } else { + possibleSampleRegexs.add(SAMPLE_EXPRESSION); + } + } + + // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions + for (String sampleRegex : possibleSampleRegexs) { + Pattern p = Pattern.compile(sampleRegex); + + boolean patternWorked = false; + + for (String vcfSample : vcfSamples) { + Matcher m = p.matcher(vcfSample); + if (m.find()) { + samples.add(vcfSample); + + patternWorked = true; + } + } + + if (!patternWorked) { + sampleExpressionsThatDidNotWork.add(sampleRegex); + } + } + + // Finally, warn the user about any leftover sample expressions that had no effect + if (sampleExpressionsThatDidNotWork.size() > 0) { + logger.warn("The following arguments to --sample did not work:"); + + for (String exp : sampleExpressionsThatDidNotWork) { + logger.warn("\t" + exp); + } + + logger.warn("Continuing on with the expressions that did work."); + } + } else { + samples.addAll(vcfSamples); + } + + Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); + headerLines.add(new VCFHeaderLine("source", "SelectVariants")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + + for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) { + // It's not necessary that the user supply select names for the JEXL expressions, since those + // expressions will only be needed for omitting records. Make up the select names here. + selectNames.add(String.format("select-%d", i)); + } + + jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS); + } + + /** + * If JEXL expressions are supplied, include only records that satisfy the expression + * + * @param tracker the ROD tracker + * @param ref reference information + * @param context alignment info + * @return true if no JEXL expressions are supplied or if a record satisfies all JEXL criteria, false if otherwise + */ + public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + VariantContext vc = tracker.getVariantContext(ref, "variant", null, context.getLocation(), true); + + for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { + if ( !VariantContextUtils.match(vc, jexl) ) { + return false; + } + } + + return true; + } + + /** + * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for printing) + * + * @param tracker the ROD tracker + * @param ref reference information + * @param context alignment info + * @return 1 if the record was printed to the output file, 0 if otherwise + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + VariantContext vc = tracker.getVariantContext(ref, "variant", null, context.getLocation(), true); + VariantContext sub = subsetRecord(vc, samples); + + if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + vcfWriter.add(sub, ref.getBase()); + } + + return 1; + } + + /** + * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF). + * + * @param vc the VariantContext record to subset + * @param samples the samples to extract + * @return the subsetted VariantContext + */ + private VariantContext subsetRecord(VariantContext vc, Set samples) { + if ( samples == null || samples.isEmpty() ) + return vc; + + ArrayList genotypes = new ArrayList(); + for ( Map.Entry genotypePair : vc.getGenotypes().entrySet() ) { + if ( samples.contains(genotypePair.getKey()) ) + genotypes.add(genotypePair.getValue()); + } + + VariantContext sub = vc.subContextFromGenotypes(genotypes); + + HashMap attributes = new HashMap(sub.getAttributes()); + + int alleleCount = 0; + int numberOfAlleles = 0; + int depth = 0; + for (String sample : sub.getSampleNames()) { + Genotype g = sub.getGenotype(sample); + + if (g.isNotFiltered() && g.isCalled()) { + numberOfAlleles += g.getPloidy(); + + if (g.isHet()) { alleleCount++; } + else if (g.isHomVar()) { alleleCount += 2; } + + String dp = (String) g.getAttribute("DP"); + if (dp != null) { + depth += Integer.valueOf(dp); + } + } + } + + attributes.put("AC", alleleCount); + attributes.put("AN", numberOfAlleles); + attributes.put("AF", ((double) alleleCount) / ((double) numberOfAlleles)); + attributes.put("DP", depth); + + sub = VariantContextUtils.modifyAttributes(sub, attributes); + + return sub; + } + + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + public void onTraversalDone(Integer result) { logger.info(result + " records processed."); } +}