From 91fb664135aba1f318f4845159327aa993cc66b6 Mon Sep 17 00:00:00 2001 From: carneiro Date: Thu, 23 Jun 2011 20:18:45 +0000 Subject: [PATCH] Many updates to SelectVariants : 1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work) 1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1) 2) Discordance and Concordance now work in combination with all other parameters. 3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation. 4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information. 5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample. 6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree. --- Integration tests: 1) Discordance and concordance test added 2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples. --- Methods for handling sample expressions and files with list of samples were added to SampleUtils. I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantutils/SelectVariants.java | 127 +++++++++++++++--- .../sting/utils/SampleUtils.java | 47 +++++++ .../org/broadinstitute/sting/BaseTest.java | 5 + .../SelectVariantsIntegrationTest.java | 31 ++++- 4 files changed, 193 insertions(+), 17 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 316b98424..08f55b533 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; @@ -44,6 +45,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.vcf.VCFUtils; +import java.io.File; import java.util.*; /** @@ -56,8 +58,14 @@ public class SelectVariants extends RodWalker { @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sample", shortName="sn", doc="Sample(s) to include. Can be a single sample, specified multiple times for many samples, a file containing sample names, a regular expression to select many samples, or any combination thereof.", required=false) - public Set SAMPLE_EXPRESSIONS; + @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set sampleNames; + + @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times.", required=false) + public Set sampleExpressions; + + @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) + public Set sampleFiles; @Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false) public ArrayList SELECT_EXPRESSIONS = new ArrayList(); @@ -65,10 +73,9 @@ public class SelectVariants extends RodWalker { @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) private boolean EXCLUDE_NON_VARIANTS = false; - @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci.", required=false) + @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false) private boolean EXCLUDE_FILTERED = false; - @Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false) private String discordanceRodName = ""; @@ -117,7 +124,8 @@ public class SelectVariants extends RodWalker { private ArrayList selectNames = new ArrayList(); private List jexls = null; - private Set samples = new HashSet(); + private TreeSet samples = new TreeSet(); + private boolean NO_SAMPLES_SPECIFIED = false; private boolean DISCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false; @@ -149,11 +157,20 @@ public class SelectVariants extends RodWalker { rodNames.add(variantRodName); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); + Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); + + samples.addAll(samplesFromFile); + samples.addAll(samplesFromExpressions); + samples.addAll(sampleNames); + + if(samples.isEmpty()) { + samples.addAll(vcfSamples); + NO_SAMPLES_SPECIFIED = true; + } - // TODO -- This causes the sample ordering in the VCF to be "random" (i.e. whatever order they are pulled from the HashSet). - // TODO -- We should either enforce alphabetical ordering here (as with other walkers) or base it on the order from the command-line - but we shouldn't leave this up to Java to decide. - samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS); for (String sample : samples) { logger.info("Including sample '" + sample + "'"); } @@ -192,7 +209,7 @@ public class SelectVariants extends RodWalker { } SELECT_RANDOM_FRACTION = fractionRandom > 0; - if (SELECT_RANDOM_FRACTION) logger.info("Selecting " + fractionRandom + " variants at random from the variant track"); + if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track"); } /** @@ -222,13 +239,13 @@ public class SelectVariants extends RodWalker { } if (DISCORDANCE_ONLY) { Collection compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false); - if (!compVCs.isEmpty()) - return 0; // return is correct: no matter what vcs has, if compVC is not empty there's nothing more to do at this site + if (!isDiscordant(vc, compVCs)) + return 0; } if (CONCORDANCE_ONLY) { Collection compVCs = tracker.getVariantContexts(ref, concordanceRodName, null, context.getLocation(), true, false); - if (compVCs.isEmpty()) - return 0; // return is correct: no matter what vcs has, if compVC is empty there's nothing more to do at this site + if (!isConcordant(vc, compVCs)) + return 0; } // TODO - add ability to also select MNPs @@ -241,7 +258,6 @@ public class SelectVariants extends RodWalker { VariantContext sub = subsetRecord(vc, samples); if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { - //System.out.printf("%s%n",sub.toString()); for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { return 0; @@ -260,7 +276,88 @@ public class SelectVariants extends RodWalker { return 1; } + /** + * Checks if vc has a variant call for (at least one of) the samples. + * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap) + * @param compVCs the comparison VariantContext (discordance + * @return + */ + private boolean isDiscordant (VariantContext vc, Collection compVCs) { + if (vc == null) + return false; + // if we're not looking at specific samples then the absense of a compVC means discordance + if (NO_SAMPLES_SPECIFIED && (compVCs == null || compVCs.isEmpty())) + return true; + + // check if we find it in the variant rod + Map genotypes = vc.getGenotypes(samples); + for (Genotype g : genotypes.values()) { + if (sampleHasVariant(g)) { + // There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples. + if (compVCs == null) + return true; + // Look for this sample in the all vcs of the comp ROD track. + boolean foundVariant = false; + for (VariantContext compVC : compVCs) { + if (sampleHasVariant(compVC.getGenotype(g.getSampleName()))) { + foundVariant = true; + break; + } + } + // if (at least one sample) was not found in all VCs of the comp ROD, we have discordance + if (!foundVariant) + return true; + } + } + return false; // we only get here if all samples have a variant in the comp rod. + } + + private boolean isConcordant (VariantContext vc, Collection compVCs) { + if (vc == null || compVCs == null || compVCs.isEmpty()) + return false; + + // if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant. + if (NO_SAMPLES_SPECIFIED) + return true; + + // make a list of all samples contained in this variant VC that are being tracked by the user command line arguments. + Set variantSamples = vc.getSampleNames(); + variantSamples.retainAll(samples); + + // check if we can find all samples from the variant rod in the comp rod. + for (String sample : variantSamples) { + boolean foundSample = false; + for (VariantContext compVC : compVCs) { + Genotype varG = vc.getGenotype(sample); + Genotype compG = compVC.getGenotype(sample); + if (haveSameGenotypes(varG, compG)) { + foundSample = true; + break; + } + } + // if at least one sample doesn't have the same genotype, we don't have concordance + if (!foundSample) { + return false; + } + } + return true; + } + + private boolean sampleHasVariant(Genotype g) { + return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED))); + } + + private boolean haveSameGenotypes(Genotype g1, Genotype g2) { + if ((g1.isCalled() && g2.isFiltered()) || + (g2.isCalled() && g1.isFiltered()) || + (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) + return false; + + List a1s = g1.getAlleles(); + List a2s = g2.getAlleles(); + return (a1s.containsAll(a2s) && a2s.containsAll(a1s)); + } @Override public Integer reduceInit() { return 0; } diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 88187e525..d15eb8e86 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -270,4 +270,51 @@ public class SampleUtils { return samples; } + + /** + * Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression + * @param originalSamples list of samples to select samples from + * @param sampleExpressions list of expressions to use for matching samples + * @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions + */ + public static Collection matchSamplesExpressions (Collection originalSamples, Collection sampleExpressions) { + // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions + Set samples = new HashSet(); + if (sampleExpressions != null) { + for (String expression : sampleExpressions) { + Pattern p = Pattern.compile(expression); + + for (String originalSample : originalSamples) { + Matcher m = p.matcher(originalSample); + if (m.find()) { + samples.add(originalSample); + } + } + } + } + return samples; + } + + /** + * Given a list of files with sample names it reads all files and creates a list of unique samples from all these files. + * @param files list of files with sample names in + * @return a collection of unique samples from all files + */ + public static Collection getSamplesFromFiles (Collection files) { + Set samplesFromFiles = new HashSet(); + if (files != null) { + for (File file : files) { + try { + XReadLines reader = new XReadLines(file); + List lines = reader.readLines(); + for (String line : lines) { + samplesFromFiles.add(line); + } + } catch (FileNotFoundException e) { + // ignore exception + } + } + } + return samplesFromFiles; + } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/java/test/org/broadinstitute/sting/BaseTest.java index 693a593f0..07f46c1c3 100755 --- a/java/test/org/broadinstitute/sting/BaseTest.java +++ b/java/test/org/broadinstitute/sting/BaseTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.commandline.CommandLineUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; +import javax.swing.*; import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; @@ -66,6 +67,10 @@ public abstract class BaseTest { public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.rod"; public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; + public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; + public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; + public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf"; + public static final String intervalsLocation = GATKDataLocation; public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 66a7b4d9b..e18287a21 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -16,9 +16,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"), 1, - Arrays.asList("1df84e2b755cce19f1876710ec38dd2c") + Arrays.asList("1b9d551298dc048c7d36b60440ff4d50") ); executeTest("testComplexSelection--" + testfile, spec); @@ -36,4 +36,31 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testRepeatedLineSelection--" + testfile, spec); } + + @Test + public void testDiscordance() { + String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -disc myvar -L 20:1012700-1020000 -B:variant,VCF " + b37hapmapGenotypes + " -B:myvar,VCF " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("97621ae8f29955eedfc4e0be3515fcb9") + ); + + executeTest("testDiscordance--" + testFile, spec); + } + + @Test + public void testConcordance() { + String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -conc hapmap -L 20:1012700-1020000 -B:hapmap,VCF " + b37hapmapGenotypes + " -B:variant,VCF " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("a0ae016fdffcbe7bfb99fd3dbc311407") + ); + + executeTest("testConcordance--" + testFile, spec); + } + }