diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 316b98424..08f55b533 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; @@ -44,6 +45,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.vcf.VCFUtils; +import java.io.File; import java.util.*; /** @@ -56,8 +58,14 @@ public class SelectVariants extends RodWalker { @Output(doc="File to which variants should be written",required=true) protected VCFWriter vcfWriter = null; - @Argument(fullName="sample", shortName="sn", doc="Sample(s) to include. Can be a single sample, specified multiple times for many samples, a file containing sample names, a regular expression to select many samples, or any combination thereof.", required=false) - public Set SAMPLE_EXPRESSIONS; + @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set sampleNames; + + @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times.", required=false) + public Set sampleExpressions; + + @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) + public Set sampleFiles; @Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false) public ArrayList SELECT_EXPRESSIONS = new ArrayList(); @@ -65,10 +73,9 @@ public class SelectVariants extends RodWalker { @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) private boolean EXCLUDE_NON_VARIANTS = false; - @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci.", required=false) + @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false) private boolean EXCLUDE_FILTERED = false; - @Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false) private String discordanceRodName = ""; @@ -117,7 +124,8 @@ public class SelectVariants extends RodWalker { private ArrayList selectNames = new ArrayList(); private List jexls = null; - private Set samples = new HashSet(); + private TreeSet samples = new TreeSet(); + private boolean NO_SAMPLES_SPECIFIED = false; private boolean DISCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false; @@ -149,11 +157,20 @@ public class SelectVariants extends RodWalker { rodNames.add(variantRodName); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); - Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); + Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); + + samples.addAll(samplesFromFile); + samples.addAll(samplesFromExpressions); + samples.addAll(sampleNames); + + if(samples.isEmpty()) { + samples.addAll(vcfSamples); + NO_SAMPLES_SPECIFIED = true; + } - // TODO -- This causes the sample ordering in the VCF to be "random" (i.e. whatever order they are pulled from the HashSet). - // TODO -- We should either enforce alphabetical ordering here (as with other walkers) or base it on the order from the command-line - but we shouldn't leave this up to Java to decide. - samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS); for (String sample : samples) { logger.info("Including sample '" + sample + "'"); } @@ -192,7 +209,7 @@ public class SelectVariants extends RodWalker { } SELECT_RANDOM_FRACTION = fractionRandom > 0; - if (SELECT_RANDOM_FRACTION) logger.info("Selecting " + fractionRandom + " variants at random from the variant track"); + if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track"); } /** @@ -222,13 +239,13 @@ public class SelectVariants extends RodWalker { } if (DISCORDANCE_ONLY) { Collection compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false); - if (!compVCs.isEmpty()) - return 0; // return is correct: no matter what vcs has, if compVC is not empty there's nothing more to do at this site + if (!isDiscordant(vc, compVCs)) + return 0; } if (CONCORDANCE_ONLY) { Collection compVCs = tracker.getVariantContexts(ref, concordanceRodName, null, context.getLocation(), true, false); - if (compVCs.isEmpty()) - return 0; // return is correct: no matter what vcs has, if compVC is empty there's nothing more to do at this site + if (!isConcordant(vc, compVCs)) + return 0; } // TODO - add ability to also select MNPs @@ -241,7 +258,6 @@ public class SelectVariants extends RodWalker { VariantContext sub = subsetRecord(vc, samples); if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { - //System.out.printf("%s%n",sub.toString()); for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { return 0; @@ -260,7 +276,88 @@ public class SelectVariants extends RodWalker { return 1; } + /** + * Checks if vc has a variant call for (at least one of) the samples. + * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap) + * @param compVCs the comparison VariantContext (discordance + * @return + */ + private boolean isDiscordant (VariantContext vc, Collection compVCs) { + if (vc == null) + return false; + // if we're not looking at specific samples then the absense of a compVC means discordance + if (NO_SAMPLES_SPECIFIED && (compVCs == null || compVCs.isEmpty())) + return true; + + // check if we find it in the variant rod + Map genotypes = vc.getGenotypes(samples); + for (Genotype g : genotypes.values()) { + if (sampleHasVariant(g)) { + // There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples. + if (compVCs == null) + return true; + // Look for this sample in the all vcs of the comp ROD track. + boolean foundVariant = false; + for (VariantContext compVC : compVCs) { + if (sampleHasVariant(compVC.getGenotype(g.getSampleName()))) { + foundVariant = true; + break; + } + } + // if (at least one sample) was not found in all VCs of the comp ROD, we have discordance + if (!foundVariant) + return true; + } + } + return false; // we only get here if all samples have a variant in the comp rod. + } + + private boolean isConcordant (VariantContext vc, Collection compVCs) { + if (vc == null || compVCs == null || compVCs.isEmpty()) + return false; + + // if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant. + if (NO_SAMPLES_SPECIFIED) + return true; + + // make a list of all samples contained in this variant VC that are being tracked by the user command line arguments. + Set variantSamples = vc.getSampleNames(); + variantSamples.retainAll(samples); + + // check if we can find all samples from the variant rod in the comp rod. + for (String sample : variantSamples) { + boolean foundSample = false; + for (VariantContext compVC : compVCs) { + Genotype varG = vc.getGenotype(sample); + Genotype compG = compVC.getGenotype(sample); + if (haveSameGenotypes(varG, compG)) { + foundSample = true; + break; + } + } + // if at least one sample doesn't have the same genotype, we don't have concordance + if (!foundSample) { + return false; + } + } + return true; + } + + private boolean sampleHasVariant(Genotype g) { + return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED))); + } + + private boolean haveSameGenotypes(Genotype g1, Genotype g2) { + if ((g1.isCalled() && g2.isFiltered()) || + (g2.isCalled() && g1.isFiltered()) || + (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) + return false; + + List a1s = g1.getAlleles(); + List a2s = g2.getAlleles(); + return (a1s.containsAll(a2s) && a2s.containsAll(a1s)); + } @Override public Integer reduceInit() { return 0; } diff --git a/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 88187e525..d15eb8e86 100755 --- a/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -270,4 +270,51 @@ public class SampleUtils { return samples; } + + /** + * Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression + * @param originalSamples list of samples to select samples from + * @param sampleExpressions list of expressions to use for matching samples + * @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions + */ + public static Collection matchSamplesExpressions (Collection originalSamples, Collection sampleExpressions) { + // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions + Set samples = new HashSet(); + if (sampleExpressions != null) { + for (String expression : sampleExpressions) { + Pattern p = Pattern.compile(expression); + + for (String originalSample : originalSamples) { + Matcher m = p.matcher(originalSample); + if (m.find()) { + samples.add(originalSample); + } + } + } + } + return samples; + } + + /** + * Given a list of files with sample names it reads all files and creates a list of unique samples from all these files. + * @param files list of files with sample names in + * @return a collection of unique samples from all files + */ + public static Collection getSamplesFromFiles (Collection files) { + Set samplesFromFiles = new HashSet(); + if (files != null) { + for (File file : files) { + try { + XReadLines reader = new XReadLines(file); + List lines = reader.readLines(); + for (String line : lines) { + samplesFromFiles.add(line); + } + } catch (FileNotFoundException e) { + // ignore exception + } + } + } + return samplesFromFiles; + } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/java/test/org/broadinstitute/sting/BaseTest.java index 693a593f0..07f46c1c3 100755 --- a/java/test/org/broadinstitute/sting/BaseTest.java +++ b/java/test/org/broadinstitute/sting/BaseTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.commandline.CommandLineUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; +import javax.swing.*; import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; @@ -66,6 +67,10 @@ public abstract class BaseTest { public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.rod"; public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; + public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; + public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; + public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf"; + public static final String intervalsLocation = GATKDataLocation; public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 66a7b4d9b..e18287a21 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -16,9 +16,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"), 1, - Arrays.asList("1df84e2b755cce19f1876710ec38dd2c") + Arrays.asList("1b9d551298dc048c7d36b60440ff4d50") ); executeTest("testComplexSelection--" + testfile, spec); @@ -36,4 +36,31 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testRepeatedLineSelection--" + testfile, spec); } + + @Test + public void testDiscordance() { + String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -disc myvar -L 20:1012700-1020000 -B:variant,VCF " + b37hapmapGenotypes + " -B:myvar,VCF " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("97621ae8f29955eedfc4e0be3515fcb9") + ); + + executeTest("testDiscordance--" + testFile, spec); + } + + @Test + public void testConcordance() { + String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -conc hapmap -L 20:1012700-1020000 -B:hapmap,VCF " + b37hapmapGenotypes + " -B:variant,VCF " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("a0ae016fdffcbe7bfb99fd3dbc311407") + ); + + executeTest("testConcordance--" + testFile, spec); + } + }