Many updates to SelectVariants :
1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work) 1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1) 2) Discordance and Concordance now work in combination with all other parameters. 3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation. 4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information. 5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample. 6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree. --- Integration tests: 1) Discordance and concordance test added 2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples. --- Methods for handling sample expressions and files with list of samples were added to SampleUtils. I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
48055d45cb
commit
91fb664135
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
|
|
@ -44,6 +45,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -56,8 +58,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter vcfWriter = null;
|
||||
|
||||
@Argument(fullName="sample", shortName="sn", doc="Sample(s) to include. Can be a single sample, specified multiple times for many samples, a file containing sample names, a regular expression to select many samples, or any combination thereof.", required=false)
|
||||
public Set<String> SAMPLE_EXPRESSIONS;
|
||||
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
|
||||
public Set<String> sampleNames;
|
||||
|
||||
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times.", required=false)
|
||||
public Set<String> sampleExpressions;
|
||||
|
||||
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
|
||||
public Set<File> sampleFiles;
|
||||
|
||||
@Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false)
|
||||
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
|
||||
|
|
@ -65,10 +73,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
|
||||
private boolean EXCLUDE_NON_VARIANTS = false;
|
||||
|
||||
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci.", required=false)
|
||||
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false)
|
||||
private boolean EXCLUDE_FILTERED = false;
|
||||
|
||||
|
||||
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false)
|
||||
private String discordanceRodName = "";
|
||||
|
||||
|
|
@ -117,7 +124,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
private ArrayList<String> selectNames = new ArrayList<String>();
|
||||
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
|
||||
|
||||
private Set<String> samples = new HashSet<String>();
|
||||
private TreeSet<String> samples = new TreeSet<String>();
|
||||
private boolean NO_SAMPLES_SPECIFIED = false;
|
||||
|
||||
private boolean DISCORDANCE_ONLY = false;
|
||||
private boolean CONCORDANCE_ONLY = false;
|
||||
|
|
@ -149,11 +157,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
rodNames.add(variantRodName);
|
||||
|
||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
|
||||
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
|
||||
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
|
||||
|
||||
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
|
||||
Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
|
||||
|
||||
samples.addAll(samplesFromFile);
|
||||
samples.addAll(samplesFromExpressions);
|
||||
samples.addAll(sampleNames);
|
||||
|
||||
if(samples.isEmpty()) {
|
||||
samples.addAll(vcfSamples);
|
||||
NO_SAMPLES_SPECIFIED = true;
|
||||
}
|
||||
|
||||
// TODO -- This causes the sample ordering in the VCF to be "random" (i.e. whatever order they are pulled from the HashSet).
|
||||
// TODO -- We should either enforce alphabetical ordering here (as with other walkers) or base it on the order from the command-line - but we shouldn't leave this up to Java to decide.
|
||||
samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS);
|
||||
for (String sample : samples) {
|
||||
logger.info("Including sample '" + sample + "'");
|
||||
}
|
||||
|
|
@ -192,7 +209,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting " + fractionRandom + " variants at random from the variant track");
|
||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -222,13 +239,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
if (DISCORDANCE_ONLY) {
|
||||
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false);
|
||||
if (!compVCs.isEmpty())
|
||||
return 0; // return is correct: no matter what vcs has, if compVC is not empty there's nothing more to do at this site
|
||||
if (!isDiscordant(vc, compVCs))
|
||||
return 0;
|
||||
}
|
||||
if (CONCORDANCE_ONLY) {
|
||||
Collection<VariantContext> compVCs = tracker.getVariantContexts(ref, concordanceRodName, null, context.getLocation(), true, false);
|
||||
if (compVCs.isEmpty())
|
||||
return 0; // return is correct: no matter what vcs has, if compVC is empty there's nothing more to do at this site
|
||||
if (!isConcordant(vc, compVCs))
|
||||
return 0;
|
||||
}
|
||||
|
||||
// TODO - add ability to also select MNPs
|
||||
|
|
@ -241,7 +258,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
VariantContext sub = subsetRecord(vc, samples);
|
||||
if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
|
||||
//System.out.printf("%s%n",sub.toString());
|
||||
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
|
||||
if ( !VariantContextUtils.match(sub, jexl) ) {
|
||||
return 0;
|
||||
|
|
@ -260,7 +276,88 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if vc has a variant call for (at least one of) the samples.
|
||||
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
|
||||
* @param compVCs the comparison VariantContext (discordance
|
||||
* @return
|
||||
*/
|
||||
private boolean isDiscordant (VariantContext vc, Collection<VariantContext> compVCs) {
|
||||
if (vc == null)
|
||||
return false;
|
||||
|
||||
// if we're not looking at specific samples then the absense of a compVC means discordance
|
||||
if (NO_SAMPLES_SPECIFIED && (compVCs == null || compVCs.isEmpty()))
|
||||
return true;
|
||||
|
||||
// check if we find it in the variant rod
|
||||
Map<String, Genotype> genotypes = vc.getGenotypes(samples);
|
||||
for (Genotype g : genotypes.values()) {
|
||||
if (sampleHasVariant(g)) {
|
||||
// There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples.
|
||||
if (compVCs == null)
|
||||
return true;
|
||||
// Look for this sample in the all vcs of the comp ROD track.
|
||||
boolean foundVariant = false;
|
||||
for (VariantContext compVC : compVCs) {
|
||||
if (sampleHasVariant(compVC.getGenotype(g.getSampleName()))) {
|
||||
foundVariant = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// if (at least one sample) was not found in all VCs of the comp ROD, we have discordance
|
||||
if (!foundVariant)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false; // we only get here if all samples have a variant in the comp rod.
|
||||
}
|
||||
|
||||
private boolean isConcordant (VariantContext vc, Collection<VariantContext> compVCs) {
|
||||
if (vc == null || compVCs == null || compVCs.isEmpty())
|
||||
return false;
|
||||
|
||||
// if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant.
|
||||
if (NO_SAMPLES_SPECIFIED)
|
||||
return true;
|
||||
|
||||
// make a list of all samples contained in this variant VC that are being tracked by the user command line arguments.
|
||||
Set<String> variantSamples = vc.getSampleNames();
|
||||
variantSamples.retainAll(samples);
|
||||
|
||||
// check if we can find all samples from the variant rod in the comp rod.
|
||||
for (String sample : variantSamples) {
|
||||
boolean foundSample = false;
|
||||
for (VariantContext compVC : compVCs) {
|
||||
Genotype varG = vc.getGenotype(sample);
|
||||
Genotype compG = compVC.getGenotype(sample);
|
||||
if (haveSameGenotypes(varG, compG)) {
|
||||
foundSample = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// if at least one sample doesn't have the same genotype, we don't have concordance
|
||||
if (!foundSample) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean sampleHasVariant(Genotype g) {
|
||||
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED)));
|
||||
}
|
||||
|
||||
private boolean haveSameGenotypes(Genotype g1, Genotype g2) {
|
||||
if ((g1.isCalled() && g2.isFiltered()) ||
|
||||
(g2.isCalled() && g1.isFiltered()) ||
|
||||
(g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
|
||||
return false;
|
||||
|
||||
List<Allele> a1s = g1.getAlleles();
|
||||
List<Allele> a2s = g2.getAlleles();
|
||||
return (a1s.containsAll(a2s) && a2s.containsAll(a1s));
|
||||
}
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
|
|
|
|||
|
|
@ -270,4 +270,51 @@ public class SampleUtils {
|
|||
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression
|
||||
* @param originalSamples list of samples to select samples from
|
||||
* @param sampleExpressions list of expressions to use for matching samples
|
||||
* @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions
|
||||
*/
|
||||
public static Collection<String> matchSamplesExpressions (Collection<String> originalSamples, Collection<String> sampleExpressions) {
|
||||
// Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions
|
||||
Set<String> samples = new HashSet<String>();
|
||||
if (sampleExpressions != null) {
|
||||
for (String expression : sampleExpressions) {
|
||||
Pattern p = Pattern.compile(expression);
|
||||
|
||||
for (String originalSample : originalSamples) {
|
||||
Matcher m = p.matcher(originalSample);
|
||||
if (m.find()) {
|
||||
samples.add(originalSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a list of files with sample names it reads all files and creates a list of unique samples from all these files.
|
||||
* @param files list of files with sample names in
|
||||
* @return a collection of unique samples from all files
|
||||
*/
|
||||
public static Collection<String> getSamplesFromFiles (Collection<File> files) {
|
||||
Set<String> samplesFromFiles = new HashSet<String>();
|
||||
if (files != null) {
|
||||
for (File file : files) {
|
||||
try {
|
||||
XReadLines reader = new XReadLines(file);
|
||||
List<String> lines = reader.readLines();
|
||||
for (String line : lines) {
|
||||
samplesFromFiles.add(line);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
// ignore exception
|
||||
}
|
||||
}
|
||||
}
|
||||
return samplesFromFiles;
|
||||
}
|
||||
}
|
||||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.commandline.CommandLineUtils;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.testng.Assert;
|
||||
|
||||
import javax.swing.*;
|
||||
import java.io.*;
|
||||
import java.math.BigInteger;
|
||||
import java.security.MessageDigest;
|
||||
|
|
@ -66,6 +67,10 @@ public abstract class BaseTest {
|
|||
public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.rod";
|
||||
public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf";
|
||||
|
||||
public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/";
|
||||
public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf";
|
||||
public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf";
|
||||
|
||||
public static final String intervalsLocation = GATKDataLocation;
|
||||
public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list";
|
||||
public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list";
|
||||
|
|
|
|||
|
|
@ -16,9 +16,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(" -sn A -sn '[CDH]' -sn " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"),
|
||||
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' -B:variant,VCF3 " + testfile + " -NO_HEADER"),
|
||||
1,
|
||||
Arrays.asList("1df84e2b755cce19f1876710ec38dd2c")
|
||||
Arrays.asList("1b9d551298dc048c7d36b60440ff4d50")
|
||||
);
|
||||
|
||||
executeTest("testComplexSelection--" + testfile, spec);
|
||||
|
|
@ -36,4 +36,31 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
|
||||
executeTest("testRepeatedLineSelection--" + testfile, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDiscordance() {
|
||||
String testFile = validationDataLocation + "NA12878.hg19.example1.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -disc myvar -L 20:1012700-1020000 -B:variant,VCF " + b37hapmapGenotypes + " -B:myvar,VCF " + testFile + " -o %s -NO_HEADER",
|
||||
1,
|
||||
Arrays.asList("97621ae8f29955eedfc4e0be3515fcb9")
|
||||
);
|
||||
|
||||
executeTest("testDiscordance--" + testFile, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConcordance() {
|
||||
String testFile = validationDataLocation + "NA12878.hg19.example1.vcf";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -conc hapmap -L 20:1012700-1020000 -B:hapmap,VCF " + b37hapmapGenotypes + " -B:variant,VCF " + testFile + " -o %s -NO_HEADER",
|
||||
1,
|
||||
Arrays.asList("a0ae016fdffcbe7bfb99fd3dbc311407")
|
||||
);
|
||||
|
||||
executeTest("testConcordance--" + testFile, spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue