SelectVariants: added parameters for mendelian violation. Given a trio vcf, it will generate a VCF with the sites that are mendelian violations.

GenotypeAndValidate: now annotates the validations with callStatus.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5425 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-11 17:47:53 +00:00
parent b03055099a
commit 4a84a81d17
2 changed files with 45 additions and 19 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils; package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.MendelianViolation;
import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants; import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeader;
@ -40,15 +41,9 @@ import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.omg.PortableInterceptor.DISCARDING;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
/** /**
* Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria, * Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria,
@ -75,6 +70,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false) @Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false)
private String discordanceRodName = ""; private String discordanceRodName = "";
@Argument(fullName="family", shortName="fam", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = "";
@Argument(fullName="mendelian_violation_quality", shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
private ArrayList<String> selectNames = new ArrayList<String>(); private ArrayList<String> selectNames = new ArrayList<String>();
private List<VariantContextUtils.JexlVCMatchExp> jexls = null; private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
@ -82,6 +84,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean DISCORDANCE_ONLY = false; private boolean DISCORDANCE_ONLY = false;
private MendelianViolation mv;
private boolean MENDELIAN_VIOLATIONS = false;
/** /**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
*/ */
@ -93,13 +99,19 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS); samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS);
for (String sample : samples) { for (String sample : samples) {
logger.info("Including sample '" + sample + "'"); logger.info("Including sample '" + sample + "'");
} }
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName); if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName);
if (!FAMILY_STRUCTURE.isEmpty()) {
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
MENDELIAN_VIOLATIONS = true;
}
// Initialize VCF header // Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants")); headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@ -145,25 +157,31 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
} }
} }
} }
return 0;
} }
if ( vcs == null || vcs.size() == 0) { if ( vcs == null || vcs.size() == 0) {
return 0; return 0;
} }
for (VariantContext vc : vcs) { for (VariantContext vc : vcs) {
VariantContext sub = subsetRecord(vc, samples); if (MENDELIAN_VIOLATIONS) {
if (mv.isViolation(vc)) {
if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { vcfWriter.add(vc, ref.getBase());
//System.out.printf("%s%n",sub.toString());
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
if ( !VariantContextUtils.match(sub, jexl) ) {
return 0;
}
} }
}
vcfWriter.add(sub, ref.getBase()); else {
VariantContext sub = subsetRecord(vc, samples);
if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
//System.out.printf("%s%n",sub.toString());
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
if ( !VariantContextUtils.match(sub, jexl) ) {
return 0;
}
}
vcfWriter.add(sub, ref.getBase());
}
} }
} }

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.oneoffprojects.walkers; package org.broadinstitute.sting.oneoffprojects.walkers;
import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSet;
import org.broad.tribble.util.variantcontext.MutableVariantContext;
import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine; import org.broad.tribble.vcf.VCFHeaderLine;
@ -45,6 +46,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.vcf.VCFUtils;
import sun.management.counter.Variability;
import java.util.*; import java.util.*;
@ -199,7 +201,13 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
else else
counter.numFP = 1L; counter.numFP = 1L;
} }
vcfWriter.add(vcComp, ref.getBase()); if (!vcComp.hasAttribute("callStatus")) {
MutableVariantContext mvc = new MutableVariantContext(vcComp);
mvc.putAttribute("callStatus", call.confidentlyCalled ? "confident" : "notConfident" );
vcfWriter.add(mvc, ref.getBase());
}
else
vcfWriter.add(vcComp, ref.getBase());
return counter; return counter;
} }
@ -224,7 +232,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
logger.info("FP = " + reduceSum.numFP); logger.info("FP = " + reduceSum.numFP);
logger.info("FN = " + reduceSum.numFN); logger.info("FN = " + reduceSum.numFN);
logger.info("PPV = " + ((double) reduceSum.numTP /( reduceSum.numTP + reduceSum.numFP))); logger.info("PPV = " + ((double) reduceSum.numTP /( reduceSum.numTP + reduceSum.numFP)));
logger.info("FPV = " + ((double) reduceSum.numTN /( reduceSum.numTN + reduceSum.numFN))); logger.info("NPV = " + ((double) reduceSum.numTN /( reduceSum.numTN + reduceSum.numFN)));
logger.info("Uncovered = " + reduceSum.numUncovered); logger.info("Uncovered = " + reduceSum.numUncovered);
logger.info("confidently called = " + reduceSum.numConfidentCalls); logger.info("confidently called = " + reduceSum.numConfidentCalls);
logger.info("not confidently called = " + reduceSum.numNotConfidentCalls ); logger.info("not confidently called = " + reduceSum.numNotConfidentCalls );