SelectVariants: added parameters for mendelian violation. Given a trio vcf, it will generate a VCF with the sites that are mendelian violations.

GenotypeAndValidate: now annotates the validations with callStatus.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5425 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-11 17:47:53 +00:00
parent b03055099a
commit 4a84a81d17
2 changed files with 45 additions and 19 deletions

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.MendelianViolation;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFHeader;
@ -40,15 +41,9 @@ import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.omg.PortableInterceptor.DISCARDING;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
/**
* Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria,
@ -75,6 +70,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false)
private String discordanceRodName = "";
@Argument(fullName="family", shortName="fam", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = "";
@Argument(fullName="mendelian_violation_quality", shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
private ArrayList<String> selectNames = new ArrayList<String>();
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
@ -82,6 +84,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean DISCORDANCE_ONLY = false;
private MendelianViolation mv;
private boolean MENDELIAN_VIOLATIONS = false;
/**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
*/
@ -93,13 +99,19 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
samples = SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS);
for (String sample : samples) {
logger.info("Including sample '" + sample + "'");
}
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName);
if (!FAMILY_STRUCTURE.isEmpty()) {
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
MENDELIAN_VIOLATIONS = true;
}
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@ -145,25 +157,31 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
}
}
return 0;
}
if ( vcs == null || vcs.size() == 0) {
return 0;
}
for (VariantContext vc : vcs) {
VariantContext sub = subsetRecord(vc, samples);
if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
//System.out.printf("%s%n",sub.toString());
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
if ( !VariantContextUtils.match(sub, jexl) ) {
return 0;
}
if (MENDELIAN_VIOLATIONS) {
if (mv.isViolation(vc)) {
vcfWriter.add(vc, ref.getBase());
}
}
vcfWriter.add(sub, ref.getBase());
else {
VariantContext sub = subsetRecord(vc, samples);
if ( (sub.isPolymorphic() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
//System.out.printf("%s%n",sub.toString());
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
if ( !VariantContextUtils.match(sub, jexl) ) {
return 0;
}
}
vcfWriter.add(sub, ref.getBase());
}
}
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.oneoffprojects.walkers;
import com.google.common.collect.ImmutableSet;
import org.broad.tribble.util.variantcontext.MutableVariantContext;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
@ -45,6 +46,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import sun.management.counter.Variability;
import java.util.*;
@ -199,7 +201,13 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
else
counter.numFP = 1L;
}
vcfWriter.add(vcComp, ref.getBase());
if (!vcComp.hasAttribute("callStatus")) {
MutableVariantContext mvc = new MutableVariantContext(vcComp);
mvc.putAttribute("callStatus", call.confidentlyCalled ? "confident" : "notConfident" );
vcfWriter.add(mvc, ref.getBase());
}
else
vcfWriter.add(vcComp, ref.getBase());
return counter;
}
@ -224,7 +232,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
logger.info("FP = " + reduceSum.numFP);
logger.info("FN = " + reduceSum.numFN);
logger.info("PPV = " + ((double) reduceSum.numTP /( reduceSum.numTP + reduceSum.numFP)));
logger.info("FPV = " + ((double) reduceSum.numTN /( reduceSum.numTN + reduceSum.numFN)));
logger.info("NPV = " + ((double) reduceSum.numTN /( reduceSum.numTN + reduceSum.numFN)));
logger.info("Uncovered = " + reduceSum.numUncovered);
logger.info("confidently called = " + reduceSum.numConfidentCalls);
logger.info("not confidently called = " + reduceSum.numNotConfidentCalls );