YAML integrated mendelian violation utility class, integrated and tested through select variants. Wiki is updated.

ps: I moved it out of tribble. If you think it should reside in a different place, just yell at me.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5436 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-03-14 16:43:37 +00:00
parent 42f70d9e07
commit 33c7593218
2 changed files with 157 additions and 6 deletions

View File

@ -25,7 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.MendelianViolation;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFHeader;
@ -70,9 +70,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false)
private String discordanceRodName = "";
@Argument(fullName="family", shortName="fam", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@Deprecated
@Argument(fullName="family", shortName="fam", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String FAMILY_STRUCTURE = "";
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
@ -85,7 +89,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private boolean DISCORDANCE_ONLY = false;
private MendelianViolation mv;
private boolean MENDELIAN_VIOLATIONS = false;
/**
@ -107,10 +110,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName);
if (!FAMILY_STRUCTURE.isEmpty()) {
if (MENDELIAN_VIOLATIONS)
mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD);
else if (!FAMILY_STRUCTURE.isEmpty())
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
MENDELIAN_VIOLATIONS = true;
}
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);

View File

@ -0,0 +1,148 @@
package org.broadinstitute.sting.utils;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* User: carneiro
* Date: 3/9/11
* Time: 12:38 PM
*/
public class MendelianViolation {
String sampleMom;
String sampleDad;
String sampleChild;
List allelesMom;
List allelesDad;
List allelesChild;
double minGenotypeQuality;
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
/**
*
* @param sampleMomP - sample name of mom
* @param sampleDadP - sample name of dad
* @param sampleChildP - sample name of child
*/
public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) {
sampleMom = sampleMomP;
sampleDad = sampleDadP;
sampleChild = sampleChildP;
}
/**
*
* @param family - the sample names string "mom+dad=child"
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(String family, double minGenotypeQualityP) {
minGenotypeQuality = minGenotypeQualityP;
Matcher m = FAMILY_PATTERN.matcher(family);
if (m.matches()) {
sampleMom = m.group(1);
sampleDad = m.group(2);
sampleChild = m.group(3);
}
else
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
}
/**
* An alternative to the more general constructor if you want to get the Sample information from the engine yourself.
* @param sample - the sample object extracted from the sample metadata YAML file given to the engine.
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(Sample sample, double minGenotypeQualityP) {
sampleMom = sample.getMother().getId();
sampleDad = sample.getFather().getId();
sampleChild = sample.getId();
minGenotypeQuality = minGenotypeQualityP;
}
/**
* The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option.
* @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored.
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) {
boolean gotSampleInformation = false;
Collection<Sample> samples = engine.getSamples();
// Iterate through all samples in the sample_metadata file but we really can only take one.
for (Sample sample : samples) {
if (sample.getMother() != null && sample.getFather() != null) {
sampleMom = sample.getMother().getId();
sampleDad = sample.getFather().getId();
sampleChild = sample.getId();
minGenotypeQuality = minGenotypeQualityP;
gotSampleInformation = true;
break; // we can only deal with one trio information
}
}
if (!gotSampleInformation)
throw new UserException("YAML file has no sample with relationship information (mother/father)");
}
/**
* @param vc - the variant context to extract the genotypes and alleles for mom, dad and child.
* @return false if couldn't find the genotypes or context has empty alleles. True otherwise.
*/
private boolean setAlleles (VariantContext vc)
{
Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom);
Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad);
Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild);
if (gMom == null || gDad == null || gChild == null)
throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild));
if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() ||
gMom.getPhredScaledQual() < minGenotypeQuality ||
gDad.getPhredScaledQual() < minGenotypeQuality ||
gChild.getPhredScaledQual() < minGenotypeQuality ) {
return false;
}
allelesMom = gMom.getAlleles();
allelesDad = gDad.getAlleles();
allelesChild = gChild.getAlleles();
return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty();
}
/**
*
* @param vc the variant context to extract the genotypes and alleles for mom, dad and child.
* @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
*
*/
public boolean isViolation (VariantContext vc)
{
return setAlleles(vc) && isViolation();
}
/**
* @return whether or not there is a mendelian violation at the site.
*/
private boolean isViolation() {
if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) ||
allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0)))
return false;
return true;
}
}