YAML integrated mendelian violation utility class, integrated and tested through select variants. Wiki is updated.
ps: I moved it out of tribble. If you think it should reside in a different place, just yell at me. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5436 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
42f70d9e07
commit
33c7593218
|
|
@ -25,7 +25,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.MendelianViolation;
|
||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
|
|
@ -70,9 +70,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false)
|
||||
private String discordanceRodName = "";
|
||||
|
||||
@Argument(fullName="family", shortName="fam", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
@Deprecated
|
||||
@Argument(fullName="family", shortName="fam", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
private String FAMILY_STRUCTURE = "";
|
||||
|
||||
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false)
|
||||
private Boolean MENDELIAN_VIOLATIONS = false;
|
||||
|
||||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||
|
||||
|
|
@ -85,7 +89,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
private boolean DISCORDANCE_ONLY = false;
|
||||
|
||||
private MendelianViolation mv;
|
||||
private boolean MENDELIAN_VIOLATIONS = false;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -107,10 +110,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
|||
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
|
||||
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName);
|
||||
|
||||
if (!FAMILY_STRUCTURE.isEmpty()) {
|
||||
if (MENDELIAN_VIOLATIONS)
|
||||
mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
||||
else if (!FAMILY_STRUCTURE.isEmpty())
|
||||
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
||||
MENDELIAN_VIOLATIONS = true;
|
||||
}
|
||||
|
||||
// Initialize VCF header
|
||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,148 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 3/9/11
|
||||
* Time: 12:38 PM
|
||||
*/
|
||||
public class MendelianViolation {
|
||||
|
||||
String sampleMom;
|
||||
String sampleDad;
|
||||
String sampleChild;
|
||||
|
||||
List allelesMom;
|
||||
List allelesDad;
|
||||
List allelesChild;
|
||||
|
||||
double minGenotypeQuality;
|
||||
|
||||
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
|
||||
|
||||
/**
|
||||
*
|
||||
* @param sampleMomP - sample name of mom
|
||||
* @param sampleDadP - sample name of dad
|
||||
* @param sampleChildP - sample name of child
|
||||
*/
|
||||
public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) {
|
||||
sampleMom = sampleMomP;
|
||||
sampleDad = sampleDadP;
|
||||
sampleChild = sampleChildP;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param family - the sample names string "mom+dad=child"
|
||||
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
|
||||
*/
|
||||
public MendelianViolation(String family, double minGenotypeQualityP) {
|
||||
minGenotypeQuality = minGenotypeQualityP;
|
||||
|
||||
Matcher m = FAMILY_PATTERN.matcher(family);
|
||||
if (m.matches()) {
|
||||
sampleMom = m.group(1);
|
||||
sampleDad = m.group(2);
|
||||
sampleChild = m.group(3);
|
||||
}
|
||||
else
|
||||
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
|
||||
}
|
||||
|
||||
/**
|
||||
* An alternative to the more general constructor if you want to get the Sample information from the engine yourself.
|
||||
* @param sample - the sample object extracted from the sample metadata YAML file given to the engine.
|
||||
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
|
||||
*/
|
||||
public MendelianViolation(Sample sample, double minGenotypeQualityP) {
|
||||
sampleMom = sample.getMother().getId();
|
||||
sampleDad = sample.getFather().getId();
|
||||
sampleChild = sample.getId();
|
||||
minGenotypeQuality = minGenotypeQualityP;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option.
|
||||
* @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored.
|
||||
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
|
||||
*/
|
||||
public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) {
|
||||
boolean gotSampleInformation = false;
|
||||
Collection<Sample> samples = engine.getSamples();
|
||||
// Iterate through all samples in the sample_metadata file but we really can only take one.
|
||||
for (Sample sample : samples) {
|
||||
if (sample.getMother() != null && sample.getFather() != null) {
|
||||
sampleMom = sample.getMother().getId();
|
||||
sampleDad = sample.getFather().getId();
|
||||
sampleChild = sample.getId();
|
||||
minGenotypeQuality = minGenotypeQualityP;
|
||||
gotSampleInformation = true;
|
||||
break; // we can only deal with one trio information
|
||||
}
|
||||
}
|
||||
if (!gotSampleInformation)
|
||||
throw new UserException("YAML file has no sample with relationship information (mother/father)");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param vc - the variant context to extract the genotypes and alleles for mom, dad and child.
|
||||
* @return false if couldn't find the genotypes or context has empty alleles. True otherwise.
|
||||
*/
|
||||
private boolean setAlleles (VariantContext vc)
|
||||
{
|
||||
Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom);
|
||||
Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad);
|
||||
Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild);
|
||||
|
||||
if (gMom == null || gDad == null || gChild == null)
|
||||
throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild));
|
||||
|
||||
if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() ||
|
||||
gMom.getPhredScaledQual() < minGenotypeQuality ||
|
||||
gDad.getPhredScaledQual() < minGenotypeQuality ||
|
||||
gChild.getPhredScaledQual() < minGenotypeQuality ) {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
allelesMom = gMom.getAlleles();
|
||||
allelesDad = gDad.getAlleles();
|
||||
allelesChild = gChild.getAlleles();
|
||||
return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param vc the variant context to extract the genotypes and alleles for mom, dad and child.
|
||||
* @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
|
||||
*
|
||||
*/
|
||||
public boolean isViolation (VariantContext vc)
|
||||
{
|
||||
return setAlleles(vc) && isViolation();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return whether or not there is a mendelian violation at the site.
|
||||
*/
|
||||
private boolean isViolation() {
|
||||
if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) ||
|
||||
allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0)))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue