YAML integrated mendelian violation utility class, integrated and tested through select variants. Wiki is updated.
ps: I moved it out of tribble. If you think it should reside in a different place, just yell at me. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5436 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
42f70d9e07
commit
33c7593218
|
|
@ -25,7 +25,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
import org.broad.tribble.util.variantcontext.Genotype;
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
import org.broad.tribble.util.variantcontext.MendelianViolation;
|
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broad.tribble.vcf.VCFConstants;
|
import org.broad.tribble.vcf.VCFConstants;
|
||||||
import org.broad.tribble.vcf.VCFHeader;
|
import org.broad.tribble.vcf.VCFHeader;
|
||||||
|
|
@ -70,9 +70,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false)
|
@Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false)
|
||||||
private String discordanceRodName = "";
|
private String discordanceRodName = "";
|
||||||
|
|
||||||
@Argument(fullName="family", shortName="fam", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
@Deprecated
|
||||||
|
@Argument(fullName="family", shortName="fam", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||||
private String FAMILY_STRUCTURE = "";
|
private String FAMILY_STRUCTURE = "";
|
||||||
|
|
||||||
|
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false)
|
||||||
|
private Boolean MENDELIAN_VIOLATIONS = false;
|
||||||
|
|
||||||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||||
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||||
|
|
||||||
|
|
@ -85,7 +89,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
private boolean DISCORDANCE_ONLY = false;
|
private boolean DISCORDANCE_ONLY = false;
|
||||||
|
|
||||||
private MendelianViolation mv;
|
private MendelianViolation mv;
|
||||||
private boolean MENDELIAN_VIOLATIONS = false;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -107,10 +110,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
|
||||||
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
|
DISCORDANCE_ONLY = discordanceRodName.length() > 0;
|
||||||
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName);
|
if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName);
|
||||||
|
|
||||||
if (!FAMILY_STRUCTURE.isEmpty()) {
|
if (MENDELIAN_VIOLATIONS)
|
||||||
|
mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
||||||
|
else if (!FAMILY_STRUCTURE.isEmpty())
|
||||||
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD);
|
||||||
MENDELIAN_VIOLATIONS = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize VCF header
|
// Initialize VCF header
|
||||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,148 @@
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User: carneiro
|
||||||
|
* Date: 3/9/11
|
||||||
|
* Time: 12:38 PM
|
||||||
|
*/
|
||||||
|
public class MendelianViolation {
|
||||||
|
|
||||||
|
String sampleMom;
|
||||||
|
String sampleDad;
|
||||||
|
String sampleChild;
|
||||||
|
|
||||||
|
List allelesMom;
|
||||||
|
List allelesDad;
|
||||||
|
List allelesChild;
|
||||||
|
|
||||||
|
double minGenotypeQuality;
|
||||||
|
|
||||||
|
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param sampleMomP - sample name of mom
|
||||||
|
* @param sampleDadP - sample name of dad
|
||||||
|
* @param sampleChildP - sample name of child
|
||||||
|
*/
|
||||||
|
public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) {
|
||||||
|
sampleMom = sampleMomP;
|
||||||
|
sampleDad = sampleDadP;
|
||||||
|
sampleChild = sampleChildP;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param family - the sample names string "mom+dad=child"
|
||||||
|
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
|
||||||
|
*/
|
||||||
|
public MendelianViolation(String family, double minGenotypeQualityP) {
|
||||||
|
minGenotypeQuality = minGenotypeQualityP;
|
||||||
|
|
||||||
|
Matcher m = FAMILY_PATTERN.matcher(family);
|
||||||
|
if (m.matches()) {
|
||||||
|
sampleMom = m.group(1);
|
||||||
|
sampleDad = m.group(2);
|
||||||
|
sampleChild = m.group(3);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An alternative to the more general constructor if you want to get the Sample information from the engine yourself.
|
||||||
|
* @param sample - the sample object extracted from the sample metadata YAML file given to the engine.
|
||||||
|
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
|
||||||
|
*/
|
||||||
|
public MendelianViolation(Sample sample, double minGenotypeQualityP) {
|
||||||
|
sampleMom = sample.getMother().getId();
|
||||||
|
sampleDad = sample.getFather().getId();
|
||||||
|
sampleChild = sample.getId();
|
||||||
|
minGenotypeQuality = minGenotypeQualityP;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option.
|
||||||
|
* @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored.
|
||||||
|
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
|
||||||
|
*/
|
||||||
|
public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) {
|
||||||
|
boolean gotSampleInformation = false;
|
||||||
|
Collection<Sample> samples = engine.getSamples();
|
||||||
|
// Iterate through all samples in the sample_metadata file but we really can only take one.
|
||||||
|
for (Sample sample : samples) {
|
||||||
|
if (sample.getMother() != null && sample.getFather() != null) {
|
||||||
|
sampleMom = sample.getMother().getId();
|
||||||
|
sampleDad = sample.getFather().getId();
|
||||||
|
sampleChild = sample.getId();
|
||||||
|
minGenotypeQuality = minGenotypeQualityP;
|
||||||
|
gotSampleInformation = true;
|
||||||
|
break; // we can only deal with one trio information
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!gotSampleInformation)
|
||||||
|
throw new UserException("YAML file has no sample with relationship information (mother/father)");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param vc - the variant context to extract the genotypes and alleles for mom, dad and child.
|
||||||
|
* @return false if couldn't find the genotypes or context has empty alleles. True otherwise.
|
||||||
|
*/
|
||||||
|
private boolean setAlleles (VariantContext vc)
|
||||||
|
{
|
||||||
|
Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom);
|
||||||
|
Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad);
|
||||||
|
Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild);
|
||||||
|
|
||||||
|
if (gMom == null || gDad == null || gChild == null)
|
||||||
|
throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild));
|
||||||
|
|
||||||
|
if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() ||
|
||||||
|
gMom.getPhredScaledQual() < minGenotypeQuality ||
|
||||||
|
gDad.getPhredScaledQual() < minGenotypeQuality ||
|
||||||
|
gChild.getPhredScaledQual() < minGenotypeQuality ) {
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
allelesMom = gMom.getAlleles();
|
||||||
|
allelesDad = gDad.getAlleles();
|
||||||
|
allelesChild = gChild.getAlleles();
|
||||||
|
return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param vc the variant context to extract the genotypes and alleles for mom, dad and child.
|
||||||
|
* @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public boolean isViolation (VariantContext vc)
|
||||||
|
{
|
||||||
|
return setAlleles(vc) && isViolation();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether or not there is a mendelian violation at the site.
|
||||||
|
*/
|
||||||
|
private boolean isViolation() {
|
||||||
|
if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) ||
|
||||||
|
allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0)))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue