gatk-3.8/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java

212 lines
7.9 KiB
Java
Raw Normal View History

package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* User: carneiro
* Date: 3/9/11
* Time: 12:38 PM
*/
public class MendelianViolation {
String sampleMom;
String sampleDad;
String sampleChild;
List allelesMom;
List allelesDad;
List allelesChild;
double minGenotypeQuality;
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 };
static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 };
public String getSampleMom() {
return sampleMom;
}
public String getSampleDad() {
return sampleDad;
}
public String getSampleChild() {
return sampleChild;
}
public double getMinGenotypeQuality() {
return minGenotypeQuality;
}
/**
*
* @param sampleMomP - sample name of mom
* @param sampleDadP - sample name of dad
* @param sampleChildP - sample name of child
*/
public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) {
sampleMom = sampleMomP;
sampleDad = sampleDadP;
sampleChild = sampleChildP;
}
/**
*
* @param family - the sample names string "mom+dad=child"
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(String family, double minGenotypeQualityP) {
minGenotypeQuality = minGenotypeQualityP;
Matcher m = FAMILY_PATTERN.matcher(family);
if (m.matches()) {
sampleMom = m.group(1);
sampleDad = m.group(2);
sampleChild = m.group(3);
}
else
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
}
/**
* An alternative to the more general constructor if you want to get the Sample information from the engine yourself.
* @param sample - the sample object extracted from the sample metadata YAML file given to the engine.
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(Sample sample, double minGenotypeQualityP) {
sampleMom = sample.getMother().getId();
sampleDad = sample.getFather().getId();
sampleChild = sample.getId();
minGenotypeQuality = minGenotypeQualityP;
}
/**
* The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option.
* @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored.
* @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
*/
public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) {
boolean gotSampleInformation = false;
Collection<Sample> samples = engine.getSamples();
// Iterate through all samples in the sample_metadata file but we really can only take one.
for (Sample sample : samples) {
if (sample.getMother() != null && sample.getFather() != null) {
sampleMom = sample.getMother().getId();
sampleDad = sample.getFather().getId();
sampleChild = sample.getId();
minGenotypeQuality = minGenotypeQualityP;
gotSampleInformation = true;
break; // we can only deal with one trio information
}
}
if (!gotSampleInformation)
throw new UserException("YAML file has no sample with relationship information (mother/father)");
}
/**
* This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to
* isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site
* before evaluating it for mendelian violation, you can call setAlleles and then isViolation().
* @param vc - the variant context to extract the genotypes and alleles for mom, dad and child.
* @return false if couldn't find the genotypes or context has empty alleles. True otherwise.
*/
public boolean setAlleles (VariantContext vc)
{
Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom);
Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad);
Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild);
if (gMom == null || gDad == null || gChild == null)
throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild));
if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() ||
gMom.getPhredScaledQual() < minGenotypeQuality ||
gDad.getPhredScaledQual() < minGenotypeQuality ||
gChild.getPhredScaledQual() < minGenotypeQuality ) {
return false;
}
allelesMom = gMom.getAlleles();
allelesDad = gDad.getAlleles();
allelesChild = gChild.getAlleles();
return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty();
}
/**
*
* @param vc the variant context to extract the genotypes and alleles for mom, dad and child.
* @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
*
*/
public boolean isViolation (VariantContext vc)
{
return setAlleles(vc) && isViolation();
}
/**
* @return whether or not there is a mendelian violation at the site.
*/
public boolean isViolation() {
if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) ||
allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0)))
return false;
return true;
}
/**
* @return the likelihood ratio for a mendelian violation
*/
public double violationLikelihoodRatio(VariantContext vc) {
double[] logLikAssignments = new double[27];
// the matrix to set up is
// MOM DAD CHILD
// |- AA
// AA AA | AB
// |- BB
// |- AA
// AA AB | AB
// |- BB
// etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs
double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector();
double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector();
double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector();
int offset = 0;
for ( int oMom = 0; oMom < 3; oMom++ ) {
for ( int oDad = 0; oDad < 3; oDad++ ) {
for ( int oChild = 0; oChild < 3; oChild ++ ) {
logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild];
}
}
}
double[] mvLiks = new double[12];
double[] nonMVLiks = new double[15];
for ( int i = 0; i < 12; i ++ ) {
mvLiks[i] = logLikAssignments[mvOffsets[i]];
}
for ( int i = 0; i < 15; i++) {
nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]];
}
return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks);
}
}