diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 792900a4e..4925a1708 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.MendelianViolation; +import org.broadinstitute.sting.utils.MendelianViolation; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFConstants; import org.broad.tribble.vcf.VCFHeader; @@ -70,9 +70,13 @@ public class SelectVariants extends RodWalker { @Argument(fullName="discordance", shortName = "disc", doc="Output variants that were not called from a ROD comparison track. Use -disc ROD_NAME", required=false) private String discordanceRodName = ""; - @Argument(fullName="family", shortName="fam", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + @Deprecated + @Argument(fullName="family", shortName="fam", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) private String FAMILY_STRUCTURE = ""; + @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false) + private Boolean MENDELIAN_VIOLATIONS = false; + @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; @@ -85,7 +89,6 @@ public class SelectVariants extends RodWalker { private boolean DISCORDANCE_ONLY = false; private MendelianViolation mv; - private boolean MENDELIAN_VIOLATIONS = false; /** @@ -107,10 +110,10 @@ public class SelectVariants extends RodWalker { DISCORDANCE_ONLY = discordanceRodName.length() > 0; if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceRodName); - if (!FAMILY_STRUCTURE.isEmpty()) { + if (MENDELIAN_VIOLATIONS) + mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD); + else if (!FAMILY_STRUCTURE.isEmpty()) mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD); - MENDELIAN_VIOLATIONS = true; - } // Initialize VCF header Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); diff --git a/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/java/src/org/broadinstitute/sting/utils/MendelianViolation.java new file mode 100755 index 000000000..01c076a1c --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -0,0 +1,148 @@ +package org.broadinstitute.sting.utils; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * User: carneiro + * Date: 3/9/11 + * Time: 12:38 PM + */ +public class MendelianViolation { + + String sampleMom; + String sampleDad; + String sampleChild; + + List allelesMom; + List allelesDad; + List allelesChild; + + double minGenotypeQuality; + + private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); + + /** + * + * @param sampleMomP - sample name of mom + * @param sampleDadP - sample name of dad + * @param sampleChildP - sample name of child + */ + public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) { + sampleMom = sampleMomP; + sampleDad = sampleDadP; + sampleChild = sampleChildP; + } + + /** + * + * @param family - the sample names string "mom+dad=child" + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + */ + public MendelianViolation(String family, double minGenotypeQualityP) { + minGenotypeQuality = minGenotypeQualityP; + + Matcher m = FAMILY_PATTERN.matcher(family); + if (m.matches()) { + sampleMom = m.group(1); + sampleDad = m.group(2); + sampleChild = m.group(3); + } + else + throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); + } + + /** + * An alternative to the more general constructor if you want to get the Sample information from the engine yourself. + * @param sample - the sample object extracted from the sample metadata YAML file given to the engine. + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + */ + public MendelianViolation(Sample sample, double minGenotypeQualityP) { + sampleMom = sample.getMother().getId(); + sampleDad = sample.getFather().getId(); + sampleChild = sample.getId(); + minGenotypeQuality = minGenotypeQualityP; + } + + + /** + * The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option. + * @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored. + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + */ + public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) { + boolean gotSampleInformation = false; + Collection samples = engine.getSamples(); + // Iterate through all samples in the sample_metadata file but we really can only take one. + for (Sample sample : samples) { + if (sample.getMother() != null && sample.getFather() != null) { + sampleMom = sample.getMother().getId(); + sampleDad = sample.getFather().getId(); + sampleChild = sample.getId(); + minGenotypeQuality = minGenotypeQualityP; + gotSampleInformation = true; + break; // we can only deal with one trio information + } + } + if (!gotSampleInformation) + throw new UserException("YAML file has no sample with relationship information (mother/father)"); + } + + + /** + * @param vc - the variant context to extract the genotypes and alleles for mom, dad and child. + * @return false if couldn't find the genotypes or context has empty alleles. True otherwise. + */ + private boolean setAlleles (VariantContext vc) + { + Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom); + Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad); + Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild); + + if (gMom == null || gDad == null || gChild == null) + throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild)); + + if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() || + gMom.getPhredScaledQual() < minGenotypeQuality || + gDad.getPhredScaledQual() < minGenotypeQuality || + gChild.getPhredScaledQual() < minGenotypeQuality ) { + + return false; + } + + allelesMom = gMom.getAlleles(); + allelesDad = gDad.getAlleles(); + allelesChild = gChild.getAlleles(); + return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty(); + } + + + /** + * + * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. + * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation. + * + */ + public boolean isViolation (VariantContext vc) + { + return setAlleles(vc) && isViolation(); + } + + /** + * @return whether or not there is a mendelian violation at the site. + */ + private boolean isViolation() { + if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) || + allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0))) + return false; + return true; + } + +}