From 795c99d693409878dc64fad5232bb4574e7fa31d Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 28 Nov 2011 17:13:14 +0100 Subject: [PATCH] Adapted MendelianViolation to the new ped family representation. Adapted all classes using MendelianViolation too. MendelianViolationEvaluator was added a number of useful metrics on allele transmission and MVs --- .../walkers/annotator/MVLikelihoodRatio.java | 50 +- .../MendelianViolationEvaluator.java | 221 ++++---- .../walkers/variantutils/SelectVariants.java | 90 ++-- .../sting/utils/MendelianViolation.java | 476 +++++++++++++----- 4 files changed, 561 insertions(+), 276 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index bd0d4e3fb..b9e6a5b2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -3,22 +3,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Created by IntelliJ IDEA. @@ -30,23 +26,26 @@ import java.util.Map; public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { private MendelianViolation mendelianViolation = null; + private String motherId; + private String fatherId; + private String childId; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( mendelianViolation == null ) { - if ( walker instanceof VariantAnnotator && ((VariantAnnotator) walker).familyStr != null) { - mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP ); + if (checkAndSetSamples(((VariantAnnotator) walker).getSampleDB())) { + mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid Family String file (-family) on the command line."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) && vc.getGenotype(mendelianViolation.getSampleChild()).hasLikelihoods() && - vc.hasGenotype(mendelianViolation.getSampleDad()) && vc.getGenotype(mendelianViolation.getSampleDad()).hasLikelihoods() && - vc.hasGenotype(mendelianViolation.getSampleMom()) && vc.getGenotype(mendelianViolation.getSampleMom()).hasLikelihoods(); + boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && + vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && + vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc)); + toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); return toRet; } @@ -55,4 +54,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getKeyNames() { return Arrays.asList("MVLR"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + + private boolean checkAndSetSamples(SampleDB db){ + Set families = db.getFamilyIDs(); + if(families.size() != 1) + return false; + + Set family = db.getFamily(families.iterator().next()); + if(family.size() != 3) + return false; + + Iterator sampleIter = family.iterator(); + Sample sample; + for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ + if(sample.getParents().size()==2){ + motherId = sample.getMaternalID(); + fatherId = sample.getPaternalID(); + childId = sample.getID(); + return true; + } + } + return false; + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index 0cadf6c0d..363f5665f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -7,9 +9,11 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Map; +import java.util.Set; /** * Mendelian violation detection and counting @@ -40,12 +44,25 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; @Analysis(name = "Mendelian Violation Evaluator", description = "Mendelian Violation Evaluator") public class MendelianViolationEvaluator extends VariantEvaluator { - @DataPoint(description = "Number of mendelian variants found") + @DataPoint(description = "Number of variants found with at least one family having genotypes") long nVariants; + @DataPoint(description = "Number of variants found with no family having genotypes -- these sites do not count in the nNoCall") + long nSkipped; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual)") + long nFamCalled; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual) that contain at least one var allele.") + long nVarFamCalled; + @DataPoint(description="Number of variants x families discarded as low quality") + long nLowQual; + @DataPoint(description="Number of variants x families discarded as no call") + long nNoCall; + @DataPoint(description="Number of loci with mendelian violations") + long nLociViolations; @DataPoint(description = "Number of mendelian violations found") long nViolations; - @DataPoint(description = "number of child hom ref calls where the parent was hom variant") + + /*@DataPoint(description = "number of child hom ref calls where the parent was hom variant") long KidHomRef_ParentHomVar; @DataPoint(description = "number of child het calls where the parent was hom ref") long KidHet_ParentsHomRef; @@ -53,11 +70,65 @@ public class MendelianViolationEvaluator extends VariantEvaluator { long KidHet_ParentsHomVar; @DataPoint(description = "number of child hom variant calls where the parent was hom ref") long KidHomVar_ParentHomRef; + */ + + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HOM_VAR") + long mvRefRef_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HET") + long mvRefRef_Het; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HET -> HOM_VAR") + long mvRefHet_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_VAR") + long mvRefVar_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_REF") + long mvRefVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HET -> HOM_REF") + long mvVarHet_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HOM_REF") + long mvVarVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HET") + long mvVarVar_Het; + + + /*@DataPoint(description ="Number of inherited var alleles from het parents") + long nInheritedVar; + @DataPoint(description ="Number of inherited ref alleles from het parents") + long nInheritedRef;*/ + + @DataPoint(description="Number of HomRef/HomRef/HomRef trios") + long HomRefHomRef_HomRef; + @DataPoint(description="Number of Het/Het/Het trios") + long HetHet_Het; + @DataPoint(description="Number of Het/Het/HomRef trios") + long HetHet_HomRef; + @DataPoint(description="Number of Het/Het/HomVar trios") + long HetHet_HomVar; + @DataPoint(description="Number of HomVar/HomVar/HomVar trios") + long HomVarHomVar_HomVar; + @DataPoint(description="Number of HomRef/HomVar/Het trios") + long HomRefHomVAR_Het; + @DataPoint(description="Number of ref alleles inherited from het/het parents") + long HetHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from het/het parents") + long HetHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homRef/het parents") + long HomRefHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homRef/het parents") + long HomRefHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homVar/het parents") + long HomVarHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homVar/het parents") + long HomVarHet_inheritedVar; MendelianViolation mv; + PrintStream mvFile; + Map> families; public void initialize(VariantEvalWalker walker) { - mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + //Changed by Laurent Francioli - 2011-06-07 + //mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + mv = new MendelianViolation(walker.getMendelianViolationQualThreshold(),false); + families = walker.getSampleDB().getFamilies(); } public boolean enabled() { @@ -75,110 +146,48 @@ public class MendelianViolationEvaluator extends VariantEvaluator { public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci - if (mv.setAlleles(vc)) { + + if(mv.countViolations(families,vc)>0){ + nLociViolations++; + nViolations += mv.getViolationsCount(); + mvRefRef_Var += mv.getParentsRefRefChildVar(); + mvRefRef_Het += mv.getParentsRefRefChildHet(); + mvRefHet_Var += mv.getParentsRefHetChildVar(); + mvRefVar_Var += mv.getParentsRefVarChildVar(); + mvRefVar_Ref += mv.getParentsRefVarChildRef(); + mvVarHet_Ref += mv.getParentsVarHetChildRef(); + mvVarVar_Ref += mv.getParentsVarVarChildRef(); + mvVarVar_Het += mv.getParentsVarVarChildHet(); + + } + HomRefHomRef_HomRef += mv.getRefRefRef(); + HetHet_Het += mv.getHetHetHet(); + HetHet_HomRef += mv.getHetHetHomRef(); + HetHet_HomVar += mv.getHetHetHomVar(); + HomVarHomVar_HomVar += mv.getVarVarVar(); + HomRefHomVAR_Het += mv.getRefVarHet(); + HetHet_inheritedRef += mv.getParentsHetHetInheritedRef(); + HetHet_inheritedVar += mv.getParentsHetHetInheritedVar(); + HomRefHet_inheritedRef += mv.getParentsRefHetInheritedRef(); + HomRefHet_inheritedVar += mv.getParentsRefHetInheritedVar(); + HomVarHet_inheritedRef += mv.getParentsVarHetInheritedRef(); + HomVarHet_inheritedVar += mv.getParentsVarHetInheritedVar(); + + if(mv.getFamilyCalledCount()>0){ nVariants++; - - Genotype momG = vc.getGenotype(mv.getSampleMom()); - Genotype dadG = vc.getGenotype(mv.getSampleDad()); - Genotype childG = vc.getGenotype(mv.getSampleChild()); - - if (mv.isViolation()) { - nViolations++; - - String label; - if (childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar())) { - label = "KidHomRef_ParentHomVar"; - KidHomRef_ParentHomVar++; - } else if (childG.isHet() && (momG.isHomRef() && dadG.isHomRef())) { - label = "KidHet_ParentsHomRef"; - KidHet_ParentsHomRef++; - } else if (childG.isHet() && (momG.isHomVar() && dadG.isHomVar())) { - label = "KidHet_ParentsHomVar"; - KidHet_ParentsHomVar++; - } else if (childG.isHomVar() && (momG.isHomRef() || dadG.isHomRef())) { - label = "KidHomVar_ParentHomRef"; - KidHomVar_ParentHomRef++; - } else { - throw new ReviewedStingException("BUG: unexpected child genotype class " + childG); - } - - return "MendelViolation=" + label; - } + nFamCalled += mv.getFamilyCalledCount(); + nLowQual += mv.getFamilyLowQualsCount(); + nNoCall += mv.getFamilyNoCallCount(); + nVarFamCalled += mv.getVarFamilyCalledCount(); } - } - - return null; // we don't capture any intersting sites - } - - -/* - private double getQThreshold() { - //return getVEWalker().MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred - return mendelianViolationQualThreshold / 10; // we aren't 10x scaled in the GATK a la phred - //return 0.0; - } - - TrioStructure trio; - double mendelianViolationQualThreshold; - - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - - public static class TrioStructure { - public String mom, dad, child; - } - - public static TrioStructure parseTrioDescription(String family) { - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - TrioStructure trio = new TrioStructure(); - //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); - trio.mom = m.group(1); - trio.dad = m.group(2); - trio.child = m.group(3); - return trio; - } else { - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); - } - } - - public void initialize(VariantEvalWalker walker) { - trio = parseTrioDescription(walker.getFamilyStructure()); - mendelianViolationQualThreshold = walker.getMendelianViolationQualThreshold(); - } - - private boolean includeGenotype(Genotype g) { - return g.getLog10PError() > getQThreshold() && g.isCalled(); - } - - public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) { - return isViolation(vc, momG.getAlleles(), dadG.getAlleles(), childG.getAlleles()); - } - - public static boolean isViolation(VariantContext vc, TrioStructure trio ) { - return isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child) ); - } - - public static boolean isViolation(VariantContext vc, List momA, List dadA, List childA) { - //VariantContext momVC = vc.subContextFromGenotypes(momG); - //VariantContext dadVC = vc.subContextFromGenotypes(dadG); - int i = 0; - Genotype childG = new Genotype("kidG", childA); - for (Allele momAllele : momA) { - for (Allele dadAllele : dadA) { - if (momAllele.isCalled() && dadAllele.isCalled()) { - Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); - if (childG.sameGenotype(possibleChild)) { - return false; - } - } + else{ + nSkipped++; } + + + return null; } - return true; + return null; // we don't capture any interesting sites } - - -*/ - - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index b0016ff4b..fc01dae9f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -26,9 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.variantcontext.*; @@ -41,7 +41,6 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import java.io.File; -import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.*; @@ -282,6 +281,9 @@ public class SelectVariants extends RodWalker { @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; + @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) + private double fractionGenotypes = 0; + /** * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. * When specified one or more times, a particular type of variant is selected. @@ -325,7 +327,7 @@ public class SelectVariants extends RodWalker { private boolean DISCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false; - private Set mvSet = new HashSet(); + private MendelianViolation mv; /* variables used by the SELECT RANDOM modules */ @@ -344,6 +346,8 @@ public class SelectVariants extends RodWalker { private PrintStream outMVFileStream = null; + //Random number generator for the genotypes to remove + private Random randomGenotypes = new Random(); /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher @@ -380,8 +384,6 @@ public class SelectVariants extends RodWalker { for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); - - // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include if (TYPES_TO_INCLUDE.isEmpty()) { @@ -421,29 +423,7 @@ public class SelectVariants extends RodWalker { if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { - if ( FAMILY_STRUCTURE_FILE != null) { - try { - for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) { - MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); - if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) - mvSet.add(mv); - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); - } - if (outMVFile != null) - try { - outMVFileStream = new PrintStream(outMVFile); - } - catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); } - } - else - mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); - } - else if (!FAMILY_STRUCTURE.isEmpty()) { - mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); - MENDELIAN_VIOLATIONS = true; + mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); } SELECT_RANDOM_NUMBER = numRandom > 0; @@ -479,26 +459,26 @@ public class SelectVariants extends RodWalker { } for (VariantContext vc : vcs) { - if (MENDELIAN_VIOLATIONS) { - boolean foundMV = false; - for (MendelianViolation mv : mvSet) { - if (mv.isViolation(vc)) { - foundMV = true; - //System.out.println(vc.toString()); - if (outMVFile != null) - outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + + if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1) + break; + + if (outMVFile != null){ + for( String familyId : mv.getViolationFamilies()){ + for(Sample sample : this.getSampleDB().getFamily(familyId)){ + if(sample.getParents().size() > 0){ + outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)), - mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), - vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), - vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), - vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() ); + sample.getMaternalID(), sample.getPaternalID(), sample.getID(), + vc.getGenotype(sample.getMaternalID()).toBriefString(), vc.getGenotype(sample.getMaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getPaternalID()).toBriefString(), vc.getGenotype(sample.getPaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getID()).toBriefString(),vc.getGenotype(sample.getID()).getLikelihoods().getAsString() ); + + } } } - - if (!foundMV) - break; } + if (DISCORDANCE_ONLY) { Collection compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) @@ -657,9 +637,31 @@ public class SelectVariants extends RodWalker { final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); VariantContextBuilder builder = new VariantContextBuilder(sub); + GenotypesContext newGC = sub.getGenotypes(); + // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - builder.genotypes(VariantContextUtils.stripPLs(vc.getGenotypes())); + newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + + //Remove a fraction of the genotypes if needed + if(fractionGenotypes>0){ + ArrayList genotypes = new ArrayList(); + for ( Genotype genotype : newGC ) { + //Set genotype to no call if it falls in the fraction. + if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = new ArrayList(2); + alleles.add(Allele.create((byte)'.')); + alleles.add(Allele.create((byte)'.')); + genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap(),false)); + } + else{ + genotypes.add(genotype); + } + } + newGC = GenotypesContext.create(genotypes); + } + + builder.genotypes(newGC); int depth = 0; for (String sample : sub.getSampleNames()) { diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index cf45dab79..e140575c0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -1,147 +1,399 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Collection; -import java.util.List; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * User: carneiro + * User: carneiro / lfran * Date: 3/9/11 * Time: 12:38 PM + * + * Class for the identification and tracking of mendelian violation. It can be used in 2 distinct ways: + * - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while + * walking over the variants + * - Or using the static methods to directly get information about mendelian violation in a family at a given locus + * */ public class MendelianViolation { - String sampleMom; - String sampleDad; - String sampleChild; + //List of families with violations + private List violationFamilies; - List allelesMom; - List allelesDad; - List allelesChild; + //Call information + private int nocall = 0; + private int familyCalled = 0; + private int varFamilyCalled = 0; + private int lowQual = 0; - double minGenotypeQuality; + private boolean allCalledOnly = true; + + //Stores occurrences of inheritance + private EnumMap>> inheritance; + + private int violations_total=0; + + private double minGenotypeQuality; + + private boolean abortOnSampleNotFound; + + //Number of families with genotype information for all members + public int getFamilyCalledCount(){ + return familyCalled; + } + + //Number of families with genotype information for all members + public int getVarFamilyCalledCount(){ + return varFamilyCalled; + } + + //Number of families missing genotypes for one or more of their members + public int getFamilyNoCallCount(){ + return nocall; + } + + //Number of families with genotypes below the set quality threshold + public int getFamilyLowQualsCount(){ + return lowQual; + } + + public int getViolationsCount(){ + return violations_total; + } + + //Count of alt alleles inherited from het parents (no violation) + public int getParentHetInheritedVar(){ + return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar(); + } + + //Count of ref alleles inherited from het parents (no violation) + public int getParentHetInheritedRef(){ + return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef(); + } + + //Count of HomRef/HomRef/HomRef trios + public int getRefRefRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + } + + //Count of HomVar/HomVar/HomVar trios + public int getVarVarVar(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + } + + //Count of HomRef/HomVar/Het trios + public int getRefVarHet(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + } + + //Count of Het/Het/Het trios + public int getHetHetHet(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET); + } + + //Count of Het/Het/HomRef trios + public int getHetHetHomRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + } + + //Count of Het/Het/HomVar trios + public int getHetHetHomVar(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + } + + //Count of ref alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) + + 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + //return parentsHetHet_childRef; + } + + //Count of var alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedVar(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) + + 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + //return parentsHetHet_childVar; + } + + //Count of ref alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + //return parentsHomRefHet_childRef; + } + + //Count of var alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + //return parentsHomRefHet_childVar; + } + + //Count of ref alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedRef(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + //return parentsHomVarHet_childRef; + } + + //Count of var alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedVar(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + //return parentsHomVarHet_childVar; + } + + //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR + public int getParentsRefRefChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_REF -> HET + public int getParentsRefRefChildHet(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + } + + //Count of violations of the type HOM_REF/HET -> HOM_VAR + public int getParentsRefHetChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR + public int getParentsRefVarChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF + public int getParentsRefVarChildRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HET -> HOM_REF + public int getParentsVarHetChildRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF + public int getParentsVarVarChildRef(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HET + public int getParentsVarVarChildHet(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + } + + + //Count of violations of the type HOM_VAR/? -> HOM_REF + public int getParentVarChildRef(){ + return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef(); + } + + //Count of violations of the type HOM_REF/? -> HOM_VAR + public int getParentRefChildVar(){ + return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar(); + } + + //Returns a String containing all trios where a Mendelian violation was observed. + //The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..." + public String getViolationFamiliesString(){ + if(violationFamilies.isEmpty()) + return ""; + + Iterator it = violationFamilies.iterator(); + String violationFams = it.next(); + while(it.hasNext()){ + violationFams += ","+it.next(); + } + return violationFams; + } + + public List getViolationFamilies(){ + return violationFamilies; + } static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - - public String getSampleMom() { - return sampleMom; - } - public String getSampleDad() { - return sampleDad; - } - public String getSampleChild() { - return sampleChild; - } public double getMinGenotypeQuality() { return minGenotypeQuality; } - /** - * - * @param sampleMomP - sample name of mom - * @param sampleDadP - sample name of dad - * @param sampleChildP - sample name of child - */ - public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) { - sampleMom = sampleMomP; - sampleDad = sampleDadP; - sampleChild = sampleChildP; - } - - /** - * - * @param family - the sample names string "mom+dad=child" + /** + * Constructor * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - */ - public MendelianViolation(String family, double minGenotypeQualityP) { - minGenotypeQuality = minGenotypeQualityP; - - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - sampleMom = m.group(1); - sampleDad = m.group(2); - sampleChild = m.group(3); - } - else - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); - } - - /** - * An alternative to the more general constructor if you want to get the Sample information from the engine yourself. - * @param sample - the sample object extracted from the sample metadata YAML file given to the engine. - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - */ - public MendelianViolation(Sample sample, double minGenotypeQualityP) { - sampleMom = sample.getMother().getID(); - sampleDad = sample.getFather().getID(); - sampleChild = sample.getID(); - minGenotypeQuality = minGenotypeQualityP; - } - - /** - * This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to - * isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site - * before evaluating it for mendelian violation, you can call setAlleles and then isViolation(). - * @param vc - the variant context to extract the genotypes and alleles for mom, dad and child. - * @return false if couldn't find the genotypes or context has empty alleles. True otherwise. - */ - public boolean setAlleles (VariantContext vc) - { - Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom); - Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad); - Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild); - - if (gMom == null || gDad == null || gChild == null) - throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild)); - - if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() || - gMom.getPhredScaledQual() < minGenotypeQuality || - gDad.getPhredScaledQual() < minGenotypeQuality || - gChild.getPhredScaledQual() < minGenotypeQuality ) { - - return false; - } - - allelesMom = gMom.getAlleles(); - allelesDad = gDad.getAlleles(); - allelesChild = gChild.getAlleles(); - return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty(); - } - - - /** * + */ + public MendelianViolation(double minGenotypeQualityP) { + this(minGenotypeQualityP,true); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + * @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + allCalledOnly = completeTriosOnly; + } + + /** + * @param families the families to be checked for Mendelian violations * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. - * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation. - * - */ - public boolean isViolation(VariantContext vc) - { - return setAlleles(vc) && isViolation(); - } - - /** * @return whether or not there is a mendelian violation at the site. */ - public boolean isViolation() { - if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) || - allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0))) - return false; - return true; + public int countViolations(Map> families, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + + for(Set family : families.values()){ + Iterator sampleIterator = family.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() > 0) + updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc); + } + } + return violations_total; + } + + public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc); + return violations_total>0; + } + + + private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){ + + int count; + Genotype gMom = vc.getGenotype(motherId); + Genotype gDad = vc.getGenotype(fatherId); + Genotype gChild = vc.getGenotype(childId); + + if (gMom == null || gDad == null || gChild == null){ + if(abortOnSampleNotFound) + throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s family=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId)); + else + return; + } + //Count No calls + if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){ + nocall++; + } + else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){ + nocall++; + } + //Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned + else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality || + gDad.getPhredScaledQual() < minGenotypeQuality || + gChild.getPhredScaledQual() < minGenotypeQuality )) { + lowQual++; + } + else{ + //Count all families per loci called + familyCalled++; + //If the family is all homref, not too interesting + if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef())) + { + varFamilyCalled++; + if(isViolation(gMom, gDad, gChild)){ + violationFamilies.add(familyId); + violations_total++; + } + } + count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType()); + inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1); + + } + } + + private boolean isViolation(Genotype gMom, Genotype gDad, Genotype gChild) { + //1 parent is no "call + if(!gMom.isCalled()){ + return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef()); + } + else if(!gDad.isCalled()){ + return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef()); + } + //Both parents have genotype information + return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && gDad.getAlleles().contains(gChild.getAlleles().get(1)) || + gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0))); + } + + private void createInheritanceMap(){ + + inheritance = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mType : Genotype.Type.values()){ + inheritance.put(mType, new EnumMap>(Genotype.Type.class)); + for(Genotype.Type dType : Genotype.Type.values()){ + inheritance.get(mType).put(dType, new EnumMap(Genotype.Type.class)); + for(Genotype.Type cType : Genotype.Type.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + + } + + private void clearInheritanceMap(){ + for(Genotype.Type mType : Genotype.Type.values()){ + for(Genotype.Type dType : Genotype.Type.values()){ + for(Genotype.Type cType : Genotype.Type.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } } /** * @return the likelihood ratio for a mendelian violation */ - public double violationLikelihoodRatio(VariantContext vc) { + public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) { double[] logLikAssignments = new double[27]; // the matrix to set up is // MOM DAD CHILD @@ -152,9 +404,9 @@ public class MendelianViolation { // AA AB | AB // |- BB // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs - double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector(); - double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector(); - double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector(); + double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector(); + double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector(); + double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector(); int offset = 0; for ( int oMom = 0; oMom < 3; oMom++ ) { for ( int oDad = 0; oDad < 3; oDad++ ) {