From 76dd816e709ef230231173b43076c4a423577a4a Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 12:47:27 +0200 Subject: [PATCH 01/20] Added getParents() -> returns an arrayList containing the sample's parent(s) if available --- .../broadinstitute/sting/gatk/samples/Sample.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index b39fdd79d..a14d999ea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.UserException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial return infoDB.getSample(paternalID); } + public ArrayList getParents(){ + ArrayList parents = new ArrayList(2); + Sample parent = getMother(); + if(parent != null) + parents.add(parent); + parent = getFather(); + if(parent != null) + parents.add(parent); + return parents; + } + /** * Get gender of the sample * @return property of key "gender" - must be of type Gender From ef6a6fdfe499809fc3fd6282a460c2fd542963c5 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 12:49:18 +0200 Subject: [PATCH 02/20] Added getAsMap -> returns the likelihoods as an EnumMap with Genotypes as keys and likelihoods as values. --- .../utils/variantcontext/GenotypeLikelihoods.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index dba16cf86..292bc17cd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,8 +25,12 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; +import java.util.Map; + public class GenotypeLikelihoods { public static final boolean CAP_PLS = false; public static final int PL_CAP = 255; @@ -94,6 +98,16 @@ public class GenotypeLikelihoods { return likelihoodsAsString_PLs; } + public EnumMap getAsMap(boolean normalizeFromLog10){ + //Make sure that the log10likelihoods are set + double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); + likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); + return likelihoodsMap; + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); From 1c61a573296f6df9a3b2af20255c0deff1ece565 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 13:06:44 +0200 Subject: [PATCH 03/20] Original rewrite of PhaseByTransmission: - Adapted to get the trio information from the SampleDB (i.e. from Pedigree file (ped)) => Multiple trios can be passed as argument - Mendelian violations and trio phasing possibilities are pre-calculated and stored in Maps. => Runtime is ~3x faster - Genotype combinations possible only given two MVs are now given a squared MV prior (e.g. 0/0+0/0=>1/1 is given 10^-16 prior if the MV prior is 10^-8) - Corrected bug: In case the best genotype combination is Het/Het/Het, the genotypes are now set appropriately (before original genotypes were left even if they weren't Het/Het/Het) - Basic reporting added: -- mvf argument let the user specify a file to report remaining MVs -- When the walker ends, some basic stats about the genotype reconfiguration and phasing are output Known problems: - GQ is not recalculated even if the genotype changes Possible improvements: - Phase partially typed trios - Use standard Allele/Genotype Classes for the storage of the pre-calculated phase --- .../walkers/phasing/PhaseByTransmission.java | 781 +++++++++++++----- 1 file changed, 569 insertions(+), 212 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 3eedc2a28..12b541526 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -7,18 +7,18 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.io.File; -import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; /** @@ -32,13 +32,13 @@ import java.util.*; * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to * begin. */ -public class PhaseByTransmission extends RodWalker { +public class PhaseByTransmission extends RodWalker, HashMap> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.") - public ArrayList familySpecs = null; + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; @Output protected VCFWriter vcfWriter = null; @@ -48,239 +48,523 @@ public class PhaseByTransmission extends RodWalker { private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; - private class Trio { - private String mother; - private String father; - private String child; + private ArrayList trios = new ArrayList(); - public Trio(String mother, String father, String child) { - this.mother = mother; - this.father = father; - this.child = child; - } + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; - public Trio(String familySpec) { - String[] pieces = familySpec.split("[\\+\\=]"); + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; - this.mother = pieces[0]; - this.father = pieces[1]; - this.child = pieces[2]; - } + //Metrics counters hashkeys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_HET = 3; + private final Byte NUM_HET_HET_HET = 4; + private final Byte NUM_VIOLATIONS = 5; - public String getMother() { return mother; } - public String getFather() { return father; } - public String getChild() { return child; } + private enum AlleleType { + NO_CALL, + REF, + VAR, + UNPHASED_REF, + UNPHASED_VAR } - private ArrayList trios = new ArrayList(); + //Stores a trio-genotype + private class TrioPhase { - public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { - if (familySpecs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // spec list set, and treat the entries as if they had been specified on the command line. - ArrayList specs = new ArrayList(); - for (String familySpec : familySpecs) { - File specFile = new File(familySpec); + private ArrayList trioAlleles = new ArrayList(6); - try { - XReadLines reader = new XReadLines(specFile); + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(AlleleType.REF); + alleles.add(AlleleType.REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(AlleleType.REF); + alleles.add(AlleleType.VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(AlleleType.VAR); + alleles.add(AlleleType.VAR); + } + else{ + alleles.add(AlleleType.NO_CALL); + alleles.add(AlleleType.NO_CALL); + } + return alleles; + } - List lines = reader.readLines(); - for (String line : lines) { - specs.add(new Trio(line)); - } - } catch (FileNotFoundException e) { - specs.add(new Trio(familySpec)); // not a file, so must be a family spec - } - } + private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){ + if(genotype == Genotype.Type.HET){ + ArrayList phasedAlleles = new ArrayList(2); + phasedAlleles.add(AlleleType.UNPHASED_REF); + phasedAlleles.add(AlleleType.UNPHASED_VAR); + return phasedAlleles; + } + else + return getAlleles(genotype); + } - return specs; + private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){ + ArrayList phasedAlleles = new ArrayList(4); + //Special case for Het/Het as it is ambiguous + if(parent == Genotype.Type.HET && child == Genotype.Type.HET){ + phasedAlleles.add(AlleleType.UNPHASED_REF); + phasedAlleles.add(AlleleType.UNPHASED_VAR); + phasedAlleles.add(AlleleType.UNPHASED_REF); + phasedAlleles.add(AlleleType.UNPHASED_VAR); } - return new ArrayList(); + ArrayList parentAlleles = getAlleles(parent); + ArrayList childAlleles = getAlleles(child); + + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + phasedAlleles.add(parentAlleles.get(0)); + phasedAlleles.add(parentAlleles.get(1)); + phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + phasedAlleles.add(childAlleles.get(0)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + phasedAlleles.add(parentAlleles.get(1)); + phasedAlleles.add(parentAlleles.get(0)); + phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + phasedAlleles.add(childAlleles.get(0)); + } + else{ + parentAlleles.addAll(childAlleles); + for(AlleleType allele : parentAlleles){ + if(allele == AlleleType.REF){ + phasedAlleles.add(AlleleType.UNPHASED_REF); + } + else if(allele == AlleleType.VAR){ + phasedAlleles.add(AlleleType.UNPHASED_VAR); + } + else{ + phasedAlleles.add(AlleleType.NO_CALL); + } + } + } + + return phasedAlleles; + } + + private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + ArrayList phasedAlleles = new ArrayList(6); + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (AlleleType momAllele : motherAlleles) { + for (AlleleType fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } + } + + for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) { + int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0)); + int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1)); + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Add mother's alleles + phasedAlleles.add(phasedChildGenotype.get(0)); + if(motherAlleles.get(0) != phasedAlleles.get(0)) + phasedAlleles.add(motherAlleles.get(0)); + else + phasedAlleles.add(motherAlleles.get(1)); + + //Add father's alleles + phasedAlleles.add(phasedChildGenotype.get(1)); + if(fatherAlleles.get(0) != phasedAlleles.get(2)) + phasedAlleles.add(fatherAlleles.get(0)); + else + phasedAlleles.add(fatherAlleles.get(1)); + + //Add child's alleles + phasedAlleles.addAll(phasedChildGenotype); + return phasedAlleles; + } + } + + //If this is reached then no phasing could be found + motherAlleles.addAll(fatherAlleles); + motherAlleles.addAll(childAlleles); + for(AlleleType allele : motherAlleles){ + if(allele == AlleleType.REF){ + phasedAlleles.add(AlleleType.UNPHASED_REF); + } + else if(allele == AlleleType.VAR){ + phasedAlleles.add(AlleleType.UNPHASED_VAR); + } + else{ + phasedAlleles.add(AlleleType.NO_CALL); + } + } + return phasedAlleles; + } + + + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Take care of cases where one or more family members are no call + if(child == Genotype.Type.NO_CALL){ + trioAlleles.addAll(phaseSingleIndividualAlleles(mother)); + trioAlleles.addAll(phaseSingleIndividualAlleles(father)); + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.add(AlleleType.NO_CALL); + } + else if(mother == Genotype.Type.NO_CALL){ + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.add(AlleleType.NO_CALL); + if(father == Genotype.Type.NO_CALL){ + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.addAll(phaseSingleIndividualAlleles(child)); + } + else + trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child)); + } + else if(father == Genotype.Type.NO_CALL){ + trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child)); + trioAlleles.add(2, AlleleType.NO_CALL); + trioAlleles.add(3, AlleleType.NO_CALL); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + trioAlleles.add(AlleleType.UNPHASED_REF); + trioAlleles.add(AlleleType.UNPHASED_VAR); + trioAlleles.add(AlleleType.UNPHASED_REF); + trioAlleles.add(AlleleType.UNPHASED_VAR); + trioAlleles.add(AlleleType.UNPHASED_REF); + trioAlleles.add(AlleleType.UNPHASED_VAR); + } + //All family members have genotypes and at least one of them is not Het + else{ + trioAlleles = phaseFamilyAlleles(mother, father, child); + } + } + + public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6))); + return phasedGenotypes; + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){ + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); + + boolean isPhased = true; + + List alleles = new ArrayList(2); + + //If unphased, return original genotype + for(AlleleType allele : phasedAlleles){ + if(allele == AlleleType.NO_CALL){ + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); + return genotype; + } + //Otherwise add the appropriate allele + else if(allele == AlleleType.UNPHASED_REF){ + isPhased = false; + alleles.add(refAllele); + } + else if(allele == AlleleType.UNPHASED_VAR){ + isPhased = false; + alleles.add(altAllele); + } + else if(allele == AlleleType.REF){ + alleles.add(refAllele); + } + else if(allele == AlleleType.VAR){ + alleles.add(altAllele); + } + } + //TODO: Recalculate GQ field for the new genotype + //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment + //double[] likelihoods = genotype.getLikelihoods().getAsVector(); + + //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]); + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); + return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased); + } + + } /** * Parse the familial relationship specification, and initialize VCF writer */ public void initialize() { - trios = getFamilySpecsFromCommandLineInput(familySpecs); - ArrayList rodNames = new ArrayList(); rodNames.add(variantCollection.variants.getName()); - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + //Get the trios from the families passed as ped + setTrios(); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted."); + + Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + + buildMatrices(); + + if(mvFile != null) + mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL"); + } - private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { - double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector()); - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector()); - double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector()); + /** + * Select Trios only + */ + private void setTrios(){ - int momIndex = mom.getType().ordinal() - 1; - int dadIndex = dad.getType().ordinal() - 1; - int childIndex = child.getType().ordinal() - 1; - - return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex]; - } - - private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(refAllele); - homRefAlleles.add(refAllele); - Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List hetAlleles = new ArrayList(); - hetAlleles.add(refAllele); - hetAlleles.add(altAllele); - Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List homVarAlleles = new ArrayList(); - homVarAlleles.add(altAllele); - homVarAlleles.add(altAllele); - Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - ArrayList genotypes = new ArrayList(); - genotypes.add(homRef); - genotypes.add(het); - genotypes.add(homVar); - - return genotypes; - } - - private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) { - List alleles = g.getAlleles(); - int matchingAlleles = 0; - - for (Allele a : alleles) { - if (!alleleToMatch.equals(a)) { - matchingAlleles++; + Map> families = this.getSampleDB().getFamilies(); + Set family; + ArrayList parents; + for(String familyID : families.keySet()){ + family = families.get(familyID); + if(family.size()!=3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size())); } - } - - return matchingAlleles; - } - - private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) { - int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0; - int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0; - - int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0; - int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0; - - int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child); - int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child); - - return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles); - } - - private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) { - Set possiblePhasedChildGenotypes = new HashSet(); - - for (Allele momAllele : mom.getAlleles()) { - for (Allele dadAllele : dad.getAlleles()) { - ArrayList possiblePhasedChildAlleles = new ArrayList(); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(dadAllele); - - Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true); - - possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); - } - } - - ArrayList finalGenotypes = new ArrayList(); - - for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) { - if (child.sameGenotype(phasedChildGenotype, true)) { - Allele momTransmittedAllele = phasedChildGenotype.getAllele(0); - Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1); - - ArrayList phasedMomAlleles = new ArrayList(); - phasedMomAlleles.add(momTransmittedAllele); - phasedMomAlleles.add(momUntransmittedAllele); - - Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true); - - Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1); - Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1); - - ArrayList phasedDadAlleles = new ArrayList(); - phasedDadAlleles.add(dadTransmittedAllele); - phasedDadAlleles.add(dadUntransmittedAllele); - - Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true); - - finalGenotypes.add(phasedMomGenotype); - finalGenotypes.add(phasedDadGenotype); - finalGenotypes.add(phasedChildGenotype); - - return finalGenotypes; - } - } - - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); - - return finalGenotypes; - } - - private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { - ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); - - if (mother.isCalled() && father.isCalled() && child.isCalled()) { - ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); - ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); - - double bestConfigurationLikelihood = 0.0; - double bestPrior = 0.0; - Genotype bestMotherGenotype = mother; - Genotype bestFatherGenotype = father; - Genotype bestChildGenotype = child; - - double norm = 0.0; - - for (Genotype motherGenotype : possibleMotherGenotypes) { - for (Genotype fatherGenotype : possibleFatherGenotypes) { - for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); - norm += prior*configurationLikelihood; - - if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { - bestConfigurationLikelihood = configurationLikelihood; - bestPrior = prior; - bestMotherGenotype = motherGenotype; - bestFatherGenotype = fatherGenotype; - bestChildGenotype = childGenotype; - } - } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID)); + break; + } } } - if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { - Map attributes = new HashMap(); - attributes.putAll(bestChildGenotype.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); - bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + } - finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); + + + } + + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(Genotype.Type.class); + transmissionMatrix = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mother : Genotype.Type.values()){ + mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + for(Genotype.Type father : Genotype.Type.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + for(Genotype.Type child : Genotype.Type.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } + } + + private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; } } - return finalGenotypes; + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; + } + + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; + + //MV + return 1; + } + + private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; + + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return nonMVPrior; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return nonMVPrior; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; + } + } + + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return nonMVPrior; + else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return nonMVPrior; + else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles); + } + + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return nonMVPrior; + + //MV + return MENDELIAN_VIOLATION_PRIOR; + } + + private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; + } + + + private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + + //For now, only consider trios with complete information + //TODO: Phasing of trios with missing information + if(!(mother.isCalled() && father.isCalled() && child.isCalled())) { + finalGenotypes.add(mother); + finalGenotypes.add(father); + finalGenotypes.add(child); + return false; + } + + //Get the PL + Map motherLikelihoods = mother.getLikelihoods().getAsMap(true); + Map fatherLikelihoods = father.getLikelihoods().getAsMap(true); + Map childLikelihoods = child.getLikelihoods().getAsMap(true); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + boolean isMV = false; + int bestConfigurationGenotypeDiffs=4; + Genotype.Type bestMotherGenotype = mother.getType(); + Genotype.Type bestFatherGenotype = father.getType(); + Genotype.Type bestChildGenotype = child.getType(); + + //Get the most likely combination + int mvCount; + double configurationLikelihood; + int configurationGenotypeDiffs; + for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ + for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); + configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if ((configurationLikelihood > bestConfigurationLikelihood) || + (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { + bestConfigurationLikelihood = configurationLikelihood; + bestMotherGenotype = motherGenotype.getKey(); + bestFatherGenotype = fatherGenotype.getKey(); + bestChildGenotype = childGenotype.getKey(); + isMV = mvCount>0; + bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + } + } + } + } + + //Get the phased alleles for the genotype configuration + TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype); + + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes); + return isMV; + } /** @@ -292,18 +576,34 @@ public class PhaseByTransmission extends RodWalker { * @return null */ @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + //Local cars to avoid lookups on increment + int numTrioGenotypesCalled = 0; + int numTrioGenotypesNoCall = 0; + int numTrioGenotypesPhased = 0; + int numHet = 0 ; + int numHetHetHet = 0; + int numMVs = 0; + if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); Map genotypeMap = vc.getGenotypes(); - for (Trio trio : trios) { - Genotype mother = vc.getGenotype(trio.getMother()); - Genotype father = vc.getGenotype(trio.getFather()); - Genotype child = vc.getGenotype(trio.getChild()); + boolean isMV; - ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); + + //Skip trios where any of the genotype is missing in the variant context + if(mother == null || father == null | child == null) + continue; + + ArrayList trioGenotypes = new ArrayList(3); + isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); @@ -312,14 +612,47 @@ public class PhaseByTransmission extends RodWalker { genotypeMap.put(phasedMother.getSampleName(), phasedMother); genotypeMap.put(phasedFather.getSampleName(), phasedFather); genotypeMap.put(phasedChild.getSampleName(), phasedChild); + + //Increment metrics counters + if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){ + numTrioGenotypesCalled++; + if(phasedMother.isPhased()) + numTrioGenotypesPhased++; + + if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){ + numHet++; + if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){ + numHetHetHet++; + }else if(!phasedMother.isPhased()){ + int x =9; + } + } + + if(isMV){ + numMVs++; + if(mvFile != null) + mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); + } + }else{ + numTrioGenotypesNoCall++; + } + } + VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); vcfWriter.add(newvc); } - return null; + HashMap metricsCounters = new HashMap(5); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased); + metricsCounters.put(NUM_HET,numHet); + metricsCounters.put(NUM_HET_HET_HET,numHetHetHet); + metricsCounters.put(NUM_VIOLATIONS,numMVs); + return metricsCounters; } /** @@ -328,8 +661,15 @@ public class PhaseByTransmission extends RodWalker { * @return Initial value of reduce. */ @Override - public Integer reduceInit() { - return null; + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(5); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_HET,0); + metricsCounters.put(NUM_HET_HET_HET,0); + metricsCounters.put(NUM_VIOLATIONS,0); + return metricsCounters; } /** @@ -340,7 +680,24 @@ public class PhaseByTransmission extends RodWalker { * @return accumulator with result of the map taken into account. */ @Override - public Integer reduce(Integer value, Integer sum) { - return null; + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET)); + sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET)); + sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS)); + return sum; + } + + + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET)); + logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS)); } } From edea90786a4555ff665b07a3dc2f97ce7cbae24c Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 17:04:19 +0200 Subject: [PATCH 04/20] Genotype quality is now recalculated for each of the phased Genotypes. Small problem is that we unnecessarily loose a little precision on the genotypes that do not change after assignment. --- .../walkers/phasing/PhaseByTransmission.java | 9 ++---- .../variantcontext/GenotypeLikelihoods.java | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 12b541526..52629277f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -266,6 +266,7 @@ public class PhaseByTransmission extends RodWalker, HashMa Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); boolean isPhased = true; @@ -274,7 +275,6 @@ public class PhaseByTransmission extends RodWalker, HashMa //If unphased, return original genotype for(AlleleType allele : phasedAlleles){ if(allele == AlleleType.NO_CALL){ - genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); return genotype; } //Otherwise add the appropriate allele @@ -293,13 +293,8 @@ public class PhaseByTransmission extends RodWalker, HashMa alleles.add(altAllele); } } - //TODO: Recalculate GQ field for the new genotype - //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment - //double[] likelihoods = genotype.getLikelihoods().getAsVector(); - //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]); - genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); - return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased); + return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 292bc17cd..9ed9b41cc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,8 +25,10 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.jgrapht.util.MathUtil; import java.util.EnumMap; import java.util.Map; @@ -98,6 +100,7 @@ public class GenotypeLikelihoods { return likelihoodsAsString_PLs; } + //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values public EnumMap getAsMap(boolean normalizeFromLog10){ //Make sure that the log10likelihoods are set double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); @@ -108,6 +111,32 @@ public class GenotypeLikelihoods { return likelihoodsMap; } + //Return the log10 Genotype Quality (GQ) for the given genotype + public double getLog10GQ(Genotype.Type genotype){ + + double qual = Double.NEGATIVE_INFINITY; + EnumMap likelihoods = getAsMap(false); + for(Map.Entry likelihood : likelihoods.entrySet()){ + if(likelihood.getKey() == genotype) + continue; + if(likelihood.getValue() > qual) + qual = likelihood.getValue(); + + } + + qual = likelihoods.get(genotype) - qual; + + if (qual < 0) { + // QUAL can be negative if the chosen genotype is not the most likely one individually. + // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); + } + + return qual; + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); From 01b16abc8dcc233e29258eaa4e1d1c0415180e91 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 10:24:41 +0200 Subject: [PATCH 05/20] Genotype quality calculation modified to handle all genotypes the same way. This is inconsistent with GQ output by the UG but is correct even for cases of poor quality genotypes. --- .../variantcontext/GenotypeLikelihoods.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 9ed9b41cc..b83ffa2fd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -111,8 +111,8 @@ public class GenotypeLikelihoods { return likelihoodsMap; } - //Return the log10 Genotype Quality (GQ) for the given genotype - public double getLog10GQ(Genotype.Type genotype){ + //Return the neg log10 Genotype Quality (GQ) for the given genotype + public double getNegLog10GQ(Genotype.Type genotype){ double qual = Double.NEGATIVE_INFINITY; EnumMap likelihoods = getAsMap(false); @@ -124,15 +124,9 @@ public class GenotypeLikelihoods { } - qual = likelihoods.get(genotype) - qual; - - if (qual < 0) { - // QUAL can be negative if the chosen genotype is not the most likely one individually. - // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); - } + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); return qual; } From 7312e35c71167dee3510bf428204b3a4b75ab52f Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 10:25:53 +0200 Subject: [PATCH 06/20] Now makes use of standard Allele and Genotype classes. This allowed quite some code cleaning. --- .../walkers/phasing/PhaseByTransmission.java | 341 +++++++----------- 1 file changed, 135 insertions(+), 206 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 52629277f..62b7bfffa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -56,7 +56,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Matrix of allele transmission private EnumMap>> transmissionMatrix; - //Metrics counters hashkeys + //Metrics counters hash keys private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; @@ -64,237 +64,223 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_HET_HET_HET = 4; private final Byte NUM_VIOLATIONS = 5; - private enum AlleleType { - NO_CALL, - REF, - VAR, - UNPHASED_REF, - UNPHASED_VAR + private enum FamilyMember { + MOTHER, + FATHER, + CHILD } //Stores a trio-genotype private class TrioPhase { - private ArrayList trioAlleles = new ArrayList(6); + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. + private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; - private ArrayList getAlleles(Genotype.Type genotype){ - ArrayList alleles = new ArrayList(2); + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); + + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); if(genotype == Genotype.Type.HOM_REF){ - alleles.add(AlleleType.REF); - alleles.add(AlleleType.REF); + alleles.add(REF); + alleles.add(REF); } else if(genotype == Genotype.Type.HET){ - alleles.add(AlleleType.REF); - alleles.add(AlleleType.VAR); + alleles.add(REF); + alleles.add(VAR); } else if(genotype == Genotype.Type.HOM_VAR){ - alleles.add(AlleleType.VAR); - alleles.add(AlleleType.VAR); + alleles.add(VAR); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.NO_CALL){ + alleles.add(NO_CALL); + alleles.add(NO_CALL); } else{ - alleles.add(AlleleType.NO_CALL); - alleles.add(AlleleType.NO_CALL); + return null; } return alleles; } - private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){ - if(genotype == Genotype.Type.HET){ - ArrayList phasedAlleles = new ArrayList(2); - phasedAlleles.add(AlleleType.UNPHASED_REF); - phasedAlleles.add(AlleleType.UNPHASED_VAR); - return phasedAlleles; + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); } else - return getAlleles(genotype); + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){ - ArrayList phasedAlleles = new ArrayList(4); + private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ + //Special case for Het/Het as it is ambiguous - if(parent == Genotype.Type.HET && child == Genotype.Type.HET){ - phasedAlleles.add(AlleleType.UNPHASED_REF); - phasedAlleles.add(AlleleType.UNPHASED_VAR); - phasedAlleles.add(AlleleType.UNPHASED_REF); - phasedAlleles.add(AlleleType.UNPHASED_VAR); + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - ArrayList parentAlleles = getAlleles(parent); - ArrayList childAlleles = getAlleles(child); + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + //If there is a possible phasing between the mother and child => phase int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); if(childTransmittedAlleleIndex > -1){ - phasedAlleles.add(parentAlleles.get(0)); - phasedAlleles.add(parentAlleles.get(1)); - phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - phasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); } else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - phasedAlleles.add(parentAlleles.get(1)); - phasedAlleles.add(parentAlleles.get(0)); - phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - phasedAlleles.add(childAlleles.get(0)); + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); } + //This is a Mendelian Violation => Do not phase else{ - parentAlleles.addAll(childAlleles); - for(AlleleType allele : parentAlleles){ - if(allele == AlleleType.REF){ - phasedAlleles.add(AlleleType.UNPHASED_REF); - } - else if(allele == AlleleType.VAR){ - phasedAlleles.add(AlleleType.UNPHASED_VAR); - } - else{ - phasedAlleles.add(AlleleType.NO_CALL); - } - } + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - - return phasedAlleles; } - private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - ArrayList phasedAlleles = new ArrayList(6); + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - Set> possiblePhasedChildGenotypes = new HashSet>(); - ArrayList motherAlleles = getAlleles(mother); - ArrayList fatherAlleles = getAlleles(father); - ArrayList childAlleles = getAlleles(child); + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); //Build all possible child genotypes for the given parent's genotypes - for (AlleleType momAllele : motherAlleles) { - for (AlleleType fatherAllele : fatherAlleles) { - ArrayList possiblePhasedChildAlleles = new ArrayList(2); + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); possiblePhasedChildAlleles.add(momAllele); possiblePhasedChildAlleles.add(fatherAllele); possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); } } - for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) { - int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0)); - int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1)); + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { - //Add mother's alleles - phasedAlleles.add(phasedChildGenotype.get(0)); - if(motherAlleles.get(0) != phasedAlleles.get(0)) - phasedAlleles.add(motherAlleles.get(0)); + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); else - phasedAlleles.add(motherAlleles.get(1)); + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); - //Add father's alleles - phasedAlleles.add(phasedChildGenotype.get(1)); - if(fatherAlleles.get(0) != phasedAlleles.get(2)) - phasedAlleles.add(fatherAlleles.get(0)); + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); else - phasedAlleles.add(fatherAlleles.get(1)); + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); - //Add child's alleles - phasedAlleles.addAll(phasedChildGenotype); - return phasedAlleles; + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; } } //If this is reached then no phasing could be found - motherAlleles.addAll(fatherAlleles); - motherAlleles.addAll(childAlleles); - for(AlleleType allele : motherAlleles){ - if(allele == AlleleType.REF){ - phasedAlleles.add(AlleleType.UNPHASED_REF); - } - else if(allele == AlleleType.VAR){ - phasedAlleles.add(AlleleType.UNPHASED_VAR); - } - else{ - phasedAlleles.add(AlleleType.NO_CALL); - } - } - return phasedAlleles; + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Take care of cases where one or more family members are no call - if(child == Genotype.Type.NO_CALL){ - trioAlleles.addAll(phaseSingleIndividualAlleles(mother)); - trioAlleles.addAll(phaseSingleIndividualAlleles(father)); - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.add(AlleleType.NO_CALL); + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } - else if(mother == Genotype.Type.NO_CALL){ - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.add(AlleleType.NO_CALL); - if(father == Genotype.Type.NO_CALL){ - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.addAll(phaseSingleIndividualAlleles(child)); + else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } else - trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child)); + phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER); } - else if(father == Genotype.Type.NO_CALL){ - trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child)); - trioAlleles.add(2, AlleleType.NO_CALL); - trioAlleles.add(3, AlleleType.NO_CALL); + else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); } //Special case for Het/Het/Het as it is ambiguous else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ - trioAlleles.add(AlleleType.UNPHASED_REF); - trioAlleles.add(AlleleType.UNPHASED_VAR); - trioAlleles.add(AlleleType.UNPHASED_REF); - trioAlleles.add(AlleleType.UNPHASED_VAR); - trioAlleles.add(AlleleType.UNPHASED_REF); - trioAlleles.add(AlleleType.UNPHASED_VAR); + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } //All family members have genotypes and at least one of them is not Het else{ - trioAlleles = phaseFamilyAlleles(mother, father, child); + phaseFamilyAlleles(mother, father, child); } } public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){ - phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); return phasedGenotypes; } - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){ + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){ //Add the transmission probability Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); - genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); - boolean isPhased = true; + //Handle missing genotype + if(!phasedGenotype.isAvailable()) + return new Genotype(genotype.getSampleName(), null); + //Handle NoCall genotype + else if(phasedGenotype.isNoCall()) + return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles()); - List alleles = new ArrayList(2); + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); - //If unphased, return original genotype - for(AlleleType allele : phasedAlleles){ - if(allele == AlleleType.NO_CALL){ - return genotype; - } - //Otherwise add the appropriate allele - else if(allele == AlleleType.UNPHASED_REF){ - isPhased = false; - alleles.add(refAllele); - } - else if(allele == AlleleType.UNPHASED_VAR){ - isPhased = false; - alleles.add(altAllele); - } - else if(allele == AlleleType.REF){ - alleles.add(refAllele); - } - else if(allele == AlleleType.VAR){ - alleles.add(altAllele); - } - } + } - return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased); + //Compute the new Log10Error if the genotype is different from the original genotype + double negLog10Error; + if(genotype.getType() == phasedGenotype.getType()) + negLog10Error = genotype.getNegLog10PError(); + else + negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); + + return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); } @@ -432,63 +418,6 @@ public class PhaseByTransmission extends RodWalker, HashMa return 1; } - private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - - double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - - //Child is no call => No MV - if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) - return nonMVPrior; - //Add parents with genotypes for the evaluation - ArrayList parents = new ArrayList(); - if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) - parents.add(mother); - if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) - parents.add(father); - - //Both parents no calls => No MV - if (parents.isEmpty()) - return nonMVPrior; - - //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed - int parentsNumRefAlleles = 0; - int parentsNumAltAlleles = 0; - - for(Genotype.Type parent : parents){ - if(parent == Genotype.Type.HOM_REF){ - parentsNumRefAlleles++; - } - else if(parent == Genotype.Type.HET){ - parentsNumRefAlleles++; - parentsNumAltAlleles++; - } - else if(parent == Genotype.Type.HOM_VAR){ - parentsNumAltAlleles++; - } - } - - //Case Child is HomRef - if(child == Genotype.Type.HOM_REF){ - if(parentsNumRefAlleles == parents.size()) - return nonMVPrior; - else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles); - } - - //Case child is HomVar - if(child == Genotype.Type.HOM_VAR){ - if(parentsNumAltAlleles == parents.size()) - return nonMVPrior; - else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles); - } - - //Case child is Het - if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) - return nonMVPrior; - - //MV - return MENDELIAN_VIOLATION_PRIOR; - } - private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ int count = 0; if(motherOriginal!=motherNew) @@ -534,7 +463,7 @@ public class PhaseByTransmission extends RodWalker, HashMa for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ for(Map.Entry childGenotype : childLikelihoods.entrySet()){ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); - configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); norm += configurationLikelihood; configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); //Keep this combination if From 38ebf3141a13ebafcc43b69c3364e0066ba17ea9 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 12:30:04 +0200 Subject: [PATCH 07/20] - Now supports parent/child pairs - Sites with missing genotypes in pairs/trios are handled as follows: -- Missing child -> Homozygous parents are phased, no transmission probability is emitted -- Two individuals missing -> Phase if homozygous, no transmission probability is emitted -- One parent missing -> Phased / transmission probability emitted - Mutation prior set as argument --- .../walkers/phasing/PhaseByTransmission.java | 134 ++++++++++-------- 1 file changed, 77 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 62b7bfffa..d128ff40e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -40,13 +40,16 @@ public class PhaseByTransmission extends RodWalker, HashMa @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") private PrintStream mvFile = null; + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; + @Output protected VCFWriter vcfWriter = null; private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; - private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + public final double NO_TRANSMISSION_PROB = -1.0; private ArrayList trios = new ArrayList(); @@ -240,26 +243,26 @@ public class PhaseByTransmission extends RodWalker, HashMa } } - public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){ + public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); return phasedGenotypes; } - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){ + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + return genotype; //Add the transmission probability Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); - - //Handle missing genotype - if(!phasedGenotype.isAvailable()) - return new Genotype(genotype.getSampleName(), null); - //Handle NoCall genotype - else if(phasedGenotype.isNoCall()) - return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); ArrayList phasedAlleles = new ArrayList(2); for(Allele allele : phasedGenotype.getAlleles()){ @@ -324,8 +327,8 @@ public class PhaseByTransmission extends RodWalker, HashMa ArrayList parents; for(String familyID : families.keySet()){ family = families.get(familyID); - if(family.size()!=3){ - logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size())); + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size())); } else{ for(Sample familyMember : family){ @@ -334,7 +337,7 @@ public class PhaseByTransmission extends RodWalker, HashMa if(family.containsAll(parents)) this.trios.add(familyMember); else - logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID)); + logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID)); break; } } @@ -429,64 +432,84 @@ public class PhaseByTransmission extends RodWalker, HashMa return count; } + private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isAvailable()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.UNAVAILABLE,1.0); + return likelihoods; + } + else if(genotype.isNoCall()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.NO_CALL,1.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } + + private Genotype.Type getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return Genotype.Type.UNAVAILABLE; + return genotype.getType(); + } + private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - //For now, only consider trios with complete information - //TODO: Phasing of trios with missing information - if(!(mother.isCalled() && father.isCalled() && child.isCalled())) { - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); - return false; - } - //Get the PL - Map motherLikelihoods = mother.getLikelihoods().getAsMap(true); - Map fatherLikelihoods = father.getLikelihoods().getAsMap(true); - Map childLikelihoods = child.getLikelihoods().getAsMap(true); + Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother); + Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father); + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); //Prior vars double bestConfigurationLikelihood = 0.0; double norm = 0.0; boolean isMV = false; int bestConfigurationGenotypeDiffs=4; - Genotype.Type bestMotherGenotype = mother.getType(); - Genotype.Type bestFatherGenotype = father.getType(); - Genotype.Type bestChildGenotype = child.getType(); + Genotype.Type bestMotherGenotype = getTypeSafeNull(mother); + Genotype.Type bestFatherGenotype = getTypeSafeNull(father); + Genotype.Type bestChildGenotype = getTypeSafeNull(child); //Get the most likely combination - int mvCount; - double configurationLikelihood; - int configurationGenotypeDiffs; - for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ - for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ - for(Map.Entry childGenotype : childLikelihoods.entrySet()){ - mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); - configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); - norm += configurationLikelihood; - configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); - //Keep this combination if - //It has a better likelihood - //Or it has the same likelihood but requires less changes from original genotypes - if ((configurationLikelihood > bestConfigurationLikelihood) || - (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { - bestConfigurationLikelihood = configurationLikelihood; - bestMotherGenotype = motherGenotype.getKey(); - bestFatherGenotype = fatherGenotype.getKey(); - bestChildGenotype = childGenotype.getKey(); - isMV = mvCount>0; - bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + //Only check for most likely combination if at least a parent and the child have genotypes + if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){ + int mvCount; + double configurationLikelihood; + int configurationGenotypeDiffs; + for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ + for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if ((configurationLikelihood > bestConfigurationLikelihood) || + (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { + bestConfigurationLikelihood = configurationLikelihood; + bestMotherGenotype = motherGenotype.getKey(); + bestFatherGenotype = fatherGenotype.getKey(); + bestChildGenotype = childGenotype.getKey(); + isMV = mvCount>0; + bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + } } } } + + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } //Get the phased alleles for the genotype configuration TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype); //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes); + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); return isMV; } @@ -522,8 +545,8 @@ public class PhaseByTransmission extends RodWalker, HashMa Genotype father = vc.getGenotype(sample.getPaternalID()); Genotype child = vc.getGenotype(sample.getID()); - //Skip trios where any of the genotype is missing in the variant context - if(mother == null || father == null | child == null) + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) continue; ArrayList trioGenotypes = new ArrayList(3); @@ -545,11 +568,8 @@ public class PhaseByTransmission extends RodWalker, HashMa if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){ numHet++; - if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){ + if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()) numHetHetHet++; - }else if(!phasedMother.isPhased()){ - int x =9; - } } if(isMV){ From 62477a08104c3a158ccbb547ab5f347625676809 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 13:45:21 +0200 Subject: [PATCH 08/20] Added documentation and comments --- .../walkers/phasing/PhaseByTransmission.java | 127 ++++++++++++++---- 1 file changed, 103 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index d128ff40e..c4c6b69b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -22,15 +22,57 @@ import java.io.PrintStream; import java.util.*; /** - * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio, - * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype - * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents - * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the - * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to - * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally - * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or - * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to - * begin. + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases + * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. + * Ambiguous sites are: + *

    + *
  • Sites where all individuals are heterozygous
  • + *
  • Sites where there is a Mendelian violation
  • + *
+ * Missing genotypes are handled as follows: + *
    + *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • + *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • + *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
  • + *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • + *
+ * + *

Input

+ *

+ *

    + *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • + *
  • A PED pedigree file containing the description of the individuals relationships.
  • + *
+ *

+ * + *

Options

+ *

+ *

    + *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype + * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, + * and each individual genotype, depth, allelic depth and likelihoods.
  • + *
  • DeNovoPrior: Mutation prio; default is 1e-8
  • + *
+ *

+ * + *

Output

+ *

+ * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * 
+ * */ public class PhaseByTransmission extends RodWalker, HashMap> { @@ -73,7 +115,8 @@ public class PhaseByTransmission extends RodWalker, HashMa CHILD } - //Stores a trio-genotype + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. private class TrioPhase { //Create 2 fake alleles @@ -119,7 +162,8 @@ public class PhaseByTransmission extends RodWalker, HashMa trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ //Special case for Het/Het as it is ambiguous if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ @@ -155,6 +199,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } } + //Phases a family by transmission private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ Set> possiblePhasedChildGenotypes = new HashSet>(); @@ -209,7 +254,10 @@ public class PhaseByTransmission extends RodWalker, HashMa trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Take care of cases where one or more family members are no call @@ -225,10 +273,10 @@ public class PhaseByTransmission extends RodWalker, HashMa phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } else - phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER); + phasePairAlleles(father, child, FamilyMember.FATHER); } else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ - phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER); + phasePairAlleles(mother, child, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); } //Special case for Het/Het/Het as it is ambiguous @@ -243,11 +291,20 @@ public class PhaseByTransmission extends RodWalker, HashMa } } - public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); - return phasedGenotypes; } private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ @@ -290,7 +347,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Parse the familial relationship specification, and initialize VCF writer + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer */ public void initialize() { ArrayList rodNames = new ArrayList(); @@ -306,7 +363,7 @@ public class PhaseByTransmission extends RodWalker, HashMa Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); @@ -318,7 +375,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Select Trios only + * Select trios and parent/child pairs only */ private void setTrios(){ @@ -349,6 +406,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } + //Create the transmission matrices private void buildMatrices(){ mvCountMatrix = new EnumMap>>(Genotype.Type.class); transmissionMatrix = new EnumMap>>(Genotype.Type.class); @@ -366,6 +424,9 @@ public class PhaseByTransmission extends RodWalker, HashMa } } + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Child is no call => No MV @@ -421,6 +482,7 @@ public class PhaseByTransmission extends RodWalker, HashMa return 1; } + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ int count = 0; if(motherOriginal!=motherNew) @@ -432,6 +494,8 @@ public class PhaseByTransmission extends RodWalker, HashMa return count; } + //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single + //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0 private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ if(genotype == null || !genotype.isAvailable()){ EnumMap likelihoods = new EnumMap(Genotype.Type.class); @@ -446,6 +510,7 @@ public class PhaseByTransmission extends RodWalker, HashMa return genotype.getLikelihoods().getAsMap(true); } + //Returns the Genotype.Type; returns UNVAILABLE if given null private Genotype.Type getTypeSafeNull(Genotype genotype){ if(genotype == null) return Genotype.Type.UNAVAILABLE; @@ -453,6 +518,16 @@ public class PhaseByTransmission extends RodWalker, HashMa } + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. + * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { //Get the PL @@ -600,9 +675,9 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Provide an initial value for reduce computations. + * Initializes the reporting counters. * - * @return Initial value of reduce. + * @return All counters initialized to 0 */ @Override public HashMap reduceInit() { @@ -617,10 +692,10 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Reduces a single map with the accumulator provided as the ReduceType. + * Adds the value of the site phased to the reporting counters. * - * @param value result of the map. - * @param sum accumulator for the reduce. + * @param value Site values + * @param sum accumulator for the reporting counters * @return accumulator with result of the map taken into account. */ @Override @@ -635,6 +710,10 @@ public class PhaseByTransmission extends RodWalker, HashMa } + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. + */ @Override public void onTraversalDone(HashMap result) { logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); From 62cff266d4e20aa2a48f88c47f282d9a6bc25287 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 26 Oct 2011 14:40:04 +0200 Subject: [PATCH 09/20] GQ calculation corrected for most likely genotype --- .../utils/variantcontext/GenotypeLikelihoods.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index b83ffa2fd..47c93bb1b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -124,10 +124,15 @@ public class GenotypeLikelihoods { } - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); + //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) + qual = likelihoods.get(genotype) - qual; + //Quality of other genotypes 1-P(G) + if (qual < 0) { + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); + } return qual; } From 81b163ff4d6a8203cb4cb155a92a751ee1b2349d Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 26 Oct 2011 14:49:12 +0200 Subject: [PATCH 10/20] Indentation --- .../walkers/phasing/PhaseByTransmission.java | 370 +++++++++--------- 1 file changed, 185 insertions(+), 185 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index c4c6b69b7..956cf7c89 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -129,220 +129,220 @@ public class PhaseByTransmission extends RodWalker, HashMa private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); private ArrayList getAlleles(Genotype.Type genotype){ - ArrayList alleles = new ArrayList(2); - if(genotype == Genotype.Type.HOM_REF){ - alleles.add(REF); - alleles.add(REF); - } - else if(genotype == Genotype.Type.HET){ - alleles.add(REF); - alleles.add(VAR); - } - else if(genotype == Genotype.Type.HOM_VAR){ - alleles.add(VAR); - alleles.add(VAR); - } - else if(genotype == Genotype.Type.NO_CALL){ - alleles.add(NO_CALL); - alleles.add(NO_CALL); - } - else{ - return null; - } - return alleles; - } - - //Create a new Genotype based on information from a single individual - //Homozygous genotypes will be set as phased, heterozygous won't be - private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ - if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ - trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - } - else - trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - } - - //Find the phase for a parent/child pair - private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ - - //Special case for Het/Het as it is ambiguous - if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.NO_CALL){ + alleles.add(NO_CALL); + alleles.add(NO_CALL); + } + else{ + return null; + } + return alleles; } - ArrayList parentAlleles = getAlleles(parentGenotype); - ArrayList childAlleles = getAlleles(childGenotype); - ArrayList parentPhasedAlleles = new ArrayList(2); - ArrayList childPhasedAlleles = new ArrayList(2); - - //If there is a possible phasing between the mother and child => phase - int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); - if(childTransmittedAlleleIndex > -1){ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - childPhasedAlleles.add(childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + else + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - parentPhasedAlleles.add(parentAlleles.get(1)); - parentPhasedAlleles.add(parentAlleles.get(0)); - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - childPhasedAlleles.add(childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - } - //This is a Mendelian Violation => Do not phase - else{ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - } - } - //Phases a family by transmission - private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ - Set> possiblePhasedChildGenotypes = new HashSet>(); - ArrayList motherAlleles = getAlleles(mother); - ArrayList fatherAlleles = getAlleles(father); - ArrayList childAlleles = getAlleles(child); + //Special case for Het/Het as it is ambiguous + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + } - //Build all possible child genotypes for the given parent's genotypes - for (Allele momAllele : motherAlleles) { - for (Allele fatherAllele : fatherAlleles) { - ArrayList possiblePhasedChildAlleles = new ArrayList(2); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(fatherAllele); - possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the mother and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } } - for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { - int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); - int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); - //If a possible combination has been found, create the genotypes - if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { - //Create mother's genotype - ArrayList motherPhasedAlleles = new ArrayList(2); - motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); - if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) - motherPhasedAlleles.add(motherAlleles.get(0)); - else - motherPhasedAlleles.add(motherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + //Phases a family by transmission + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - //Create father's genotype - ArrayList fatherPhasedAlleles = new ArrayList(2); - fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); - if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) - fatherPhasedAlleles.add(fatherAlleles.get(0)); - else - fatherPhasedAlleles.add(fatherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); - //Create child's genotype - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); - - //Once a phased combination is found; exit - return; + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } } + + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; + } + } + + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - //If this is reached then no phasing could be found - trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - } + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. - If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair - or single individual. - */ - public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - - //Take care of cases where one or more family members are no call - if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + //Take care of cases where one or more family members are no call + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } - else - phasePairAlleles(father, child, FamilyMember.FATHER); + else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } } - else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ - phasePairAlleles(mother, child, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); } - //Special case for Het/Het/Het as it is ambiguous - else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - //All family members have genotypes and at least one of them is not Het - else{ - phaseFamilyAlleles(mother, father, child); - } - } - /** - * Applies the trio genotype combination to the given trio. - * @param ref: Reference allele - * @param alt: Alternate allele - * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination - * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination - * @param childGenotype: Genotype of the child to phase using this trio genotype combination - * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) - * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child - */ - public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ - phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); - } + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + return genotype; - //Handle null, missing and unavailable genotypes - //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable - //genotype so it is safe to return the original genotype in this case. - if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) - return genotype; + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); - //Add the transmission probability - Map genotypeAttributes = new HashMap(); - genotypeAttributes.putAll(genotype.getAttributes()); - if(transmissionProb>NO_TRANSMISSION_PROB) - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); - ArrayList phasedAlleles = new ArrayList(2); - for(Allele allele : phasedGenotype.getAlleles()){ - if(allele.isReference()) - phasedAlleles.add(refAllele); - else if(allele.isNonReference()) - phasedAlleles.add(altAllele); - //At this point there should not be any other alleles left + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double negLog10Error; + if(genotype.getType() == phasedGenotype.getType()) + negLog10Error = genotype.getNegLog10PError(); else - throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); + negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); + return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); } - //Compute the new Log10Error if the genotype is different from the original genotype - double negLog10Error; - if(genotype.getType() == phasedGenotype.getType()) - negLog10Error = genotype.getNegLog10PError(); - else - negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); - - return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); - } - } From 1f044faedd01317b71c3abf91ac935beb6581223 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 26 Oct 2011 19:57:09 +0200 Subject: [PATCH 11/20] - Genotype assignment in case of equally likeli combination is now random - Genotype combinations with 0 confidence are now left unphased --- .../walkers/phasing/PhaseByTransmission.java | 65 +++++++++++++------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 956cf7c89..f5e9ce6ea 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -109,6 +109,9 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_HET_HET_HET = 4; private final Byte NUM_VIOLATIONS = 5; + //Random number generator + private Random rand = new Random(); + private enum FamilyMember { MOTHER, FATHER, @@ -309,17 +312,22 @@ public class PhaseByTransmission extends RodWalker, HashMa private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB) + phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); + //Handle null, missing and unavailable genotypes //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable //genotype so it is safe to return the original genotype in this case. - if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) return genotype; //Add the transmission probability Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); if(transmissionProb>NO_TRANSMISSION_PROB) - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); ArrayList phasedAlleles = new ArrayList(2); for(Allele allele : phasedGenotype.getAlleles()){ @@ -538,36 +546,46 @@ public class PhaseByTransmission extends RodWalker, HashMa //Prior vars double bestConfigurationLikelihood = 0.0; double norm = 0.0; - boolean isMV = false; - int bestConfigurationGenotypeDiffs=4; - Genotype.Type bestMotherGenotype = getTypeSafeNull(mother); - Genotype.Type bestFatherGenotype = getTypeSafeNull(father); - Genotype.Type bestChildGenotype = getTypeSafeNull(child); + int configuration_index =0; + ArrayList isMV = new ArrayList(); + isMV.add(false); + ArrayList bestMotherGenotype = new ArrayList(); + bestMotherGenotype.add(getTypeSafeNull(mother)); + ArrayList bestFatherGenotype = new ArrayList(); + bestFatherGenotype.add(getTypeSafeNull(father)); + ArrayList bestChildGenotype = new ArrayList(); + bestChildGenotype.add(getTypeSafeNull(child)); //Get the most likely combination //Only check for most likely combination if at least a parent and the child have genotypes if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){ int mvCount; double configurationLikelihood; - int configurationGenotypeDiffs; for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ for(Map.Entry childGenotype : childLikelihoods.entrySet()){ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); norm += configurationLikelihood; - configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); //Keep this combination if //It has a better likelihood //Or it has the same likelihood but requires less changes from original genotypes - if ((configurationLikelihood > bestConfigurationLikelihood) || - (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { + if (configurationLikelihood > bestConfigurationLikelihood){ bestConfigurationLikelihood = configurationLikelihood; - bestMotherGenotype = motherGenotype.getKey(); - bestFatherGenotype = fatherGenotype.getKey(); - bestChildGenotype = childGenotype.getKey(); - isMV = mvCount>0; - bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + isMV.clear(); + isMV.add(mvCount>0); + bestMotherGenotype.clear(); + bestMotherGenotype.add(motherGenotype.getKey()); + bestFatherGenotype.clear(); + bestFatherGenotype.add(fatherGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestMotherGenotype.add(motherGenotype.getKey()); + bestFatherGenotype.add(fatherGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + isMV.add(mvCount>0); } } } @@ -575,17 +593,22 @@ public class PhaseByTransmission extends RodWalker, HashMa //normalize the best configuration probability bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + + //In case of multiple equally likely combinations, take a random one + if(bestMotherGenotype.size()>1){ + configuration_index = rand.nextInt(bestMotherGenotype.size()-1); + } + } else{ bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } - //Get the phased alleles for the genotype configuration - TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype); + TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return isMV; + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return isMV.get(configuration_index); } From b91a9c4711d60b361a226a517a97e68217fdd5cf Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 2 Nov 2011 08:04:01 +0100 Subject: [PATCH 12/20] - Fixed parent/child pairs handling (was crashing before) - Added parent/child pair reporting --- .../walkers/phasing/PhaseByTransmission.java | 151 ++++++++++++------ 1 file changed, 101 insertions(+), 50 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index f5e9ce6ea..5b2024f96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -105,9 +105,13 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; - private final Byte NUM_HET = 3; - private final Byte NUM_HET_HET_HET = 4; - private final Byte NUM_VIOLATIONS = 5; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; //Random number generator private Random rand = new Random(); @@ -172,6 +176,7 @@ public class PhaseByTransmission extends RodWalker, HashMa if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + return; } ArrayList parentAlleles = getAlleles(parentGenotype); @@ -612,6 +617,48 @@ public class PhaseByTransmission extends RodWalker, HashMa } + + private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){ + + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); + + else if(isMV) + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1); + } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); + } + + } + + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){ + + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + + else if(isMV) + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + + } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); + } + } + /** * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype * @@ -623,13 +670,17 @@ public class PhaseByTransmission extends RodWalker, HashMa @Override public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - //Local cars to avoid lookups on increment - int numTrioGenotypesCalled = 0; - int numTrioGenotypesNoCall = 0; - int numTrioGenotypesPhased = 0; - int numHet = 0 ; - int numHetHetHet = 0; - int numMVs = 0; + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); @@ -654,30 +705,26 @@ public class PhaseByTransmission extends RodWalker, HashMa Genotype phasedFather = trioGenotypes.get(1); Genotype phasedChild = trioGenotypes.get(2); - genotypeMap.put(phasedMother.getSampleName(), phasedMother); - genotypeMap.put(phasedFather.getSampleName(), phasedFather); - genotypeMap.put(phasedChild.getSampleName(), phasedChild); - - //Increment metrics counters - if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){ - numTrioGenotypesCalled++; - if(phasedMother.isPhased()) - numTrioGenotypesPhased++; - - if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){ - numHet++; - if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()) - numHetHetHet++; + //Fill the genotype map with the new genotypes and increment metrics counters + genotypeMap.put(phasedChild.getSampleName(),phasedChild); + if(mother != null){ + genotypeMap.put(phasedMother.getSampleName(), phasedMother); + if(father != null){ + genotypeMap.put(phasedFather.getSampleName(), phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters); } - - if(isMV){ - numMVs++; - if(mvFile != null) - mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); - } - }else{ - numTrioGenotypesNoCall++; + else + updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters); } + else{ + genotypeMap.put(phasedFather.getSampleName(),phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters); + } + + //Report violation if set so + //TODO: ADAPT FOR PAIRS TOO!! + if(isMV && mvFile != null) + mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); } @@ -686,14 +733,6 @@ public class PhaseByTransmission extends RodWalker, HashMa vcfWriter.add(newvc); } - - HashMap metricsCounters = new HashMap(5); - metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled); - metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall); - metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased); - metricsCounters.put(NUM_HET,numHet); - metricsCounters.put(NUM_HET_HET_HET,numHetHetHet); - metricsCounters.put(NUM_VIOLATIONS,numMVs); return metricsCounters; } @@ -704,13 +743,17 @@ public class PhaseByTransmission extends RodWalker, HashMa */ @Override public HashMap reduceInit() { - HashMap metricsCounters = new HashMap(5); + HashMap metricsCounters = new HashMap(10); metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_HET,0); - metricsCounters.put(NUM_HET_HET_HET,0); - metricsCounters.put(NUM_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); return metricsCounters; } @@ -726,9 +769,13 @@ public class PhaseByTransmission extends RodWalker, HashMa sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); - sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET)); - sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET)); - sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); return sum; } @@ -742,8 +789,12 @@ public class PhaseByTransmission extends RodWalker, HashMa logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); - logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET)); - logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET)); - logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); } } From 119ca7d742371423060bcc7150f908e1cb04de8a Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 2 Nov 2011 08:22:33 +0100 Subject: [PATCH 13/20] Fixed a bug in parent/child pairs reporting causing a crash in case the -mvf option was used and mother was not provided --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 5b2024f96..806a84e9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -681,6 +681,7 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + String mvfLine = ""; if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); @@ -712,19 +713,23 @@ public class PhaseByTransmission extends RodWalker, HashMa if(father != null){ genotypeMap.put(phasedFather.getSampleName(), phasedFather); updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } - else + else{ updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } } else{ genotypeMap.put(phasedFather.getSampleName(),phasedFather); updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } //Report violation if set so //TODO: ADAPT FOR PAIRS TOO!! if(isMV && mvFile != null) - mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); + mvFile.println(mvfLine); } From 19ad5b635ad46f119ed1beccccf7bb2bc6a18aef Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 2 Nov 2011 18:35:31 +0100 Subject: [PATCH 14/20] - Calculation of parent/child pairs corrected - Separated the reporting of single and double mendelian violations in trios --- .../walkers/phasing/PhaseByTransmission.java | 177 ++++++++++++------ 1 file changed, 120 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 806a84e9d..3c39b58d3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -107,6 +107,7 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; private final Byte NUM_TRIO_HET_HET_HET = 3; private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; @@ -507,17 +508,14 @@ public class PhaseByTransmission extends RodWalker, HashMa return count; } - //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single - //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0 + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isAvailable()){ + if(genotype == null || !genotype.isCalled()){ EnumMap likelihoods = new EnumMap(Genotype.Type.class); - likelihoods.put(Genotype.Type.UNAVAILABLE,1.0); - return likelihoods; - } - else if(genotype.isNoCall()){ - EnumMap likelihoods = new EnumMap(Genotype.Type.class); - likelihoods.put(Genotype.Type.NO_CALL,1.0); + likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0); + likelihoods.put(Genotype.Type.HET,1.0/3.0); + likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0); return likelihoods; } return genotype.getLikelihoods().getAsMap(true); @@ -541,57 +539,113 @@ public class PhaseByTransmission extends RodWalker, HashMa * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child * @return */ - private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - //Get the PL - Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother); - Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father); + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + Genotype.Type pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; + } + } Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); //Prior vars double bestConfigurationLikelihood = 0.0; double norm = 0.0; int configuration_index =0; - ArrayList isMV = new ArrayList(); - isMV.add(false); - ArrayList bestMotherGenotype = new ArrayList(); - bestMotherGenotype.add(getTypeSafeNull(mother)); - ArrayList bestFatherGenotype = new ArrayList(); - bestFatherGenotype.add(getTypeSafeNull(father)); + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); ArrayList bestChildGenotype = new ArrayList(); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); bestChildGenotype.add(getTypeSafeNull(child)); //Get the most likely combination //Only check for most likely combination if at least a parent and the child have genotypes - if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){ + if(child.isCalled() && parentsCalled > 0){ int mvCount; - double configurationLikelihood; - for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ - for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ - for(Map.Entry childGenotype : childLikelihoods.entrySet()){ - mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); - configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ norm += configurationLikelihood; //Keep this combination if //It has a better likelihood //Or it has the same likelihood but requires less changes from original genotypes if (configurationLikelihood > bestConfigurationLikelihood){ bestConfigurationLikelihood = configurationLikelihood; - isMV.clear(); - isMV.add(mvCount>0); - bestMotherGenotype.clear(); - bestMotherGenotype.add(motherGenotype.getKey()); - bestFatherGenotype.clear(); - bestFatherGenotype.add(fatherGenotype.getKey()); + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); } else if(configurationLikelihood == bestConfigurationLikelihood) { - bestMotherGenotype.add(motherGenotype.getKey()); - bestFatherGenotype.add(fatherGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); bestChildGenotype.add(childGenotype.getKey()); - isMV.add(mvCount>0); + bestMVCount.add(cumulativeMVCount/3); } + configurationLikelihood = 0; } } } @@ -600,8 +654,8 @@ public class PhaseByTransmission extends RodWalker, HashMa bestConfigurationLikelihood = bestConfigurationLikelihood / norm; //In case of multiple equally likely combinations, take a random one - if(bestMotherGenotype.size()>1){ - configuration_index = rand.nextInt(bestMotherGenotype.size()-1); + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); } } @@ -609,16 +663,20 @@ public class PhaseByTransmission extends RodWalker, HashMa bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } - TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); //Return the phased genotypes phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return isMV.get(configuration_index); + return bestMVCount.get(configuration_index); } - private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){ + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ //Increment metrics counters if(parent.isCalled() && child.isCalled()){ @@ -626,11 +684,9 @@ public class PhaseByTransmission extends RodWalker, HashMa if(parent.isPhased()) counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); if(parent.isHet() && child.isHet()) counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); - - else if(isMV) - counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1); } }else{ counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); @@ -638,7 +694,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } - private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){ + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ //Increment metrics counters if(mother.isCalled() && father.isCalled() && child.isCalled()){ @@ -647,11 +703,14 @@ public class PhaseByTransmission extends RodWalker, HashMa counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); else{ - if(mother.isHet() && father.isHet() && child.isHet()) - counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); - - else if(isMV) - counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); } }else{ @@ -681,14 +740,15 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - String mvfLine = ""; + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + String mvfLine; if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); Map genotypeMap = vc.getGenotypes(); - boolean isMV; + int mvCount; for (Sample sample : trios) { Genotype mother = vc.getGenotype(sample.getMaternalID()); @@ -700,7 +760,7 @@ public class PhaseByTransmission extends RodWalker, HashMa continue; ArrayList trioGenotypes = new ArrayList(3); - isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); + mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); @@ -712,23 +772,23 @@ public class PhaseByTransmission extends RodWalker, HashMa genotypeMap.put(phasedMother.getSampleName(), phasedMother); if(father != null){ genotypeMap.put(phasedFather.getSampleName(), phasedFather); - updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } else{ - updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters); + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } } else{ genotypeMap.put(phasedFather.getSampleName(),phasedFather); - updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } //Report violation if set so //TODO: ADAPT FOR PAIRS TOO!! - if(isMV && mvFile != null) + if(mvCount>0 && mvFile != null) mvFile.println(mvfLine); } @@ -759,6 +819,7 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); return metricsCounters; } @@ -781,6 +842,7 @@ public class PhaseByTransmission extends RodWalker, HashMa sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); return sum; } @@ -795,7 +857,8 @@ public class PhaseByTransmission extends RodWalker, HashMa logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); - logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); From 893787de535fde77b56d02e5c006400e021f76c1 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 3 Nov 2011 13:04:11 +0100 Subject: [PATCH 15/20] Functions getAsMap and getNegLog10GQ now handle missing genotype case. --- .../sting/utils/variantcontext/GenotypeLikelihoods.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 47c93bb1b..8c8e4f257 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -101,9 +101,12 @@ public class GenotypeLikelihoods { } //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values + //Returns null in case of missing likelihoods public EnumMap getAsMap(boolean normalizeFromLog10){ //Make sure that the log10likelihoods are set double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + if(likelihoods == null) + return null; EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); @@ -112,10 +115,13 @@ public class GenotypeLikelihoods { } //Return the neg log10 Genotype Quality (GQ) for the given genotype + //Returns Double.NEGATIVE_INFINITY in case of missing genotype public double getNegLog10GQ(Genotype.Type genotype){ double qual = Double.NEGATIVE_INFINITY; EnumMap likelihoods = getAsMap(false); + if(likelihoods == null) + return qual; for(Map.Entry likelihood : likelihoods.entrySet()){ if(likelihood.getKey() == genotype) continue; From 385a6abec144fc72c726f88216cdcd9c3a4fa435 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 3 Nov 2011 13:04:53 +0100 Subject: [PATCH 16/20] Fixed a bug that wrongly swapped the mother and father genotypes in case the child genotype missing. --- .../gatk/walkers/phasing/PhaseByTransmission.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 3c39b58d3..244527212 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -547,10 +547,15 @@ public class PhaseByTransmission extends RodWalker, HashMa int parentsCalled = 0; Map firstParentLikelihoods; Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); Genotype.Type pairSecondParentGenotype = null; if(mother == null || !mother.isCalled()){ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); if(father != null && father.isCalled()) parentsCalled = 1; @@ -558,6 +563,8 @@ public class PhaseByTransmission extends RodWalker, HashMa else{ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); if(father == null || !father.isCalled()){ parentsCalled = 1; pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); @@ -566,6 +573,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } } Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); //Prior vars double bestConfigurationLikelihood = 0.0; @@ -574,13 +582,6 @@ public class PhaseByTransmission extends RodWalker, HashMa ArrayList bestMVCount = new ArrayList(); bestMVCount.add(0); - ArrayList bestFirstParentGenotype = new ArrayList(); - ArrayList bestSecondParentGenotype = new ArrayList(); - ArrayList bestChildGenotype = new ArrayList(); - bestFirstParentGenotype.add(getTypeSafeNull(mother)); - bestSecondParentGenotype.add(getTypeSafeNull(father)); - bestChildGenotype.add(getTypeSafeNull(child)); - //Get the most likely combination //Only check for most likely combination if at least a parent and the child have genotypes if(child.isCalled() && parentsCalled > 0){ From 571c724cfdc378dac05573146f64e3da7e6424ec Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Tue, 8 Nov 2011 15:15:51 +0100 Subject: [PATCH 17/20] Added reporting of the number of genotypes updated. --- .../walkers/phasing/PhaseByTransmission.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 244527212..6394e0e24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -113,6 +113,7 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; private final Byte NUM_PAIR_HET_HET = 8; private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte NUM_GENOTYPES_MODIFIED = 11; //Random number generator private Random rand = new Random(); @@ -742,6 +743,8 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + String mvfLine; if (tracker != null) { @@ -775,15 +778,21 @@ public class PhaseByTransmission extends RodWalker, HashMa genotypeMap.put(phasedFather.getSampleName(), phasedFather); updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); } else{ updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } } else{ genotypeMap.put(phasedFather.getSampleName(),phasedFather); updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } @@ -820,7 +829,9 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + return metricsCounters; } @@ -844,6 +855,8 @@ public class PhaseByTransmission extends RodWalker, HashMa sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + return sum; } @@ -865,5 +878,7 @@ public class PhaseByTransmission extends RodWalker, HashMa logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + } } From 34acf8b9789a263b991b65477e28b8fe3f25881d Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 14 Nov 2011 10:47:02 +0100 Subject: [PATCH 18/20] Added Unit tests for new methods in GenotypeLikelihoods --- .../GenotypeLikelihoodsUnitTest.java | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 9243588ab..f3d0dedcd 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; + /** * Basic unit test for Genotype likelihoods objects @@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest { gl.getAsVector(); } + @Test + public void testGetAsMap(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(v); + //Log scale + EnumMap glMap = gl.getAsMap(false); + Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Linear scale + glMap = gl.getAsMap(true); + double [] vl = MathUtils.normalizeFromLog10(v); + Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + glMap = gl.getAsMap(false); + Assert.assertNull(glMap); + + } + + @Test + public void testGetNegLog10GQ(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); + + //GQ for the best guess genotype + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),3.9); + + double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); + + //GQ for the other genotypes + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); + + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { From 6881d4800c3f782255152f63d453f12451a96509 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 14 Nov 2011 10:47:51 +0100 Subject: [PATCH 19/20] Added Integration tests for Phasing by Transmission --- .../PhaseByTransmissionIntegrationTest.java | 122 +++++++++++++++++- 1 file changed, 115 insertions(+), 7 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index c663c1dd7..2cd76e7a5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -6,23 +6,131 @@ import org.testng.annotations.Test; import java.util.Arrays; public class PhaseByTransmissionIntegrationTest extends WalkerTest { - private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission"; - private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; + private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; + private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; + private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; + private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; + private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; + private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; + //Tests using PbT on all genotypes with default parameters + //And all reporting options @Test - public void testBasicFunctionality() { + public void testTrueNegativeMV() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", "-NO_HEADER", "-R " + b37KGReference, - "--variant " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", + "--variant " + TNTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14") + ); + executeTest("testTrueNegativeMV", spec); + } + + @Test + public void testTruePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + TPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556") + ); + executeTest("testTruePositiveMV", spec); + } + + @Test + public void testFalsePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") + ); + executeTest("testFalsePositiveMV", spec); + } + + @Test + public void testSpecialCases() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + SpecialTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3") + ); + executeTest("testSpecialCases", spec); + } + + //Test using a different prior + //Here the FP file is used but as the prior is lowered, 3 turn to TP + @Test + public void testPriorOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-prior 1e-4", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") + ); + executeTest("testPriorOption", spec); + } + + //Test when running without MV reporting option + //This is the exact same test file as FP but should not generate a .mvf file + @Test + public void testMVFileOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", "-o %s" ), 1, - Arrays.asList("") + Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") ); - executeTest("testBasicFunctionality", spec); + executeTest("testMVFileOption", spec); } + } From 7d77fc51f55948510e9d230864dbbe78e1f791b2 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 16 Nov 2011 03:32:43 -0500 Subject: [PATCH 20/20] Corrected bug causing PhaseByTransmission to crash in case of new Genotype.Type --- .../walkers/phasing/PhaseByTransmission.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 6394e0e24..847165e3e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -151,16 +151,16 @@ public class PhaseByTransmission extends RodWalker, HashMa alleles.add(VAR); alleles.add(VAR); } - else if(genotype == Genotype.Type.NO_CALL){ - alleles.add(NO_CALL); - alleles.add(NO_CALL); - } else{ return null; } return alleles; } + private boolean isPhasable(Genotype.Type genotype){ + return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR; + } + //Create a new Genotype based on information from a single individual //Homozygous genotypes will be set as phased, heterozygous won't be private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ @@ -271,21 +271,21 @@ public class PhaseByTransmission extends RodWalker, HashMa public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Take care of cases where one or more family members are no call - if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ + if(!isPhasable(child)){ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } - else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ + else if(!isPhasable(mother)){ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + if(!isPhasable(father)){ phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } else phasePairAlleles(father, child, FamilyMember.FATHER); } - else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + else if(!isPhasable(father)){ phasePairAlleles(mother, child, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); } @@ -327,7 +327,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable //genotype so it is safe to return the original genotype in this case. //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. - if(phredScoreTransmission ==0 || genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) return genotype; //Add the transmission probability