From 76dd816e709ef230231173b43076c4a423577a4a Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 12:47:27 +0200 Subject: [PATCH 01/27] Added getParents() -> returns an arrayList containing the sample's parent(s) if available --- .../broadinstitute/sting/gatk/samples/Sample.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java index b39fdd79d..a14d999ea 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples; import org.broadinstitute.sting.utils.exceptions.UserException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Map; @@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial return infoDB.getSample(paternalID); } + public ArrayList getParents(){ + ArrayList parents = new ArrayList(2); + Sample parent = getMother(); + if(parent != null) + parents.add(parent); + parent = getFather(); + if(parent != null) + parents.add(parent); + return parents; + } + /** * Get gender of the sample * @return property of key "gender" - must be of type Gender From ef6a6fdfe499809fc3fd6282a460c2fd542963c5 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 12:49:18 +0200 Subject: [PATCH 02/27] Added getAsMap -> returns the likelihoods as an EnumMap with Genotypes as keys and likelihoods as values. --- .../utils/variantcontext/GenotypeLikelihoods.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index dba16cf86..292bc17cd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,8 +25,12 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; +import java.util.Map; + public class GenotypeLikelihoods { public static final boolean CAP_PLS = false; public static final int PL_CAP = 255; @@ -94,6 +98,16 @@ public class GenotypeLikelihoods { return likelihoodsAsString_PLs; } + public EnumMap getAsMap(boolean normalizeFromLog10){ + //Make sure that the log10likelihoods are set + double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); + likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); + likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]); + return likelihoodsMap; + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); From 1c61a573296f6df9a3b2af20255c0deff1ece565 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 13:06:44 +0200 Subject: [PATCH 03/27] Original rewrite of PhaseByTransmission: - Adapted to get the trio information from the SampleDB (i.e. from Pedigree file (ped)) => Multiple trios can be passed as argument - Mendelian violations and trio phasing possibilities are pre-calculated and stored in Maps. => Runtime is ~3x faster - Genotype combinations possible only given two MVs are now given a squared MV prior (e.g. 0/0+0/0=>1/1 is given 10^-16 prior if the MV prior is 10^-8) - Corrected bug: In case the best genotype combination is Het/Het/Het, the genotypes are now set appropriately (before original genotypes were left even if they weren't Het/Het/Het) - Basic reporting added: -- mvf argument let the user specify a file to report remaining MVs -- When the walker ends, some basic stats about the genotype reconfiguration and phasing are output Known problems: - GQ is not recalculated even if the genotype changes Possible improvements: - Phase partially typed trios - Use standard Allele/Genotype Classes for the storage of the pre-calculated phase --- .../walkers/phasing/PhaseByTransmission.java | 781 +++++++++++++----- 1 file changed, 569 insertions(+), 212 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 3eedc2a28..12b541526 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -7,18 +7,18 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.io.File; -import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; /** @@ -32,13 +32,13 @@ import java.util.*; * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to * begin. */ -public class PhaseByTransmission extends RodWalker { +public class PhaseByTransmission extends RodWalker, HashMap> { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); - @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.") - public ArrayList familySpecs = null; + @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") + private PrintStream mvFile = null; @Output protected VCFWriter vcfWriter = null; @@ -48,239 +48,523 @@ public class PhaseByTransmission extends RodWalker { private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; - private class Trio { - private String mother; - private String father; - private String child; + private ArrayList trios = new ArrayList(); - public Trio(String mother, String father, String child) { - this.mother = mother; - this.father = father; - this.child = child; - } + //Matrix of priors for all genotype combinations + private EnumMap>> mvCountMatrix; - public Trio(String familySpec) { - String[] pieces = familySpec.split("[\\+\\=]"); + //Matrix of allele transmission + private EnumMap>> transmissionMatrix; - this.mother = pieces[0]; - this.father = pieces[1]; - this.child = pieces[2]; - } + //Metrics counters hashkeys + private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; + private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; + private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; + private final Byte NUM_HET = 3; + private final Byte NUM_HET_HET_HET = 4; + private final Byte NUM_VIOLATIONS = 5; - public String getMother() { return mother; } - public String getFather() { return father; } - public String getChild() { return child; } + private enum AlleleType { + NO_CALL, + REF, + VAR, + UNPHASED_REF, + UNPHASED_VAR } - private ArrayList trios = new ArrayList(); + //Stores a trio-genotype + private class TrioPhase { - public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { - if (familySpecs != null) { - // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our - // spec list set, and treat the entries as if they had been specified on the command line. - ArrayList specs = new ArrayList(); - for (String familySpec : familySpecs) { - File specFile = new File(familySpec); + private ArrayList trioAlleles = new ArrayList(6); - try { - XReadLines reader = new XReadLines(specFile); + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(AlleleType.REF); + alleles.add(AlleleType.REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(AlleleType.REF); + alleles.add(AlleleType.VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(AlleleType.VAR); + alleles.add(AlleleType.VAR); + } + else{ + alleles.add(AlleleType.NO_CALL); + alleles.add(AlleleType.NO_CALL); + } + return alleles; + } - List lines = reader.readLines(); - for (String line : lines) { - specs.add(new Trio(line)); - } - } catch (FileNotFoundException e) { - specs.add(new Trio(familySpec)); // not a file, so must be a family spec - } - } + private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){ + if(genotype == Genotype.Type.HET){ + ArrayList phasedAlleles = new ArrayList(2); + phasedAlleles.add(AlleleType.UNPHASED_REF); + phasedAlleles.add(AlleleType.UNPHASED_VAR); + return phasedAlleles; + } + else + return getAlleles(genotype); + } - return specs; + private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){ + ArrayList phasedAlleles = new ArrayList(4); + //Special case for Het/Het as it is ambiguous + if(parent == Genotype.Type.HET && child == Genotype.Type.HET){ + phasedAlleles.add(AlleleType.UNPHASED_REF); + phasedAlleles.add(AlleleType.UNPHASED_VAR); + phasedAlleles.add(AlleleType.UNPHASED_REF); + phasedAlleles.add(AlleleType.UNPHASED_VAR); } - return new ArrayList(); + ArrayList parentAlleles = getAlleles(parent); + ArrayList childAlleles = getAlleles(child); + + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + phasedAlleles.add(parentAlleles.get(0)); + phasedAlleles.add(parentAlleles.get(1)); + phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + phasedAlleles.add(childAlleles.get(0)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + phasedAlleles.add(parentAlleles.get(1)); + phasedAlleles.add(parentAlleles.get(0)); + phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + phasedAlleles.add(childAlleles.get(0)); + } + else{ + parentAlleles.addAll(childAlleles); + for(AlleleType allele : parentAlleles){ + if(allele == AlleleType.REF){ + phasedAlleles.add(AlleleType.UNPHASED_REF); + } + else if(allele == AlleleType.VAR){ + phasedAlleles.add(AlleleType.UNPHASED_VAR); + } + else{ + phasedAlleles.add(AlleleType.NO_CALL); + } + } + } + + return phasedAlleles; + } + + private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + ArrayList phasedAlleles = new ArrayList(6); + + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); + + //Build all possible child genotypes for the given parent's genotypes + for (AlleleType momAllele : motherAlleles) { + for (AlleleType fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } + } + + for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) { + int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0)); + int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1)); + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Add mother's alleles + phasedAlleles.add(phasedChildGenotype.get(0)); + if(motherAlleles.get(0) != phasedAlleles.get(0)) + phasedAlleles.add(motherAlleles.get(0)); + else + phasedAlleles.add(motherAlleles.get(1)); + + //Add father's alleles + phasedAlleles.add(phasedChildGenotype.get(1)); + if(fatherAlleles.get(0) != phasedAlleles.get(2)) + phasedAlleles.add(fatherAlleles.get(0)); + else + phasedAlleles.add(fatherAlleles.get(1)); + + //Add child's alleles + phasedAlleles.addAll(phasedChildGenotype); + return phasedAlleles; + } + } + + //If this is reached then no phasing could be found + motherAlleles.addAll(fatherAlleles); + motherAlleles.addAll(childAlleles); + for(AlleleType allele : motherAlleles){ + if(allele == AlleleType.REF){ + phasedAlleles.add(AlleleType.UNPHASED_REF); + } + else if(allele == AlleleType.VAR){ + phasedAlleles.add(AlleleType.UNPHASED_VAR); + } + else{ + phasedAlleles.add(AlleleType.NO_CALL); + } + } + return phasedAlleles; + } + + + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Take care of cases where one or more family members are no call + if(child == Genotype.Type.NO_CALL){ + trioAlleles.addAll(phaseSingleIndividualAlleles(mother)); + trioAlleles.addAll(phaseSingleIndividualAlleles(father)); + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.add(AlleleType.NO_CALL); + } + else if(mother == Genotype.Type.NO_CALL){ + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.add(AlleleType.NO_CALL); + if(father == Genotype.Type.NO_CALL){ + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.add(AlleleType.NO_CALL); + trioAlleles.addAll(phaseSingleIndividualAlleles(child)); + } + else + trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child)); + } + else if(father == Genotype.Type.NO_CALL){ + trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child)); + trioAlleles.add(2, AlleleType.NO_CALL); + trioAlleles.add(3, AlleleType.NO_CALL); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + trioAlleles.add(AlleleType.UNPHASED_REF); + trioAlleles.add(AlleleType.UNPHASED_VAR); + trioAlleles.add(AlleleType.UNPHASED_REF); + trioAlleles.add(AlleleType.UNPHASED_VAR); + trioAlleles.add(AlleleType.UNPHASED_REF); + trioAlleles.add(AlleleType.UNPHASED_VAR); + } + //All family members have genotypes and at least one of them is not Het + else{ + trioAlleles = phaseFamilyAlleles(mother, father, child); + } + } + + public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6))); + return phasedGenotypes; + } + + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){ + + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); + + boolean isPhased = true; + + List alleles = new ArrayList(2); + + //If unphased, return original genotype + for(AlleleType allele : phasedAlleles){ + if(allele == AlleleType.NO_CALL){ + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); + return genotype; + } + //Otherwise add the appropriate allele + else if(allele == AlleleType.UNPHASED_REF){ + isPhased = false; + alleles.add(refAllele); + } + else if(allele == AlleleType.UNPHASED_VAR){ + isPhased = false; + alleles.add(altAllele); + } + else if(allele == AlleleType.REF){ + alleles.add(refAllele); + } + else if(allele == AlleleType.VAR){ + alleles.add(altAllele); + } + } + //TODO: Recalculate GQ field for the new genotype + //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment + //double[] likelihoods = genotype.getLikelihoods().getAsVector(); + + //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]); + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); + return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased); + } + + } /** * Parse the familial relationship specification, and initialize VCF writer */ public void initialize() { - trios = getFamilySpecsFromCommandLineInput(familySpecs); - ArrayList rodNames = new ArrayList(); rodNames.add(variantCollection.variants.getName()); - Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + //Get the trios from the families passed as ped + setTrios(); + if(trios.size()<1) + throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted."); + + Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + + buildMatrices(); + + if(mvFile != null) + mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL"); + } - private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { - double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector()); - double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector()); - double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector()); + /** + * Select Trios only + */ + private void setTrios(){ - int momIndex = mom.getType().ordinal() - 1; - int dadIndex = dad.getType().ordinal() - 1; - int childIndex = child.getType().ordinal() - 1; - - return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex]; - } - - private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) { - List homRefAlleles = new ArrayList(); - homRefAlleles.add(refAllele); - homRefAlleles.add(refAllele); - Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List hetAlleles = new ArrayList(); - hetAlleles.add(refAllele); - hetAlleles.add(altAllele); - Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - List homVarAlleles = new ArrayList(); - homVarAlleles.add(altAllele); - homVarAlleles.add(altAllele); - Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false); - - ArrayList genotypes = new ArrayList(); - genotypes.add(homRef); - genotypes.add(het); - genotypes.add(homVar); - - return genotypes; - } - - private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) { - List alleles = g.getAlleles(); - int matchingAlleles = 0; - - for (Allele a : alleles) { - if (!alleleToMatch.equals(a)) { - matchingAlleles++; + Map> families = this.getSampleDB().getFamilies(); + Set family; + ArrayList parents; + for(String familyID : families.keySet()){ + family = families.get(familyID); + if(family.size()!=3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size())); } - } - - return matchingAlleles; - } - - private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) { - int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0; - int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0; - - int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0; - int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0; - - int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child); - int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child); - - return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles); - } - - private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) { - Set possiblePhasedChildGenotypes = new HashSet(); - - for (Allele momAllele : mom.getAlleles()) { - for (Allele dadAllele : dad.getAlleles()) { - ArrayList possiblePhasedChildAlleles = new ArrayList(); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(dadAllele); - - Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true); - - possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); - } - } - - ArrayList finalGenotypes = new ArrayList(); - - for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) { - if (child.sameGenotype(phasedChildGenotype, true)) { - Allele momTransmittedAllele = phasedChildGenotype.getAllele(0); - Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1); - - ArrayList phasedMomAlleles = new ArrayList(); - phasedMomAlleles.add(momTransmittedAllele); - phasedMomAlleles.add(momUntransmittedAllele); - - Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true); - - Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1); - Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1); - - ArrayList phasedDadAlleles = new ArrayList(); - phasedDadAlleles.add(dadTransmittedAllele); - phasedDadAlleles.add(dadUntransmittedAllele); - - Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true); - - finalGenotypes.add(phasedMomGenotype); - finalGenotypes.add(phasedDadGenotype); - finalGenotypes.add(phasedChildGenotype); - - return finalGenotypes; - } - } - - finalGenotypes.add(mom); - finalGenotypes.add(dad); - finalGenotypes.add(child); - - return finalGenotypes; - } - - private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { - ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); - - if (mother.isCalled() && father.isCalled() && child.isCalled()) { - ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); - ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); - - double bestConfigurationLikelihood = 0.0; - double bestPrior = 0.0; - Genotype bestMotherGenotype = mother; - Genotype bestFatherGenotype = father; - Genotype bestChildGenotype = child; - - double norm = 0.0; - - for (Genotype motherGenotype : possibleMotherGenotypes) { - for (Genotype fatherGenotype : possibleFatherGenotypes) { - for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); - norm += prior*configurationLikelihood; - - if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { - bestConfigurationLikelihood = configurationLikelihood; - bestPrior = prior; - bestMotherGenotype = motherGenotype; - bestFatherGenotype = fatherGenotype; - bestChildGenotype = childGenotype; - } - } + else{ + for(Sample familyMember : family){ + parents = familyMember.getParents(); + if(parents.size()>0){ + if(family.containsAll(parents)) + this.trios.add(familyMember); + else + logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID)); + break; + } } } - if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { - Map attributes = new HashMap(); - attributes.putAll(bestChildGenotype.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); - bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + } - finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); + + + } + + private void buildMatrices(){ + mvCountMatrix = new EnumMap>>(Genotype.Type.class); + transmissionMatrix = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mother : Genotype.Type.values()){ + mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class)); + for(Genotype.Type father : Genotype.Type.values()){ + mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class)); + for(Genotype.Type child : Genotype.Type.values()){ + mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); + transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); + } + } + } + } + + private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return 0; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return 0; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; } } - return finalGenotypes; + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return 0; + else return (parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return 0; + else return parents.size()-parentsNumAltAlleles; + } + + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return 0; + + //MV + return 1; + } + + private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + + double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; + + //Child is no call => No MV + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) + return nonMVPrior; + //Add parents with genotypes for the evaluation + ArrayList parents = new ArrayList(); + if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) + parents.add(mother); + if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) + parents.add(father); + + //Both parents no calls => No MV + if (parents.isEmpty()) + return nonMVPrior; + + //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed + int parentsNumRefAlleles = 0; + int parentsNumAltAlleles = 0; + + for(Genotype.Type parent : parents){ + if(parent == Genotype.Type.HOM_REF){ + parentsNumRefAlleles++; + } + else if(parent == Genotype.Type.HET){ + parentsNumRefAlleles++; + parentsNumAltAlleles++; + } + else if(parent == Genotype.Type.HOM_VAR){ + parentsNumAltAlleles++; + } + } + + //Case Child is HomRef + if(child == Genotype.Type.HOM_REF){ + if(parentsNumRefAlleles == parents.size()) + return nonMVPrior; + else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles); + } + + //Case child is HomVar + if(child == Genotype.Type.HOM_VAR){ + if(parentsNumAltAlleles == parents.size()) + return nonMVPrior; + else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles); + } + + //Case child is Het + if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) + return nonMVPrior; + + //MV + return MENDELIAN_VIOLATION_PRIOR; + } + + private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ + int count = 0; + if(motherOriginal!=motherNew) + count++; + if(fatherOriginal!=fatherNew) + count++; + if(childOriginal!=childNew) + count++; + return count; + } + + + private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + + //For now, only consider trios with complete information + //TODO: Phasing of trios with missing information + if(!(mother.isCalled() && father.isCalled() && child.isCalled())) { + finalGenotypes.add(mother); + finalGenotypes.add(father); + finalGenotypes.add(child); + return false; + } + + //Get the PL + Map motherLikelihoods = mother.getLikelihoods().getAsMap(true); + Map fatherLikelihoods = father.getLikelihoods().getAsMap(true); + Map childLikelihoods = child.getLikelihoods().getAsMap(true); + + //Prior vars + double bestConfigurationLikelihood = 0.0; + double norm = 0.0; + boolean isMV = false; + int bestConfigurationGenotypeDiffs=4; + Genotype.Type bestMotherGenotype = mother.getType(); + Genotype.Type bestFatherGenotype = father.getType(); + Genotype.Type bestChildGenotype = child.getType(); + + //Get the most likely combination + int mvCount; + double configurationLikelihood; + int configurationGenotypeDiffs; + for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ + for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); + configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if ((configurationLikelihood > bestConfigurationLikelihood) || + (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { + bestConfigurationLikelihood = configurationLikelihood; + bestMotherGenotype = motherGenotype.getKey(); + bestFatherGenotype = fatherGenotype.getKey(); + bestChildGenotype = childGenotype.getKey(); + isMV = mvCount>0; + bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + } + } + } + } + + //Get the phased alleles for the genotype configuration + TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype); + + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes); + return isMV; + } /** @@ -292,18 +576,34 @@ public class PhaseByTransmission extends RodWalker { * @return null */ @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + //Local cars to avoid lookups on increment + int numTrioGenotypesCalled = 0; + int numTrioGenotypesNoCall = 0; + int numTrioGenotypesPhased = 0; + int numHet = 0 ; + int numHetHetHet = 0; + int numMVs = 0; + if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); Map genotypeMap = vc.getGenotypes(); - for (Trio trio : trios) { - Genotype mother = vc.getGenotype(trio.getMother()); - Genotype father = vc.getGenotype(trio.getFather()); - Genotype child = vc.getGenotype(trio.getChild()); + boolean isMV; - ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + for (Sample sample : trios) { + Genotype mother = vc.getGenotype(sample.getMaternalID()); + Genotype father = vc.getGenotype(sample.getPaternalID()); + Genotype child = vc.getGenotype(sample.getID()); + + //Skip trios where any of the genotype is missing in the variant context + if(mother == null || father == null | child == null) + continue; + + ArrayList trioGenotypes = new ArrayList(3); + isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); @@ -312,14 +612,47 @@ public class PhaseByTransmission extends RodWalker { genotypeMap.put(phasedMother.getSampleName(), phasedMother); genotypeMap.put(phasedFather.getSampleName(), phasedFather); genotypeMap.put(phasedChild.getSampleName(), phasedChild); + + //Increment metrics counters + if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){ + numTrioGenotypesCalled++; + if(phasedMother.isPhased()) + numTrioGenotypesPhased++; + + if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){ + numHet++; + if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){ + numHetHetHet++; + }else if(!phasedMother.isPhased()){ + int x =9; + } + } + + if(isMV){ + numMVs++; + if(mvFile != null) + mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); + } + }else{ + numTrioGenotypesNoCall++; + } + } + VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); vcfWriter.add(newvc); } - return null; + HashMap metricsCounters = new HashMap(5); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased); + metricsCounters.put(NUM_HET,numHet); + metricsCounters.put(NUM_HET_HET_HET,numHetHetHet); + metricsCounters.put(NUM_VIOLATIONS,numMVs); + return metricsCounters; } /** @@ -328,8 +661,15 @@ public class PhaseByTransmission extends RodWalker { * @return Initial value of reduce. */ @Override - public Integer reduceInit() { - return null; + public HashMap reduceInit() { + HashMap metricsCounters = new HashMap(5); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_HET,0); + metricsCounters.put(NUM_HET_HET_HET,0); + metricsCounters.put(NUM_VIOLATIONS,0); + return metricsCounters; } /** @@ -340,7 +680,24 @@ public class PhaseByTransmission extends RodWalker { * @return accumulator with result of the map taken into account. */ @Override - public Integer reduce(Integer value, Integer sum) { - return null; + public HashMap reduce(HashMap value, HashMap sum) { + sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); + sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); + sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); + sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET)); + sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET)); + sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS)); + return sum; + } + + + @Override + public void onTraversalDone(HashMap result) { + logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); + logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); + logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); + logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET)); + logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS)); } } From edea90786a4555ff665b07a3dc2f97ce7cbae24c Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 20 Oct 2011 17:04:19 +0200 Subject: [PATCH 04/27] Genotype quality is now recalculated for each of the phased Genotypes. Small problem is that we unnecessarily loose a little precision on the genotypes that do not change after assignment. --- .../walkers/phasing/PhaseByTransmission.java | 9 ++---- .../variantcontext/GenotypeLikelihoods.java | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 12b541526..52629277f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -266,6 +266,7 @@ public class PhaseByTransmission extends RodWalker, HashMa Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); boolean isPhased = true; @@ -274,7 +275,6 @@ public class PhaseByTransmission extends RodWalker, HashMa //If unphased, return original genotype for(AlleleType allele : phasedAlleles){ if(allele == AlleleType.NO_CALL){ - genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); return genotype; } //Otherwise add the appropriate allele @@ -293,13 +293,8 @@ public class PhaseByTransmission extends RodWalker, HashMa alleles.add(altAllele); } } - //TODO: Recalculate GQ field for the new genotype - //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment - //double[] likelihoods = genotype.getLikelihoods().getAsVector(); - //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]); - genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); - return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased); + return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 292bc17cd..9ed9b41cc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,8 +25,10 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.jgrapht.util.MathUtil; import java.util.EnumMap; import java.util.Map; @@ -98,6 +100,7 @@ public class GenotypeLikelihoods { return likelihoodsAsString_PLs; } + //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values public EnumMap getAsMap(boolean normalizeFromLog10){ //Make sure that the log10likelihoods are set double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); @@ -108,6 +111,32 @@ public class GenotypeLikelihoods { return likelihoodsMap; } + //Return the log10 Genotype Quality (GQ) for the given genotype + public double getLog10GQ(Genotype.Type genotype){ + + double qual = Double.NEGATIVE_INFINITY; + EnumMap likelihoods = getAsMap(false); + for(Map.Entry likelihood : likelihoods.entrySet()){ + if(likelihood.getKey() == genotype) + continue; + if(likelihood.getValue() > qual) + qual = likelihood.getValue(); + + } + + qual = likelihoods.get(genotype) - qual; + + if (qual < 0) { + // QUAL can be negative if the chosen genotype is not the most likely one individually. + // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); + } + + return qual; + } + private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) { String[] strings = likelihoodsAsString_PLs.split(","); From 01b16abc8dcc233e29258eaa4e1d1c0415180e91 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 10:24:41 +0200 Subject: [PATCH 05/27] Genotype quality calculation modified to handle all genotypes the same way. This is inconsistent with GQ output by the UG but is correct even for cases of poor quality genotypes. --- .../variantcontext/GenotypeLikelihoods.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 9ed9b41cc..b83ffa2fd 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -111,8 +111,8 @@ public class GenotypeLikelihoods { return likelihoodsMap; } - //Return the log10 Genotype Quality (GQ) for the given genotype - public double getLog10GQ(Genotype.Type genotype){ + //Return the neg log10 Genotype Quality (GQ) for the given genotype + public double getNegLog10GQ(Genotype.Type genotype){ double qual = Double.NEGATIVE_INFINITY; EnumMap likelihoods = getAsMap(false); @@ -124,15 +124,9 @@ public class GenotypeLikelihoods { } - qual = likelihoods.get(genotype) - qual; - - if (qual < 0) { - // QUAL can be negative if the chosen genotype is not the most likely one individually. - // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); - } + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); return qual; } From 7312e35c71167dee3510bf428204b3a4b75ab52f Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 10:25:53 +0200 Subject: [PATCH 06/27] Now makes use of standard Allele and Genotype classes. This allowed quite some code cleaning. --- .../walkers/phasing/PhaseByTransmission.java | 341 +++++++----------- 1 file changed, 135 insertions(+), 206 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 52629277f..62b7bfffa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -56,7 +56,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Matrix of allele transmission private EnumMap>> transmissionMatrix; - //Metrics counters hashkeys + //Metrics counters hash keys private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; @@ -64,237 +64,223 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_HET_HET_HET = 4; private final Byte NUM_VIOLATIONS = 5; - private enum AlleleType { - NO_CALL, - REF, - VAR, - UNPHASED_REF, - UNPHASED_VAR + private enum FamilyMember { + MOTHER, + FATHER, + CHILD } //Stores a trio-genotype private class TrioPhase { - private ArrayList trioAlleles = new ArrayList(6); + //Create 2 fake alleles + //The actual bases will never be used but the Genotypes created using the alleles will be. + private final Allele REF = Allele.create("A",true); + private final Allele VAR = Allele.create("A",false); + private final Allele NO_CALL = Allele.create(".",false); + private final String DUMMY_NAME = "DummySample"; - private ArrayList getAlleles(Genotype.Type genotype){ - ArrayList alleles = new ArrayList(2); + private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); + + private ArrayList getAlleles(Genotype.Type genotype){ + ArrayList alleles = new ArrayList(2); if(genotype == Genotype.Type.HOM_REF){ - alleles.add(AlleleType.REF); - alleles.add(AlleleType.REF); + alleles.add(REF); + alleles.add(REF); } else if(genotype == Genotype.Type.HET){ - alleles.add(AlleleType.REF); - alleles.add(AlleleType.VAR); + alleles.add(REF); + alleles.add(VAR); } else if(genotype == Genotype.Type.HOM_VAR){ - alleles.add(AlleleType.VAR); - alleles.add(AlleleType.VAR); + alleles.add(VAR); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.NO_CALL){ + alleles.add(NO_CALL); + alleles.add(NO_CALL); } else{ - alleles.add(AlleleType.NO_CALL); - alleles.add(AlleleType.NO_CALL); + return null; } return alleles; } - private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){ - if(genotype == Genotype.Type.HET){ - ArrayList phasedAlleles = new ArrayList(2); - phasedAlleles.add(AlleleType.UNPHASED_REF); - phasedAlleles.add(AlleleType.UNPHASED_VAR); - return phasedAlleles; + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); } else - return getAlleles(genotype); + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){ - ArrayList phasedAlleles = new ArrayList(4); + private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ + //Special case for Het/Het as it is ambiguous - if(parent == Genotype.Type.HET && child == Genotype.Type.HET){ - phasedAlleles.add(AlleleType.UNPHASED_REF); - phasedAlleles.add(AlleleType.UNPHASED_VAR); - phasedAlleles.add(AlleleType.UNPHASED_REF); - phasedAlleles.add(AlleleType.UNPHASED_VAR); + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - ArrayList parentAlleles = getAlleles(parent); - ArrayList childAlleles = getAlleles(child); + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + //If there is a possible phasing between the mother and child => phase int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); if(childTransmittedAlleleIndex > -1){ - phasedAlleles.add(parentAlleles.get(0)); - phasedAlleles.add(parentAlleles.get(1)); - phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - phasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); } else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - phasedAlleles.add(parentAlleles.get(1)); - phasedAlleles.add(parentAlleles.get(0)); - phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - phasedAlleles.add(childAlleles.get(0)); + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); } + //This is a Mendelian Violation => Do not phase else{ - parentAlleles.addAll(childAlleles); - for(AlleleType allele : parentAlleles){ - if(allele == AlleleType.REF){ - phasedAlleles.add(AlleleType.UNPHASED_REF); - } - else if(allele == AlleleType.VAR){ - phasedAlleles.add(AlleleType.UNPHASED_VAR); - } - else{ - phasedAlleles.add(AlleleType.NO_CALL); - } - } + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - - return phasedAlleles; } - private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - ArrayList phasedAlleles = new ArrayList(6); + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - Set> possiblePhasedChildGenotypes = new HashSet>(); - ArrayList motherAlleles = getAlleles(mother); - ArrayList fatherAlleles = getAlleles(father); - ArrayList childAlleles = getAlleles(child); + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); //Build all possible child genotypes for the given parent's genotypes - for (AlleleType momAllele : motherAlleles) { - for (AlleleType fatherAllele : fatherAlleles) { - ArrayList possiblePhasedChildAlleles = new ArrayList(2); + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); possiblePhasedChildAlleles.add(momAllele); possiblePhasedChildAlleles.add(fatherAllele); possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); } } - for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) { - int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0)); - int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1)); + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { - //Add mother's alleles - phasedAlleles.add(phasedChildGenotype.get(0)); - if(motherAlleles.get(0) != phasedAlleles.get(0)) - phasedAlleles.add(motherAlleles.get(0)); + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); else - phasedAlleles.add(motherAlleles.get(1)); + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); - //Add father's alleles - phasedAlleles.add(phasedChildGenotype.get(1)); - if(fatherAlleles.get(0) != phasedAlleles.get(2)) - phasedAlleles.add(fatherAlleles.get(0)); + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); else - phasedAlleles.add(fatherAlleles.get(1)); + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); - //Add child's alleles - phasedAlleles.addAll(phasedChildGenotype); - return phasedAlleles; + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; } } //If this is reached then no phasing could be found - motherAlleles.addAll(fatherAlleles); - motherAlleles.addAll(childAlleles); - for(AlleleType allele : motherAlleles){ - if(allele == AlleleType.REF){ - phasedAlleles.add(AlleleType.UNPHASED_REF); - } - else if(allele == AlleleType.VAR){ - phasedAlleles.add(AlleleType.UNPHASED_VAR); - } - else{ - phasedAlleles.add(AlleleType.NO_CALL); - } - } - return phasedAlleles; + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Take care of cases where one or more family members are no call - if(child == Genotype.Type.NO_CALL){ - trioAlleles.addAll(phaseSingleIndividualAlleles(mother)); - trioAlleles.addAll(phaseSingleIndividualAlleles(father)); - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.add(AlleleType.NO_CALL); + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } - else if(mother == Genotype.Type.NO_CALL){ - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.add(AlleleType.NO_CALL); - if(father == Genotype.Type.NO_CALL){ - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.add(AlleleType.NO_CALL); - trioAlleles.addAll(phaseSingleIndividualAlleles(child)); + else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } else - trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child)); + phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER); } - else if(father == Genotype.Type.NO_CALL){ - trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child)); - trioAlleles.add(2, AlleleType.NO_CALL); - trioAlleles.add(3, AlleleType.NO_CALL); + else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); } //Special case for Het/Het/Het as it is ambiguous else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ - trioAlleles.add(AlleleType.UNPHASED_REF); - trioAlleles.add(AlleleType.UNPHASED_VAR); - trioAlleles.add(AlleleType.UNPHASED_REF); - trioAlleles.add(AlleleType.UNPHASED_VAR); - trioAlleles.add(AlleleType.UNPHASED_REF); - trioAlleles.add(AlleleType.UNPHASED_VAR); + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } //All family members have genotypes and at least one of them is not Het else{ - trioAlleles = phaseFamilyAlleles(mother, father, child); + phaseFamilyAlleles(mother, father, child); } } public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){ - phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); return phasedGenotypes; } - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){ + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){ //Add the transmission probability Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); - genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); - boolean isPhased = true; + //Handle missing genotype + if(!phasedGenotype.isAvailable()) + return new Genotype(genotype.getSampleName(), null); + //Handle NoCall genotype + else if(phasedGenotype.isNoCall()) + return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles()); - List alleles = new ArrayList(2); + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); - //If unphased, return original genotype - for(AlleleType allele : phasedAlleles){ - if(allele == AlleleType.NO_CALL){ - return genotype; - } - //Otherwise add the appropriate allele - else if(allele == AlleleType.UNPHASED_REF){ - isPhased = false; - alleles.add(refAllele); - } - else if(allele == AlleleType.UNPHASED_VAR){ - isPhased = false; - alleles.add(altAllele); - } - else if(allele == AlleleType.REF){ - alleles.add(refAllele); - } - else if(allele == AlleleType.VAR){ - alleles.add(altAllele); - } - } + } - return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased); + //Compute the new Log10Error if the genotype is different from the original genotype + double negLog10Error; + if(genotype.getType() == phasedGenotype.getType()) + negLog10Error = genotype.getNegLog10PError(); + else + negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); + + return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); } @@ -432,63 +418,6 @@ public class PhaseByTransmission extends RodWalker, HashMa return 1; } - private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - - double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - - //Child is no call => No MV - if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) - return nonMVPrior; - //Add parents with genotypes for the evaluation - ArrayList parents = new ArrayList(); - if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) - parents.add(mother); - if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) - parents.add(father); - - //Both parents no calls => No MV - if (parents.isEmpty()) - return nonMVPrior; - - //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed - int parentsNumRefAlleles = 0; - int parentsNumAltAlleles = 0; - - for(Genotype.Type parent : parents){ - if(parent == Genotype.Type.HOM_REF){ - parentsNumRefAlleles++; - } - else if(parent == Genotype.Type.HET){ - parentsNumRefAlleles++; - parentsNumAltAlleles++; - } - else if(parent == Genotype.Type.HOM_VAR){ - parentsNumAltAlleles++; - } - } - - //Case Child is HomRef - if(child == Genotype.Type.HOM_REF){ - if(parentsNumRefAlleles == parents.size()) - return nonMVPrior; - else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles); - } - - //Case child is HomVar - if(child == Genotype.Type.HOM_VAR){ - if(parentsNumAltAlleles == parents.size()) - return nonMVPrior; - else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles); - } - - //Case child is Het - if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) - return nonMVPrior; - - //MV - return MENDELIAN_VIOLATION_PRIOR; - } - private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ int count = 0; if(motherOriginal!=motherNew) @@ -534,7 +463,7 @@ public class PhaseByTransmission extends RodWalker, HashMa for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ for(Map.Entry childGenotype : childLikelihoods.entrySet()){ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); - configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); norm += configurationLikelihood; configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); //Keep this combination if From 38ebf3141a13ebafcc43b69c3364e0066ba17ea9 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 12:30:04 +0200 Subject: [PATCH 07/27] - Now supports parent/child pairs - Sites with missing genotypes in pairs/trios are handled as follows: -- Missing child -> Homozygous parents are phased, no transmission probability is emitted -- Two individuals missing -> Phase if homozygous, no transmission probability is emitted -- One parent missing -> Phased / transmission probability emitted - Mutation prior set as argument --- .../walkers/phasing/PhaseByTransmission.java | 134 ++++++++++-------- 1 file changed, 77 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 62b7bfffa..d128ff40e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -40,13 +40,16 @@ public class PhaseByTransmission extends RodWalker, HashMa @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.") private PrintStream mvFile = null; + @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8") + private double deNovoPrior=1e-8; + @Output protected VCFWriter vcfWriter = null; private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; - private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + public final double NO_TRANSMISSION_PROB = -1.0; private ArrayList trios = new ArrayList(); @@ -240,26 +243,26 @@ public class PhaseByTransmission extends RodWalker, HashMa } } - public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){ + public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); return phasedGenotypes; } - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){ + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + return genotype; //Add the transmission probability Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); - - //Handle missing genotype - if(!phasedGenotype.isAvailable()) - return new Genotype(genotype.getSampleName(), null); - //Handle NoCall genotype - else if(phasedGenotype.isNoCall()) - return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); ArrayList phasedAlleles = new ArrayList(2); for(Allele allele : phasedGenotype.getAlleles()){ @@ -324,8 +327,8 @@ public class PhaseByTransmission extends RodWalker, HashMa ArrayList parents; for(String familyID : families.keySet()){ family = families.get(familyID); - if(family.size()!=3){ - logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size())); + if(family.size()<2 || family.size()>3){ + logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size())); } else{ for(Sample familyMember : family){ @@ -334,7 +337,7 @@ public class PhaseByTransmission extends RodWalker, HashMa if(family.containsAll(parents)) this.trios.add(familyMember); else - logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID)); + logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID)); break; } } @@ -429,64 +432,84 @@ public class PhaseByTransmission extends RodWalker, HashMa return count; } + private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ + if(genotype == null || !genotype.isAvailable()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.UNAVAILABLE,1.0); + return likelihoods; + } + else if(genotype.isNoCall()){ + EnumMap likelihoods = new EnumMap(Genotype.Type.class); + likelihoods.put(Genotype.Type.NO_CALL,1.0); + return likelihoods; + } + return genotype.getLikelihoods().getAsMap(true); + } + + private Genotype.Type getTypeSafeNull(Genotype genotype){ + if(genotype == null) + return Genotype.Type.UNAVAILABLE; + return genotype.getType(); + } + private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - //For now, only consider trios with complete information - //TODO: Phasing of trios with missing information - if(!(mother.isCalled() && father.isCalled() && child.isCalled())) { - finalGenotypes.add(mother); - finalGenotypes.add(father); - finalGenotypes.add(child); - return false; - } - //Get the PL - Map motherLikelihoods = mother.getLikelihoods().getAsMap(true); - Map fatherLikelihoods = father.getLikelihoods().getAsMap(true); - Map childLikelihoods = child.getLikelihoods().getAsMap(true); + Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother); + Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father); + Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); //Prior vars double bestConfigurationLikelihood = 0.0; double norm = 0.0; boolean isMV = false; int bestConfigurationGenotypeDiffs=4; - Genotype.Type bestMotherGenotype = mother.getType(); - Genotype.Type bestFatherGenotype = father.getType(); - Genotype.Type bestChildGenotype = child.getType(); + Genotype.Type bestMotherGenotype = getTypeSafeNull(mother); + Genotype.Type bestFatherGenotype = getTypeSafeNull(father); + Genotype.Type bestChildGenotype = getTypeSafeNull(child); //Get the most likely combination - int mvCount; - double configurationLikelihood; - int configurationGenotypeDiffs; - for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ - for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ - for(Map.Entry childGenotype : childLikelihoods.entrySet()){ - mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); - configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); - norm += configurationLikelihood; - configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); - //Keep this combination if - //It has a better likelihood - //Or it has the same likelihood but requires less changes from original genotypes - if ((configurationLikelihood > bestConfigurationLikelihood) || - (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { - bestConfigurationLikelihood = configurationLikelihood; - bestMotherGenotype = motherGenotype.getKey(); - bestFatherGenotype = fatherGenotype.getKey(); - bestChildGenotype = childGenotype.getKey(); - isMV = mvCount>0; - bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + //Only check for most likely combination if at least a parent and the child have genotypes + if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){ + int mvCount; + double configurationLikelihood; + int configurationGenotypeDiffs; + for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ + for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if ((configurationLikelihood > bestConfigurationLikelihood) || + (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { + bestConfigurationLikelihood = configurationLikelihood; + bestMotherGenotype = motherGenotype.getKey(); + bestFatherGenotype = fatherGenotype.getKey(); + bestChildGenotype = childGenotype.getKey(); + isMV = mvCount>0; + bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + } } } } + + //normalize the best configuration probability + bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + } + else{ + bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } //Get the phased alleles for the genotype configuration TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype); //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes); + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); return isMV; } @@ -522,8 +545,8 @@ public class PhaseByTransmission extends RodWalker, HashMa Genotype father = vc.getGenotype(sample.getPaternalID()); Genotype child = vc.getGenotype(sample.getID()); - //Skip trios where any of the genotype is missing in the variant context - if(mother == null || father == null | child == null) + //Keep only trios and parent/child pairs + if(mother == null && father == null || child == null) continue; ArrayList trioGenotypes = new ArrayList(3); @@ -545,11 +568,8 @@ public class PhaseByTransmission extends RodWalker, HashMa if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){ numHet++; - if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){ + if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()) numHetHetHet++; - }else if(!phasedMother.isPhased()){ - int x =9; - } } if(isMV){ From 62477a08104c3a158ccbb547ab5f347625676809 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 24 Oct 2011 13:45:21 +0200 Subject: [PATCH 08/27] Added documentation and comments --- .../walkers/phasing/PhaseByTransmission.java | 127 ++++++++++++++---- 1 file changed, 103 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index d128ff40e..c4c6b69b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -22,15 +22,57 @@ import java.io.PrintStream; import java.util.*; /** - * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio, - * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype - * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents - * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the - * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to - * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally - * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or - * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to - * begin. + * Computes the most likely genotype combination and phases trios and parent/child pairs + * + *

+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases + * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. + * Ambiguous sites are: + *

    + *
  • Sites where all individuals are heterozygous
  • + *
  • Sites where there is a Mendelian violation
  • + *
+ * Missing genotypes are handled as follows: + *
    + *
  • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
  • + *
  • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
  • + *
  • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
  • + *
  • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
  • + *
+ * + *

Input

+ *

+ *

    + *
  • A VCF variant set containing trio(s) and/or parent/child pair(s).
  • + *
  • A PED pedigree file containing the description of the individuals relationships.
  • + *
+ *

+ * + *

Options

+ *

+ *

    + *
  • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype + * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, + * and each individual genotype, depth, allelic depth and likelihoods.
  • + *
  • DeNovoPrior: Mutation prio; default is 1e-8
  • + *
+ *

+ * + *

Output

+ *

+ * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T PhaseByTransmission \
+ *   -V input.vcf \
+ *   -ped input.ped \
+ *   -o output.vcf
+ * 
+ * */ public class PhaseByTransmission extends RodWalker, HashMap> { @@ -73,7 +115,8 @@ public class PhaseByTransmission extends RodWalker, HashMa CHILD } - //Stores a trio-genotype + //Stores a conceptual trio or parent/child pair genotype combination along with its phasing. + //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method. private class TrioPhase { //Create 2 fake alleles @@ -119,7 +162,8 @@ public class PhaseByTransmission extends RodWalker, HashMa trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ //Special case for Het/Het as it is ambiguous if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ @@ -155,6 +199,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } } + //Phases a family by transmission private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ Set> possiblePhasedChildGenotypes = new HashSet>(); @@ -209,7 +254,10 @@ public class PhaseByTransmission extends RodWalker, HashMa trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Take care of cases where one or more family members are no call @@ -225,10 +273,10 @@ public class PhaseByTransmission extends RodWalker, HashMa phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } else - phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER); + phasePairAlleles(father, child, FamilyMember.FATHER); } else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ - phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER); + phasePairAlleles(mother, child, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); } //Special case for Het/Het/Het as it is ambiguous @@ -243,11 +291,20 @@ public class PhaseByTransmission extends RodWalker, HashMa } } - public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); - return phasedGenotypes; } private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ @@ -290,7 +347,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Parse the familial relationship specification, and initialize VCF writer + * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer */ public void initialize() { ArrayList rodNames = new ArrayList(); @@ -306,7 +363,7 @@ public class PhaseByTransmission extends RodWalker, HashMa Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct")); headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); @@ -318,7 +375,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Select Trios only + * Select trios and parent/child pairs only */ private void setTrios(){ @@ -349,6 +406,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } + //Create the transmission matrices private void buildMatrices(){ mvCountMatrix = new EnumMap>>(Genotype.Type.class); transmissionMatrix = new EnumMap>>(Genotype.Type.class); @@ -366,6 +424,9 @@ public class PhaseByTransmission extends RodWalker, HashMa } } + //Returns the number of Mendelian Violations for a given genotype combination. + //If one of the parents genotype is missing, it will consider it as a parent/child pair + //If the child genotype or both parents genotypes are missing, 0 is returned. private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Child is no call => No MV @@ -421,6 +482,7 @@ public class PhaseByTransmission extends RodWalker, HashMa return 1; } + //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ int count = 0; if(motherOriginal!=motherNew) @@ -432,6 +494,8 @@ public class PhaseByTransmission extends RodWalker, HashMa return count; } + //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single + //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0 private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ if(genotype == null || !genotype.isAvailable()){ EnumMap likelihoods = new EnumMap(Genotype.Type.class); @@ -446,6 +510,7 @@ public class PhaseByTransmission extends RodWalker, HashMa return genotype.getLikelihoods().getAsMap(true); } + //Returns the Genotype.Type; returns UNVAILABLE if given null private Genotype.Type getTypeSafeNull(Genotype genotype){ if(genotype == null) return Genotype.Type.UNAVAILABLE; @@ -453,6 +518,16 @@ public class PhaseByTransmission extends RodWalker, HashMa } + /** + * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair. + * @param ref: Reference allele + * @param alt: Alternative allele + * @param mother: Mother's genotype + * @param father: Father's genotype + * @param child: Child's genotype + * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child + * @return + */ private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { //Get the PL @@ -600,9 +675,9 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Provide an initial value for reduce computations. + * Initializes the reporting counters. * - * @return Initial value of reduce. + * @return All counters initialized to 0 */ @Override public HashMap reduceInit() { @@ -617,10 +692,10 @@ public class PhaseByTransmission extends RodWalker, HashMa } /** - * Reduces a single map with the accumulator provided as the ReduceType. + * Adds the value of the site phased to the reporting counters. * - * @param value result of the map. - * @param sum accumulator for the reduce. + * @param value Site values + * @param sum accumulator for the reporting counters * @return accumulator with result of the map taken into account. */ @Override @@ -635,6 +710,10 @@ public class PhaseByTransmission extends RodWalker, HashMa } + /** + * Reports statistics on the phasing by transmission process. + * @param result Accumulator with all counters. + */ @Override public void onTraversalDone(HashMap result) { logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); From 62cff266d4e20aa2a48f88c47f282d9a6bc25287 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 26 Oct 2011 14:40:04 +0200 Subject: [PATCH 09/27] GQ calculation corrected for most likely genotype --- .../utils/variantcontext/GenotypeLikelihoods.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index b83ffa2fd..47c93bb1b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -124,10 +124,15 @@ public class GenotypeLikelihoods { } - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); + //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) + qual = likelihoods.get(genotype) - qual; + //Quality of other genotypes 1-P(G) + if (qual < 0) { + double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); + double chosenGenotype = normalized[genotype.ordinal()-1]; + qual = -1.0 * Math.log10(1.0 - chosenGenotype); + } return qual; } From 81b163ff4d6a8203cb4cb155a92a751ee1b2349d Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 26 Oct 2011 14:49:12 +0200 Subject: [PATCH 10/27] Indentation --- .../walkers/phasing/PhaseByTransmission.java | 370 +++++++++--------- 1 file changed, 185 insertions(+), 185 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index c4c6b69b7..956cf7c89 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -129,220 +129,220 @@ public class PhaseByTransmission extends RodWalker, HashMa private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class); private ArrayList getAlleles(Genotype.Type genotype){ - ArrayList alleles = new ArrayList(2); - if(genotype == Genotype.Type.HOM_REF){ - alleles.add(REF); - alleles.add(REF); - } - else if(genotype == Genotype.Type.HET){ - alleles.add(REF); - alleles.add(VAR); - } - else if(genotype == Genotype.Type.HOM_VAR){ - alleles.add(VAR); - alleles.add(VAR); - } - else if(genotype == Genotype.Type.NO_CALL){ - alleles.add(NO_CALL); - alleles.add(NO_CALL); - } - else{ - return null; - } - return alleles; - } - - //Create a new Genotype based on information from a single individual - //Homozygous genotypes will be set as phased, heterozygous won't be - private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ - if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ - trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - } - else - trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - } - - //Find the phase for a parent/child pair - private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ - - //Special case for Het/Het as it is ambiguous - if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + ArrayList alleles = new ArrayList(2); + if(genotype == Genotype.Type.HOM_REF){ + alleles.add(REF); + alleles.add(REF); + } + else if(genotype == Genotype.Type.HET){ + alleles.add(REF); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.HOM_VAR){ + alleles.add(VAR); + alleles.add(VAR); + } + else if(genotype == Genotype.Type.NO_CALL){ + alleles.add(NO_CALL); + alleles.add(NO_CALL); + } + else{ + return null; + } + return alleles; } - ArrayList parentAlleles = getAlleles(parentGenotype); - ArrayList childAlleles = getAlleles(childGenotype); - ArrayList parentPhasedAlleles = new ArrayList(2); - ArrayList childPhasedAlleles = new ArrayList(2); - - //If there is a possible phasing between the mother and child => phase - int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); - if(childTransmittedAlleleIndex > -1){ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - childPhasedAlleles.add(childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + //Create a new Genotype based on information from a single individual + //Homozygous genotypes will be set as phased, heterozygous won't be + private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ + if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + else + trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ - parentPhasedAlleles.add(parentAlleles.get(1)); - parentPhasedAlleles.add(parentAlleles.get(0)); - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); - childPhasedAlleles.add(childAlleles.get(0)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); - } - //This is a Mendelian Violation => Do not phase - else{ - trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - } - } - //Phases a family by transmission - private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ + //Find the phase for a parent/child pair + private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ - Set> possiblePhasedChildGenotypes = new HashSet>(); - ArrayList motherAlleles = getAlleles(mother); - ArrayList fatherAlleles = getAlleles(father); - ArrayList childAlleles = getAlleles(child); + //Special case for Het/Het as it is ambiguous + if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + } - //Build all possible child genotypes for the given parent's genotypes - for (Allele momAllele : motherAlleles) { - for (Allele fatherAllele : fatherAlleles) { - ArrayList possiblePhasedChildAlleles = new ArrayList(2); - possiblePhasedChildAlleles.add(momAllele); - possiblePhasedChildAlleles.add(fatherAllele); - possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + ArrayList parentAlleles = getAlleles(parentGenotype); + ArrayList childAlleles = getAlleles(childGenotype); + ArrayList parentPhasedAlleles = new ArrayList(2); + ArrayList childPhasedAlleles = new ArrayList(2); + + //If there is a possible phasing between the mother and child => phase + int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); + if(childTransmittedAlleleIndex > -1){ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ + parentPhasedAlleles.add(parentAlleles.get(1)); + parentPhasedAlleles.add(parentAlleles.get(0)); + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); + childPhasedAlleles.add(childAlleles.get(0)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true)); + } + //This is a Mendelian Violation => Do not phase + else{ + trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } } - for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { - int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); - int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); - //If a possible combination has been found, create the genotypes - if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { - //Create mother's genotype - ArrayList motherPhasedAlleles = new ArrayList(2); - motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); - if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) - motherPhasedAlleles.add(motherAlleles.get(0)); - else - motherPhasedAlleles.add(motherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + //Phases a family by transmission + private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - //Create father's genotype - ArrayList fatherPhasedAlleles = new ArrayList(2); - fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); - if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) - fatherPhasedAlleles.add(fatherAlleles.get(0)); - else - fatherPhasedAlleles.add(fatherAlleles.get(1)); - trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + Set> possiblePhasedChildGenotypes = new HashSet>(); + ArrayList motherAlleles = getAlleles(mother); + ArrayList fatherAlleles = getAlleles(father); + ArrayList childAlleles = getAlleles(child); - //Create child's genotype - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); - - //Once a phased combination is found; exit - return; + //Build all possible child genotypes for the given parent's genotypes + for (Allele momAllele : motherAlleles) { + for (Allele fatherAllele : fatherAlleles) { + ArrayList possiblePhasedChildAlleles = new ArrayList(2); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(fatherAllele); + possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles); + } } + + for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) { + int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0)); + int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1)); + //If a possible combination has been found, create the genotypes + if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) { + //Create mother's genotype + ArrayList motherPhasedAlleles = new ArrayList(2); + motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0)); + if(motherAlleles.get(0) != motherPhasedAlleles.get(0)) + motherPhasedAlleles.add(motherAlleles.get(0)); + else + motherPhasedAlleles.add(motherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Create father's genotype + ArrayList fatherPhasedAlleles = new ArrayList(2); + fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1)); + if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0)) + fatherPhasedAlleles.add(fatherAlleles.get(0)); + else + fatherPhasedAlleles.add(fatherAlleles.get(1)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Create child's genotype + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true)); + + //Once a phased combination is found; exit + return; + } + } + + //If this is reached then no phasing could be found + trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); } - //If this is reached then no phasing could be found - trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); - } + /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. + If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair + or single individual. + */ + public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - /* Constructor: Creates a conceptual trio genotype combination from the given genotypes. - If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair - or single individual. - */ - public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ - - //Take care of cases where one or more family members are no call - if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + //Take care of cases where one or more family members are no call + if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } - else - phasePairAlleles(father, child, FamilyMember.FATHER); + else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + else + phasePairAlleles(father, child, FamilyMember.FATHER); + } + else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + phasePairAlleles(mother, child, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + } + //Special case for Het/Het/Het as it is ambiguous + else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ + phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); + phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + phaseSingleIndividualAlleles(child, FamilyMember.CHILD); + } + //All family members have genotypes and at least one of them is not Het + else{ + phaseFamilyAlleles(mother, father, child); + } } - else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ - phasePairAlleles(mother, child, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); + + /** + * Applies the trio genotype combination to the given trio. + * @param ref: Reference allele + * @param alt: Alternate allele + * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination + * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination + * @param childGenotype: Genotype of the child to phase using this trio genotype combination + * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) + * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child + */ + public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ + phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); + phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); } - //Special case for Het/Het/Het as it is ambiguous - else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ - phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - phaseSingleIndividualAlleles(father, FamilyMember.FATHER); - phaseSingleIndividualAlleles(child, FamilyMember.CHILD); - } - //All family members have genotypes and at least one of them is not Het - else{ - phaseFamilyAlleles(mother, father, child); - } - } - /** - * Applies the trio genotype combination to the given trio. - * @param ref: Reference allele - * @param alt: Alternate allele - * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination - * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination - * @param childGenotype: Genotype of the child to phase using this trio genotype combination - * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable) - * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child - */ - public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){ - phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER))); - phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD))); - } + private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ - private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + return genotype; - //Handle null, missing and unavailable genotypes - //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable - //genotype so it is safe to return the original genotype in this case. - if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) - return genotype; + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); - //Add the transmission probability - Map genotypeAttributes = new HashMap(); - genotypeAttributes.putAll(genotype.getAttributes()); - if(transmissionProb>NO_TRANSMISSION_PROB) - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); + ArrayList phasedAlleles = new ArrayList(2); + for(Allele allele : phasedGenotype.getAlleles()){ + if(allele.isReference()) + phasedAlleles.add(refAllele); + else if(allele.isNonReference()) + phasedAlleles.add(altAllele); + //At this point there should not be any other alleles left + else + throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); - ArrayList phasedAlleles = new ArrayList(2); - for(Allele allele : phasedGenotype.getAlleles()){ - if(allele.isReference()) - phasedAlleles.add(refAllele); - else if(allele.isNonReference()) - phasedAlleles.add(altAllele); - //At this point there should not be any other alleles left + } + + //Compute the new Log10Error if the genotype is different from the original genotype + double negLog10Error; + if(genotype.getType() == phasedGenotype.getType()) + negLog10Error = genotype.getNegLog10PError(); else - throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString())); + negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); + return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); } - //Compute the new Log10Error if the genotype is different from the original genotype - double negLog10Error; - if(genotype.getType() == phasedGenotype.getType()) - negLog10Error = genotype.getNegLog10PError(); - else - negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType()); - - return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased()); - } - } From 1f044faedd01317b71c3abf91ac935beb6581223 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 26 Oct 2011 19:57:09 +0200 Subject: [PATCH 11/27] - Genotype assignment in case of equally likeli combination is now random - Genotype combinations with 0 confidence are now left unphased --- .../walkers/phasing/PhaseByTransmission.java | 65 +++++++++++++------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 956cf7c89..f5e9ce6ea 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -109,6 +109,9 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_HET_HET_HET = 4; private final Byte NUM_VIOLATIONS = 5; + //Random number generator + private Random rand = new Random(); + private enum FamilyMember { MOTHER, FATHER, @@ -309,17 +312,22 @@ public class PhaseByTransmission extends RodWalker, HashMa private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){ + int phredScoreTransmission = -1; + if(transmissionProb != NO_TRANSMISSION_PROB) + phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); + //Handle null, missing and unavailable genotypes //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable //genotype so it is safe to return the original genotype in this case. - if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) return genotype; //Add the transmission probability Map genotypeAttributes = new HashMap(); genotypeAttributes.putAll(genotype.getAttributes()); if(transmissionProb>NO_TRANSMISSION_PROB) - genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb))); + genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); ArrayList phasedAlleles = new ArrayList(2); for(Allele allele : phasedGenotype.getAlleles()){ @@ -538,36 +546,46 @@ public class PhaseByTransmission extends RodWalker, HashMa //Prior vars double bestConfigurationLikelihood = 0.0; double norm = 0.0; - boolean isMV = false; - int bestConfigurationGenotypeDiffs=4; - Genotype.Type bestMotherGenotype = getTypeSafeNull(mother); - Genotype.Type bestFatherGenotype = getTypeSafeNull(father); - Genotype.Type bestChildGenotype = getTypeSafeNull(child); + int configuration_index =0; + ArrayList isMV = new ArrayList(); + isMV.add(false); + ArrayList bestMotherGenotype = new ArrayList(); + bestMotherGenotype.add(getTypeSafeNull(mother)); + ArrayList bestFatherGenotype = new ArrayList(); + bestFatherGenotype.add(getTypeSafeNull(father)); + ArrayList bestChildGenotype = new ArrayList(); + bestChildGenotype.add(getTypeSafeNull(child)); //Get the most likely combination //Only check for most likely combination if at least a parent and the child have genotypes if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){ int mvCount; double configurationLikelihood; - int configurationGenotypeDiffs; for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ for(Map.Entry childGenotype : childLikelihoods.entrySet()){ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); norm += configurationLikelihood; - configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey()); //Keep this combination if //It has a better likelihood //Or it has the same likelihood but requires less changes from original genotypes - if ((configurationLikelihood > bestConfigurationLikelihood) || - (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) { + if (configurationLikelihood > bestConfigurationLikelihood){ bestConfigurationLikelihood = configurationLikelihood; - bestMotherGenotype = motherGenotype.getKey(); - bestFatherGenotype = fatherGenotype.getKey(); - bestChildGenotype = childGenotype.getKey(); - isMV = mvCount>0; - bestConfigurationGenotypeDiffs=configurationGenotypeDiffs; + isMV.clear(); + isMV.add(mvCount>0); + bestMotherGenotype.clear(); + bestMotherGenotype.add(motherGenotype.getKey()); + bestFatherGenotype.clear(); + bestFatherGenotype.add(fatherGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestMotherGenotype.add(motherGenotype.getKey()); + bestFatherGenotype.add(fatherGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + isMV.add(mvCount>0); } } } @@ -575,17 +593,22 @@ public class PhaseByTransmission extends RodWalker, HashMa //normalize the best configuration probability bestConfigurationLikelihood = bestConfigurationLikelihood / norm; + + //In case of multiple equally likely combinations, take a random one + if(bestMotherGenotype.size()>1){ + configuration_index = rand.nextInt(bestMotherGenotype.size()-1); + } + } else{ bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } - //Get the phased alleles for the genotype configuration - TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype); + TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); - //Return the phased genotypes - phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return isMV; + //Return the phased genotypes + phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); + return isMV.get(configuration_index); } From b91a9c4711d60b361a226a517a97e68217fdd5cf Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 2 Nov 2011 08:04:01 +0100 Subject: [PATCH 12/27] - Fixed parent/child pairs handling (was crashing before) - Added parent/child pair reporting --- .../walkers/phasing/PhaseByTransmission.java | 151 ++++++++++++------ 1 file changed, 101 insertions(+), 50 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index f5e9ce6ea..5b2024f96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -105,9 +105,13 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1; private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; - private final Byte NUM_HET = 3; - private final Byte NUM_HET_HET_HET = 4; - private final Byte NUM_VIOLATIONS = 5; + private final Byte NUM_TRIO_HET_HET_HET = 3; + private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; + private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; + private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; + private final Byte NUM_PAIR_HET_HET = 8; + private final Byte NUM_PAIR_VIOLATIONS = 9; //Random number generator private Random rand = new Random(); @@ -172,6 +176,7 @@ public class PhaseByTransmission extends RodWalker, HashMa if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false)); trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false)); + return; } ArrayList parentAlleles = getAlleles(parentGenotype); @@ -612,6 +617,48 @@ public class PhaseByTransmission extends RodWalker, HashMa } + + private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){ + + //Increment metrics counters + if(parent.isCalled() && child.isCalled()){ + counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1); + if(parent.isPhased()) + counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); + else{ + if(parent.isHet() && child.isHet()) + counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); + + else if(isMV) + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1); + } + }else{ + counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); + } + + } + + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){ + + //Increment metrics counters + if(mother.isCalled() && father.isCalled() && child.isCalled()){ + counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1); + if(mother.isPhased()) + counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); + + else{ + if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); + + else if(isMV) + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + + } + }else{ + counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1); + } + } + /** * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype * @@ -623,13 +670,17 @@ public class PhaseByTransmission extends RodWalker, HashMa @Override public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - //Local cars to avoid lookups on increment - int numTrioGenotypesCalled = 0; - int numTrioGenotypesNoCall = 0; - int numTrioGenotypesPhased = 0; - int numHet = 0 ; - int numHetHetHet = 0; - int numMVs = 0; + HashMap metricsCounters = new HashMap(10); + metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); @@ -654,30 +705,26 @@ public class PhaseByTransmission extends RodWalker, HashMa Genotype phasedFather = trioGenotypes.get(1); Genotype phasedChild = trioGenotypes.get(2); - genotypeMap.put(phasedMother.getSampleName(), phasedMother); - genotypeMap.put(phasedFather.getSampleName(), phasedFather); - genotypeMap.put(phasedChild.getSampleName(), phasedChild); - - //Increment metrics counters - if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){ - numTrioGenotypesCalled++; - if(phasedMother.isPhased()) - numTrioGenotypesPhased++; - - if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){ - numHet++; - if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()) - numHetHetHet++; + //Fill the genotype map with the new genotypes and increment metrics counters + genotypeMap.put(phasedChild.getSampleName(),phasedChild); + if(mother != null){ + genotypeMap.put(phasedMother.getSampleName(), phasedMother); + if(father != null){ + genotypeMap.put(phasedFather.getSampleName(), phasedFather); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters); } - - if(isMV){ - numMVs++; - if(mvFile != null) - mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); - } - }else{ - numTrioGenotypesNoCall++; + else + updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters); } + else{ + genotypeMap.put(phasedFather.getSampleName(),phasedFather); + updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters); + } + + //Report violation if set so + //TODO: ADAPT FOR PAIRS TOO!! + if(isMV && mvFile != null) + mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); } @@ -686,14 +733,6 @@ public class PhaseByTransmission extends RodWalker, HashMa vcfWriter.add(newvc); } - - HashMap metricsCounters = new HashMap(5); - metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled); - metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall); - metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased); - metricsCounters.put(NUM_HET,numHet); - metricsCounters.put(NUM_HET_HET_HET,numHetHetHet); - metricsCounters.put(NUM_VIOLATIONS,numMVs); return metricsCounters; } @@ -704,13 +743,17 @@ public class PhaseByTransmission extends RodWalker, HashMa */ @Override public HashMap reduceInit() { - HashMap metricsCounters = new HashMap(5); + HashMap metricsCounters = new HashMap(10); metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0); metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0); metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0); - metricsCounters.put(NUM_HET,0); - metricsCounters.put(NUM_HET_HET_HET,0); - metricsCounters.put(NUM_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_HET_HET_HET,0); + metricsCounters.put(NUM_TRIO_VIOLATIONS,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0); + metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); + metricsCounters.put(NUM_PAIR_HET_HET,0); + metricsCounters.put(NUM_PAIR_VIOLATIONS,0); return metricsCounters; } @@ -726,9 +769,13 @@ public class PhaseByTransmission extends RodWalker, HashMa sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED)); sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL)); sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED)); - sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET)); - sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET)); - sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS)); + sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET)); + sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS)); + sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED)); + sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL)); + sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); + sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); + sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); return sum; } @@ -742,8 +789,12 @@ public class PhaseByTransmission extends RodWalker, HashMa logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED)); logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); - logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET)); - logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET)); - logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS)); + logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); + logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); + logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); + logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); + logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); + logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); } } From 119ca7d742371423060bcc7150f908e1cb04de8a Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 2 Nov 2011 08:22:33 +0100 Subject: [PATCH 13/27] Fixed a bug in parent/child pairs reporting causing a crash in case the -mvf option was used and mother was not provided --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 5b2024f96..806a84e9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -681,6 +681,7 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + String mvfLine = ""; if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); @@ -712,19 +713,23 @@ public class PhaseByTransmission extends RodWalker, HashMa if(father != null){ genotypeMap.put(phasedFather.getSampleName(), phasedFather); updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } - else + else{ updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + } } else{ genotypeMap.put(phasedFather.getSampleName(),phasedFather); updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters); + mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } //Report violation if set so //TODO: ADAPT FOR PAIRS TOO!! if(isMV && mvFile != null) - mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString())); + mvFile.println(mvfLine); } From 19ad5b635ad46f119ed1beccccf7bb2bc6a18aef Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 2 Nov 2011 18:35:31 +0100 Subject: [PATCH 14/27] - Calculation of parent/child pairs corrected - Separated the reporting of single and double mendelian violations in trios --- .../walkers/phasing/PhaseByTransmission.java | 177 ++++++++++++------ 1 file changed, 120 insertions(+), 57 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 806a84e9d..3c39b58d3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -107,6 +107,7 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_TRIO_GENOTYPES_PHASED = 2; private final Byte NUM_TRIO_HET_HET_HET = 3; private final Byte NUM_TRIO_VIOLATIONS = 4; + private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10; private final Byte NUM_PAIR_GENOTYPES_CALLED = 5; private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6; private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; @@ -507,17 +508,14 @@ public class PhaseByTransmission extends RodWalker, HashMa return count; } - //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single - //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0 + //Get a Map of genotype likelihoods. + //In case of null, unavailable or no call, all likelihoods are 1/3. private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isAvailable()){ + if(genotype == null || !genotype.isCalled()){ EnumMap likelihoods = new EnumMap(Genotype.Type.class); - likelihoods.put(Genotype.Type.UNAVAILABLE,1.0); - return likelihoods; - } - else if(genotype.isNoCall()){ - EnumMap likelihoods = new EnumMap(Genotype.Type.class); - likelihoods.put(Genotype.Type.NO_CALL,1.0); + likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0); + likelihoods.put(Genotype.Type.HET,1.0/3.0); + likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0); return likelihoods; } return genotype.getLikelihoods().getAsMap(true); @@ -541,57 +539,113 @@ public class PhaseByTransmission extends RodWalker, HashMa * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child * @return */ - private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { + private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) { - //Get the PL - Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother); - Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father); + //Check whether it is a pair or trio + //Always assign the first parent as the parent having genotype information in pairs + //Always assign the mother as the first parent in trios + int parentsCalled = 0; + Map firstParentLikelihoods; + Map secondParentLikelihoods; + Genotype.Type pairSecondParentGenotype = null; + if(mother == null || !mother.isCalled()){ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); + if(father != null && father.isCalled()) + parentsCalled = 1; + } + else{ + firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + if(father == null || !father.isCalled()){ + parentsCalled = 1; + pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); + }else{ + parentsCalled = 2; + } + } Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); //Prior vars double bestConfigurationLikelihood = 0.0; double norm = 0.0; int configuration_index =0; - ArrayList isMV = new ArrayList(); - isMV.add(false); - ArrayList bestMotherGenotype = new ArrayList(); - bestMotherGenotype.add(getTypeSafeNull(mother)); - ArrayList bestFatherGenotype = new ArrayList(); - bestFatherGenotype.add(getTypeSafeNull(father)); + ArrayList bestMVCount = new ArrayList(); + bestMVCount.add(0); + + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); ArrayList bestChildGenotype = new ArrayList(); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); bestChildGenotype.add(getTypeSafeNull(child)); //Get the most likely combination //Only check for most likely combination if at least a parent and the child have genotypes - if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){ + if(child.isCalled() && parentsCalled > 0){ int mvCount; - double configurationLikelihood; - for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){ - for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){ - for(Map.Entry childGenotype : childLikelihoods.entrySet()){ - mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey()); - configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue(); + int cumulativeMVCount = 0; + double configurationLikelihood = 0; + for(Map.Entry childGenotype : childLikelihoods.entrySet()){ + for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){ + for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){ + mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); + //For parent/child pairs, sum over the possible genotype configurations of the missing parent + if(parentsCalled<2){ + cumulativeMVCount += mvCount; + configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + } + //Evaluate configurations of trios + else{ + configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue(); + norm += configurationLikelihood; + //Keep this combination if + //It has a better likelihood + //Or it has the same likelihood but requires less changes from original genotypes + if (configurationLikelihood > bestConfigurationLikelihood){ + bestConfigurationLikelihood = configurationLikelihood; + bestMVCount.clear(); + bestMVCount.add(mvCount); + bestFirstParentGenotype.clear(); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.clear(); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.clear(); + bestChildGenotype.add(childGenotype.getKey()); + } + else if(configurationLikelihood == bestConfigurationLikelihood) { + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(secondParentGenotype.getKey()); + bestChildGenotype.add(childGenotype.getKey()); + bestMVCount.add(mvCount); + } + } + } + //Evaluate configurations of parent/child pairs + if(parentsCalled<2){ norm += configurationLikelihood; //Keep this combination if //It has a better likelihood //Or it has the same likelihood but requires less changes from original genotypes if (configurationLikelihood > bestConfigurationLikelihood){ bestConfigurationLikelihood = configurationLikelihood; - isMV.clear(); - isMV.add(mvCount>0); - bestMotherGenotype.clear(); - bestMotherGenotype.add(motherGenotype.getKey()); - bestFatherGenotype.clear(); - bestFatherGenotype.add(fatherGenotype.getKey()); + bestMVCount.clear(); + bestMVCount.add(cumulativeMVCount/3); bestChildGenotype.clear(); + bestFirstParentGenotype.clear(); + bestSecondParentGenotype.clear(); bestChildGenotype.add(childGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); } else if(configurationLikelihood == bestConfigurationLikelihood) { - bestMotherGenotype.add(motherGenotype.getKey()); - bestFatherGenotype.add(fatherGenotype.getKey()); + bestFirstParentGenotype.add(firstParentGenotype.getKey()); + bestSecondParentGenotype.add(pairSecondParentGenotype); bestChildGenotype.add(childGenotype.getKey()); - isMV.add(mvCount>0); + bestMVCount.add(cumulativeMVCount/3); } + configurationLikelihood = 0; } } } @@ -600,8 +654,8 @@ public class PhaseByTransmission extends RodWalker, HashMa bestConfigurationLikelihood = bestConfigurationLikelihood / norm; //In case of multiple equally likely combinations, take a random one - if(bestMotherGenotype.size()>1){ - configuration_index = rand.nextInt(bestMotherGenotype.size()-1); + if(bestFirstParentGenotype.size()>1){ + configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1); } } @@ -609,16 +663,20 @@ public class PhaseByTransmission extends RodWalker, HashMa bestConfigurationLikelihood = NO_TRANSMISSION_PROB; } - TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + TrioPhase phasedTrioGenotypes; + if(parentsCalled < 2 && mother == null || !mother.isCalled()) + phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); + else + phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index)); //Return the phased genotypes phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes); - return isMV.get(configuration_index); + return bestMVCount.get(configuration_index); } - private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){ + private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){ //Increment metrics counters if(parent.isCalled() && child.isCalled()){ @@ -626,11 +684,9 @@ public class PhaseByTransmission extends RodWalker, HashMa if(parent.isPhased()) counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1); else{ + counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount); if(parent.isHet() && child.isHet()) counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1); - - else if(isMV) - counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1); } }else{ counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1); @@ -638,7 +694,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } - private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){ + private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){ //Increment metrics counters if(mother.isCalled() && father.isCalled() && child.isCalled()){ @@ -647,11 +703,14 @@ public class PhaseByTransmission extends RodWalker, HashMa counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1); else{ - if(mother.isHet() && father.isHet() && child.isHet()) - counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); - - else if(isMV) - counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + if(mvCount > 0){ + if(mvCount >1) + counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1); + else + counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1); + } + else if(mother.isHet() && father.isHet() && child.isHet()) + counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1); } }else{ @@ -681,14 +740,15 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - String mvfLine = ""; + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + String mvfLine; if (tracker != null) { VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); Map genotypeMap = vc.getGenotypes(); - boolean isMV; + int mvCount; for (Sample sample : trios) { Genotype mother = vc.getGenotype(sample.getMaternalID()); @@ -700,7 +760,7 @@ public class PhaseByTransmission extends RodWalker, HashMa continue; ArrayList trioGenotypes = new ArrayList(3); - isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); + mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes); Genotype phasedMother = trioGenotypes.get(0); Genotype phasedFather = trioGenotypes.get(1); @@ -712,23 +772,23 @@ public class PhaseByTransmission extends RodWalker, HashMa genotypeMap.put(phasedMother.getSampleName(), phasedMother); if(father != null){ genotypeMap.put(phasedFather.getSampleName(), phasedFather); - updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters); + updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } else{ - updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters); + updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } } else{ genotypeMap.put(phasedFather.getSampleName(),phasedFather); - updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters); + updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } //Report violation if set so //TODO: ADAPT FOR PAIRS TOO!! - if(isMV && mvFile != null) + if(mvCount>0 && mvFile != null) mvFile.println(mvfLine); } @@ -759,6 +819,7 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); return metricsCounters; } @@ -781,6 +842,7 @@ public class PhaseByTransmission extends RodWalker, HashMa sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED)); sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); + sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); return sum; } @@ -795,7 +857,8 @@ public class PhaseByTransmission extends RodWalker, HashMa logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL)); logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED)); logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET)); - logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS)); + logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS)); logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED)); logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL)); logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); From 893787de535fde77b56d02e5c006400e021f76c1 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 3 Nov 2011 13:04:11 +0100 Subject: [PATCH 15/27] Functions getAsMap and getNegLog10GQ now handle missing genotype case. --- .../sting/utils/variantcontext/GenotypeLikelihoods.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 47c93bb1b..8c8e4f257 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -101,9 +101,12 @@ public class GenotypeLikelihoods { } //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values + //Returns null in case of missing likelihoods public EnumMap getAsMap(boolean normalizeFromLog10){ //Make sure that the log10likelihoods are set double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector(); + if(likelihoods == null) + return null; EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class); likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]); likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]); @@ -112,10 +115,13 @@ public class GenotypeLikelihoods { } //Return the neg log10 Genotype Quality (GQ) for the given genotype + //Returns Double.NEGATIVE_INFINITY in case of missing genotype public double getNegLog10GQ(Genotype.Type genotype){ double qual = Double.NEGATIVE_INFINITY; EnumMap likelihoods = getAsMap(false); + if(likelihoods == null) + return qual; for(Map.Entry likelihood : likelihoods.entrySet()){ if(likelihood.getKey() == genotype) continue; From 385a6abec144fc72c726f88216cdcd9c3a4fa435 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Thu, 3 Nov 2011 13:04:53 +0100 Subject: [PATCH 16/27] Fixed a bug that wrongly swapped the mother and father genotypes in case the child genotype missing. --- .../gatk/walkers/phasing/PhaseByTransmission.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 3c39b58d3..244527212 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -547,10 +547,15 @@ public class PhaseByTransmission extends RodWalker, HashMa int parentsCalled = 0; Map firstParentLikelihoods; Map secondParentLikelihoods; + ArrayList bestFirstParentGenotype = new ArrayList(); + ArrayList bestSecondParentGenotype = new ArrayList(); + ArrayList bestChildGenotype = new ArrayList(); Genotype.Type pairSecondParentGenotype = null; if(mother == null || !mother.isCalled()){ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); + bestFirstParentGenotype.add(getTypeSafeNull(father)); + bestSecondParentGenotype.add(getTypeSafeNull(mother)); pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); if(father != null && father.isCalled()) parentsCalled = 1; @@ -558,6 +563,8 @@ public class PhaseByTransmission extends RodWalker, HashMa else{ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father); + bestFirstParentGenotype.add(getTypeSafeNull(mother)); + bestSecondParentGenotype.add(getTypeSafeNull(father)); if(father == null || !father.isCalled()){ parentsCalled = 1; pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); @@ -566,6 +573,7 @@ public class PhaseByTransmission extends RodWalker, HashMa } } Map childLikelihoods = getLikelihoodsAsMapSafeNull(child); + bestChildGenotype.add(getTypeSafeNull(child)); //Prior vars double bestConfigurationLikelihood = 0.0; @@ -574,13 +582,6 @@ public class PhaseByTransmission extends RodWalker, HashMa ArrayList bestMVCount = new ArrayList(); bestMVCount.add(0); - ArrayList bestFirstParentGenotype = new ArrayList(); - ArrayList bestSecondParentGenotype = new ArrayList(); - ArrayList bestChildGenotype = new ArrayList(); - bestFirstParentGenotype.add(getTypeSafeNull(mother)); - bestSecondParentGenotype.add(getTypeSafeNull(father)); - bestChildGenotype.add(getTypeSafeNull(child)); - //Get the most likely combination //Only check for most likely combination if at least a parent and the child have genotypes if(child.isCalled() && parentsCalled > 0){ From 571c724cfdc378dac05573146f64e3da7e6424ec Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Tue, 8 Nov 2011 15:15:51 +0100 Subject: [PATCH 17/27] Added reporting of the number of genotypes updated. --- .../walkers/phasing/PhaseByTransmission.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 244527212..6394e0e24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -113,6 +113,7 @@ public class PhaseByTransmission extends RodWalker, HashMa private final Byte NUM_PAIR_GENOTYPES_PHASED = 7; private final Byte NUM_PAIR_HET_HET = 8; private final Byte NUM_PAIR_VIOLATIONS = 9; + private final Byte NUM_GENOTYPES_MODIFIED = 11; //Random number generator private Random rand = new Random(); @@ -742,6 +743,8 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + String mvfLine; if (tracker != null) { @@ -775,15 +778,21 @@ public class PhaseByTransmission extends RodWalker, HashMa genotypeMap.put(phasedFather.getSampleName(), phasedFather); updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); + if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); } else{ updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters); + if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } } else{ genotypeMap.put(phasedFather.getSampleName(),phasedFather); updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters); + if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) + metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()); } @@ -820,7 +829,9 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0); metricsCounters.put(NUM_PAIR_HET_HET,0); metricsCounters.put(NUM_PAIR_VIOLATIONS,0); - metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0); + metricsCounters.put(NUM_GENOTYPES_MODIFIED,0); + return metricsCounters; } @@ -844,6 +855,8 @@ public class PhaseByTransmission extends RodWalker, HashMa sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET)); sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS)); sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS)); + sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED)); + return sum; } @@ -865,5 +878,7 @@ public class PhaseByTransmission extends RodWalker, HashMa logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED)); logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET)); logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS)); + logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED)); + } } From 34acf8b9789a263b991b65477e28b8fe3f25881d Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 14 Nov 2011 10:47:02 +0100 Subject: [PATCH 18/27] Added Unit tests for new methods in GenotypeLikelihoods --- .../GenotypeLikelihoodsUnitTest.java | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 9243588ab..f3d0dedcd 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import java.util.EnumMap; + /** * Basic unit test for Genotype likelihoods objects @@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest { gl.getAsVector(); } + @Test + public void testGetAsMap(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(v); + //Log scale + EnumMap glMap = gl.getAsMap(false); + Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Linear scale + glMap = gl.getAsMap(true); + double [] vl = MathUtils.normalizeFromLog10(v); + Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF)); + Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET)); + Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR)); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + glMap = gl.getAsMap(false); + Assert.assertNull(glMap); + + } + + @Test + public void testGetNegLog10GQ(){ + GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString); + + //GQ for the best guess genotype + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),3.9); + + double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); + + //GQ for the other genotypes + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + + //Test missing likelihoods + gl = new GenotypeLikelihoods("."); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY); + Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY); + + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { From 6881d4800c3f782255152f63d453f12451a96509 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Mon, 14 Nov 2011 10:47:51 +0100 Subject: [PATCH 19/27] Added Integration tests for Phasing by Transmission --- .../PhaseByTransmissionIntegrationTest.java | 122 +++++++++++++++++- 1 file changed, 115 insertions(+), 7 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index c663c1dd7..2cd76e7a5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -6,23 +6,131 @@ import org.testng.annotations.Test; import java.util.Arrays; public class PhaseByTransmissionIntegrationTest extends WalkerTest { - private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission"; - private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; + private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/"; + private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped"; + private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; + private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; + private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; + private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; + //Tests using PbT on all genotypes with default parameters + //And all reporting options @Test - public void testBasicFunctionality() { + public void testTrueNegativeMV() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", "-NO_HEADER", "-R " + b37KGReference, - "--variant " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", + "--variant " + TNTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14") + ); + executeTest("testTrueNegativeMV", spec); + } + + @Test + public void testTruePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + TPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556") + ); + executeTest("testTruePositiveMV", spec); + } + + @Test + public void testFalsePositiveMV() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee") + ); + executeTest("testFalsePositiveMV", spec); + } + + @Test + public void testSpecialCases() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + SpecialTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3") + ); + executeTest("testSpecialCases", spec); + } + + //Test using a different prior + //Here the FP file is used but as the prior is lowered, 3 turn to TP + @Test + public void testPriorOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-prior 1e-4", + "-mvf %s", + "-o %s" + ), + 2, + Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e") + ); + executeTest("testPriorOption", spec); + } + + //Test when running without MV reporting option + //This is the exact same test file as FP but should not generate a .mvf file + @Test + public void testMVFileOption() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "-NO_HEADER", + "-R " + b37KGReference, + "--variant " + FPTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", "-o %s" ), 1, - Arrays.asList("") + Arrays.asList("398724bc1e65314cc5ee92706e05a3ee") ); - executeTest("testBasicFunctionality", spec); + executeTest("testMVFileOption", spec); } + } From b66556f4a0faf3036275741b07e0bace4df34943 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Nov 2011 09:22:57 -0500 Subject: [PATCH 20/27] Update error message so that it's clear ReadPair Walkers are exceptions --- .../src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f8e87aa58..2ceb4ab46 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -480,7 +480,7 @@ public class GenomeAnalysisEngine { } } else if (walker instanceof ReadPairWalker) { if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers can only walk over query name-sorted data. Please resort your input BAM file."); + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); if(intervals != null && !intervals.isEmpty()) throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); From b45d10e6f1fc3443b9cfada54f4554f5f2e6d34f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Nov 2011 10:23:59 -0500 Subject: [PATCH 21/27] The DP in the FORMAT field (per sample) must also use the representative count or else it's always 1 for reduced reads. --- .../genotyper/GenotypeLikelihoodsCalculationModel.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 489e963e8..74c55dbfe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Map; @@ -83,8 +81,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param priors priors to use for GLs * @param GLs hash of sample->GL to fill in * @param alternateAlleleToUse the alternate allele to use, null if not set - * - * @param useBAQedPileup + * @param useBAQedPileup should we use the BAQed pileup or the raw one? * @return genotype likelihoods per sample for AA, AB, BB */ public abstract Allele getLikelihoods(RefMetaDataTracker tracker, @@ -93,13 +90,14 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, Map GLs, - Allele alternateAlleleToUse, boolean useBAQedPileup); + Allele alternateAlleleToUse, + boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; for ( PileupElement p : pileup ) { if ( BaseUtils.isRegularBase( p.getBase() ) ) - count++; + count += p.getRepresentativeCount(); } return count; From 7fada320a9dc36b8ff335dcfbc334b20dd81475b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Nov 2011 14:53:27 -0500 Subject: [PATCH 23/27] The right fix for this test is just to delete it. --- .../org/broadinstitute/sting/utils/SimpleTimerUnitTest.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 3f5d05e66..7a2696b7b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -41,11 +41,6 @@ public class SimpleTimerUnitTest extends BaseTest { double t6 = t.getElapsedTime(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); - - t.stop().start(); - Assert.assertTrue(t.isRunning(), "second started timer isn't running"); - Assert.assertTrue(t.getElapsedTime() >= 0.0, "elapsed time should have been reset"); - Assert.assertTrue(t.getElapsedTime() < t6, "elapsed time isn't less than time before start call"); // we should have effective no elapsed time } private final static void idleLoop() { From 0d163e3f52b4de1b54c2213aea5aca14f336acd0 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Tue, 15 Nov 2011 16:05:20 -0500 Subject: [PATCH 24/27] SnpEff 2.0.4 support -Modified the SnpEff parser to work with the SnpEff 2.0.4 VCF output format -Assigning functional classes and effect impacts now handled directly by SnpEff rather than the GATK -Removed support for SnpEff 2.0.2, as we no longer trust the output of that version since it doesn't exclude effects associated with certain nonsensical transcripts. These effects are excluded as of 2.0.4. -Updated unit and integration tests This support is based on a *release-candidate* of SnpEff 2.0.4, and so is subject to change between now and the next GATK release. --- .../sting/gatk/walkers/annotator/SnpEff.java | 180 +++++++++--------- .../walkers/annotator/SnpEffUnitTest.java | 20 +- .../VariantAnnotatorIntegrationTest.java | 4 +- .../VariantEvalIntegrationTest.java | 6 +- 4 files changed, 106 insertions(+), 104 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 85977bf8e..1956dac6c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -56,7 +56,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // We refuse to parse SnpEff output files generated by unsupported versions, or // lacking a SnpEff version number in the VCF header: - public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.2" }; + public static final String[] SUPPORTED_SNPEFF_VERSIONS = { "2.0.4" }; public static final String SNPEFF_VCF_HEADER_VERSION_LINE_KEY = "SnpEffVersion"; public static final String SNPEFF_VCF_HEADER_COMMAND_LINE_KEY = "SnpEffCmd"; @@ -77,13 +77,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public enum InfoFieldKey { EFFECT_KEY ("SNPEFF_EFFECT", -1), IMPACT_KEY ("SNPEFF_IMPACT", 0), - CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 1), - AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 2), - GENE_NAME_KEY ("SNPEFF_GENE_NAME", 3), - GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 4), - TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 6), - EXON_ID_KEY ("SNPEFF_EXON_ID", 7), - FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", -1); + FUNCTIONAL_CLASS_KEY ("SNPEFF_FUNCTIONAL_CLASS", 1), + CODON_CHANGE_KEY ("SNPEFF_CODON_CHANGE", 2), + AMINO_ACID_CHANGE_KEY ("SNPEFF_AMINO_ACID_CHANGE", 3), + GENE_NAME_KEY ("SNPEFF_GENE_NAME", 4), + GENE_BIOTYPE_KEY ("SNPEFF_GENE_BIOTYPE", 5), + TRANSCRIPT_ID_KEY ("SNPEFF_TRANSCRIPT_ID", 7), + EXON_ID_KEY ("SNPEFF_EXON_ID", 8); // Actual text of the key private final String keyName; @@ -110,70 +110,53 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio // are validated against this list. public enum EffectType { // High-impact effects: - FRAME_SHIFT (EffectFunctionalClass.NONE, false), - STOP_GAINED (EffectFunctionalClass.NONSENSE, false), - START_LOST (EffectFunctionalClass.NONE, false), - SPLICE_SITE_ACCEPTOR (EffectFunctionalClass.NONE, false), - SPLICE_SITE_DONOR (EffectFunctionalClass.NONE, false), - EXON_DELETED (EffectFunctionalClass.NONE, false), - STOP_LOST (EffectFunctionalClass.NONE, false), + SPLICE_SITE_ACCEPTOR, + SPLICE_SITE_DONOR, + START_LOST, + EXON_DELETED, + FRAME_SHIFT, + STOP_GAINED, + STOP_LOST, // Moderate-impact effects: - NON_SYNONYMOUS_CODING (EffectFunctionalClass.MISSENSE, false), - CODON_CHANGE (EffectFunctionalClass.NONE, false), - CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_INSERTION (EffectFunctionalClass.NONE, false), - CODON_DELETION (EffectFunctionalClass.NONE, false), - CODON_CHANGE_PLUS_CODON_DELETION (EffectFunctionalClass.NONE, false), - UTR_5_DELETED (EffectFunctionalClass.NONE, false), - UTR_3_DELETED (EffectFunctionalClass.NONE, false), + NON_SYNONYMOUS_CODING, + CODON_CHANGE, + CODON_INSERTION, + CODON_CHANGE_PLUS_CODON_INSERTION, + CODON_DELETION, + CODON_CHANGE_PLUS_CODON_DELETION, + UTR_5_DELETED, + UTR_3_DELETED, // Low-impact effects: - SYNONYMOUS_CODING (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_START (EffectFunctionalClass.SILENT, false), - SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - NON_SYNONYMOUS_STOP (EffectFunctionalClass.SILENT, false), - START_GAINED (EffectFunctionalClass.NONE, false), + SYNONYMOUS_START, + NON_SYNONYMOUS_START, + START_GAINED, + SYNONYMOUS_CODING, + SYNONYMOUS_STOP, + NON_SYNONYMOUS_STOP, // Modifiers: - NONE (EffectFunctionalClass.NONE, true), - CHROMOSOME (EffectFunctionalClass.NONE, true), - INTERGENIC (EffectFunctionalClass.NONE, true), - UPSTREAM (EffectFunctionalClass.NONE, true), - UTR_5_PRIME (EffectFunctionalClass.NONE, true), - CDS (EffectFunctionalClass.NONE, true), - GENE (EffectFunctionalClass.NONE, true), - TRANSCRIPT (EffectFunctionalClass.NONE, true), - EXON (EffectFunctionalClass.NONE, true), - INTRON (EffectFunctionalClass.NONE, true), - UTR_3_PRIME (EffectFunctionalClass.NONE, true), - DOWNSTREAM (EffectFunctionalClass.NONE, true), - INTRON_CONSERVED (EffectFunctionalClass.NONE, true), - INTERGENIC_CONSERVED (EffectFunctionalClass.NONE, true), - REGULATION (EffectFunctionalClass.NONE, true), - CUSTOM (EffectFunctionalClass.NONE, true), - WITHIN_NON_CODING_GENE (EffectFunctionalClass.NONE, true); - - private final EffectFunctionalClass functionalClass; - private final boolean isModifier; - - EffectType ( EffectFunctionalClass functionalClass, boolean isModifier ) { - this.functionalClass = functionalClass; - this.isModifier = isModifier; - } - - public EffectFunctionalClass getFunctionalClass() { - return functionalClass; - } - - public boolean isModifier() { - return isModifier; - } + NONE, + CHROMOSOME, + CUSTOM, + CDS, + GENE, + TRANSCRIPT, + EXON, + INTRON_CONSERVED, + UTR_5_PRIME, + UTR_3_PRIME, + DOWNSTREAM, + INTRAGENIC, + INTERGENIC, + INTERGENIC_CONSERVED, + UPSTREAM, + REGULATION, + INTRON } - // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact. We take the additional step of - // classifying some of the LOW impact effects as MODIFIERs. + // SnpEff labels each effect as either LOW, MODERATE, or HIGH impact, or as a MODIFIER. public enum EffectImpact { MODIFIER (0), LOW (1), @@ -202,7 +185,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio UNKNOWN } - // We assign a functional class to each SnpEff effect. + // SnpEff assigns a functional class to each effect. public enum EffectFunctionalClass { NONE (0), SILENT (1), @@ -379,13 +362,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio public List getKeyNames() { return Arrays.asList( InfoFieldKey.EFFECT_KEY.getKeyName(), InfoFieldKey.IMPACT_KEY.getKeyName(), + InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), InfoFieldKey.GENE_NAME_KEY.getKeyName(), InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), - InfoFieldKey.EXON_ID_KEY.getKeyName(), - InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName() + InfoFieldKey.EXON_ID_KEY.getKeyName() ); } @@ -393,13 +376,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return Arrays.asList( new VCFInfoHeaderLine(InfoFieldKey.EFFECT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"), new VCFInfoHeaderLine(InfoFieldKey.IMPACT_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(EffectImpact.values())), + new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())), new VCFInfoHeaderLine(InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"), + new VCFInfoHeaderLine(InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant (in HGVS style)"), new VCFInfoHeaderLine(InfoFieldKey.GENE_NAME_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Gene biotype for the highest-impact effect resulting from the current variant"), new VCFInfoHeaderLine(InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"), - new VCFInfoHeaderLine(InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Functional class of the highest-impact effect resulting from the current variant: " + Arrays.toString(EffectFunctionalClass.values())) + new VCFInfoHeaderLine(InfoFieldKey.EXON_ID_KEY.getKeyName(), 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant") ); } @@ -409,6 +392,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio protected static class SnpEffEffect { private EffectType effect; private EffectImpact impact; + private EffectFunctionalClass functionalClass; private String codonChange; private String aminoAcidChange; private String geneName; @@ -420,16 +404,21 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private String parseError = null; private boolean isWellFormed = true; - private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 8; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_WARNING = 9; - private static final int NUMBER_OF_METADATA_FIELDS_UPON_ERROR = 10; + private static final int EXPECTED_NUMBER_OF_METADATA_FIELDS = 9; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR = 10; + private static final int NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR = 11; - // Note that contrary to the description for the EFF field layout that SnpEff adds to the VCF header, - // errors come after warnings, not vice versa: - private static final int SNPEFF_WARNING_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_WARNING - 1; - private static final int SNPEFF_ERROR_FIELD_INDEX = NUMBER_OF_METADATA_FIELDS_UPON_ERROR - 1; + // If there is either a warning OR an error, it will be in the last field. If there is both + // a warning AND an error, the warning will be in the second-to-last field, and the error will + // be in the last field. + private static final int SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR - 1; + private static final int SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 2; + private static final int SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR = NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR - 1; - private static final int SNPEFF_CODING_FIELD_INDEX = 5; + // Position of the field indicating whether the effect is coding or non-coding. This field is used + // in selecting the most significant effect, but is not included in the annotations we return + // since it can be deduced from the SNPEFF_GENE_BIOTYPE field. + private static final int SNPEFF_CODING_FIELD_INDEX = 6; public SnpEffEffect ( String effectName, String[] effectMetadata ) { parseEffectName(effectName); @@ -447,11 +436,14 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio private void parseEffectMetadata ( String[] effectMetadata ) { if ( effectMetadata.length != EXPECTED_NUMBER_OF_METADATA_FIELDS ) { - if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_WARNING ) { - parseError(String.format("SnpEff issued the following warning: %s", effectMetadata[SNPEFF_WARNING_FIELD_INDEX])); + if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_EITHER_WARNING_OR_ERROR ) { + parseError(String.format("SnpEff issued the following warning or error: \"%s\"", + effectMetadata[SNPEFF_WARNING_OR_ERROR_FIELD_UPON_SINGLE_ERROR])); } - else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_ERROR ) { - parseError(String.format("SnpEff issued the following error: %s", effectMetadata[SNPEFF_ERROR_FIELD_INDEX])); + else if ( effectMetadata.length == NUMBER_OF_METADATA_FIELDS_UPON_BOTH_WARNING_AND_ERROR ) { + parseError(String.format("SnpEff issued the following warning: \"%s\", and the following error: \"%s\"", + effectMetadata[SNPEFF_WARNING_FIELD_UPON_BOTH_WARNING_AND_ERROR], + effectMetadata[SNPEFF_ERROR_FIELD_UPON_BOTH_WARNING_AND_ERROR])); } else { parseError(String.format("Wrong number of effect metadata fields. Expected %d but found %d", @@ -461,23 +453,33 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return; } - if ( effect != null && effect.isModifier() ) { - impact = EffectImpact.MODIFIER; + // The impact field will never be empty, and should always contain one of the enumerated values: + try { + impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); } - else { + catch ( IllegalArgumentException e ) { + parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + } + + // The functional class field will be empty when the effect has no functional class associated with it: + if ( effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()].trim().length() > 0 ) { try { - impact = EffectImpact.valueOf(effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()]); + functionalClass = EffectFunctionalClass.valueOf(effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()]); } catch ( IllegalArgumentException e ) { - parseError(String.format("Unrecognized value for effect impact: %s", effectMetadata[InfoFieldKey.IMPACT_KEY.getFieldIndex()])); + parseError(String.format("Unrecognized value for effect functional class: %s", effectMetadata[InfoFieldKey.FUNCTIONAL_CLASS_KEY.getFieldIndex()])); } } + else { + functionalClass = EffectFunctionalClass.NONE; + } codonChange = effectMetadata[InfoFieldKey.CODON_CHANGE_KEY.getFieldIndex()]; aminoAcidChange = effectMetadata[InfoFieldKey.AMINO_ACID_CHANGE_KEY.getFieldIndex()]; geneName = effectMetadata[InfoFieldKey.GENE_NAME_KEY.getFieldIndex()]; geneBiotype = effectMetadata[InfoFieldKey.GENE_BIOTYPE_KEY.getFieldIndex()]; + // The coding field will be empty when SnpEff has no coding info for the effect: if ( effectMetadata[SNPEFF_CODING_FIELD_INDEX].trim().length() > 0 ) { try { coding = EffectCoding.valueOf(effectMetadata[SNPEFF_CODING_FIELD_INDEX]); @@ -534,7 +536,7 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio return true; } else if ( impact.isSameImpactAs(other.impact) ) { - return effect.getFunctionalClass().isHigherPriorityThan(other.effect.getFunctionalClass()); + return functionalClass.isHigherPriorityThan(other.functionalClass); } return false; @@ -545,13 +547,13 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio addAnnotation(annotations, InfoFieldKey.EFFECT_KEY.getKeyName(), effect.toString()); addAnnotation(annotations, InfoFieldKey.IMPACT_KEY.getKeyName(), impact.toString()); + addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), functionalClass.toString()); addAnnotation(annotations, InfoFieldKey.CODON_CHANGE_KEY.getKeyName(), codonChange); addAnnotation(annotations, InfoFieldKey.AMINO_ACID_CHANGE_KEY.getKeyName(), aminoAcidChange); addAnnotation(annotations, InfoFieldKey.GENE_NAME_KEY.getKeyName(), geneName); addAnnotation(annotations, InfoFieldKey.GENE_BIOTYPE_KEY.getKeyName(), geneBiotype); addAnnotation(annotations, InfoFieldKey.TRANSCRIPT_ID_KEY.getKeyName(), transcriptID); addAnnotation(annotations, InfoFieldKey.EXON_ID_KEY.getKeyName(), exonID); - addAnnotation(annotations, InfoFieldKey.FUNCTIONAL_CLASS_KEY.getKeyName(), effect.getFunctionalClass().toString()); return annotations; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java index 462abeba1..5c8fa32a8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUnitTest.java @@ -33,7 +33,7 @@ public class SnpEffUnitTest { @Test public void testParseWellFormedEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertTrue( effect.isWellFormed() && effect.isCoding() ); @@ -42,7 +42,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectNameEffect() { String effectName = "MADE_UP_EFFECT"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -51,7 +51,7 @@ public class SnpEffUnitTest { @Test public void testParseInvalidEffectImpactEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MEDIUM", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; + String[] effectMetadata = { "MEDIUM", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); @@ -60,27 +60,27 @@ public class SnpEffUnitTest { @Test public void testParseWrongNumberOfMetadataFieldsEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); Assert.assertFalse(effect.isWellFormed()); } @Test - public void testParseSnpEffWarningEffect() { + public void testParseSnpEffOneWarningOrErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_OR_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: SNPEFF_WARNING") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning or error: \"SNPEFF_WARNING_OR_ERROR_TEXT\"") ); } @Test - public void testParseSnpEffErrorEffect() { + public void testParseSnpEffBothWarningAndErrorEffect() { String effectName = "NON_SYNONYMOUS_CODING"; - String[] effectMetadata = { "MODERATE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "", "SNPEFF_ERROR" }; + String[] effectMetadata = { "MODERATE", "MISSENSE", "Aca/Gca", "T/A", "OR4F5", "protein_coding", "CODING", "ENST00000534990", "exon_1_69037_69829", "SNPEFF_WARNING_TEXT", "SNPEFF_ERROR_TEXT" }; SnpEffEffect effect = new SnpEffEffect(effectName, effectMetadata); - Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following error: SNPEFF_ERROR") ); + Assert.assertTrue( ! effect.isWellFormed() && effect.getParseError().equals("SnpEff issued the following warning: \"SNPEFF_WARNING_TEXT\", and the following error: \"SNPEFF_ERROR_TEXT\"") ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index bde4c4a8f..919e3d9bd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -148,9 +148,9 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T VariantAnnotator -R " + hg19Reference + " -NO_HEADER -o %s -A SnpEff --variant " + validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation + - "snpEff.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", + "snpEff2.0.4.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429", 1, - Arrays.asList("122321a85e448f21679f6ca15c5e22ad") + Arrays.asList("51258f5c880bd1ca3eb45a1711335c66") ); executeTest("Testing SnpEff annotations", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 3dceb9bd2..102d4715e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -21,16 +21,16 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-T VariantEval", "-R " + b37KGReference, "--dbsnp " + b37dbSNP132, - "--eval " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "--eval " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-noEV", "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", + "-L " + validationDataLocation + "snpEff2.0.4.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, - Arrays.asList("d9dcb352c53106f54fcc981f15d35a90") + Arrays.asList("a36414421621b377d6146d58d2fcecd0") ); executeTest("testFunctionClassWithSnpeff", spec); } From 7d77fc51f55948510e9d230864dbbe78e1f791b2 Mon Sep 17 00:00:00 2001 From: Laurent Francioli Date: Wed, 16 Nov 2011 03:32:43 -0500 Subject: [PATCH 25/27] Corrected bug causing PhaseByTransmission to crash in case of new Genotype.Type --- .../walkers/phasing/PhaseByTransmission.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 6394e0e24..847165e3e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -151,16 +151,16 @@ public class PhaseByTransmission extends RodWalker, HashMa alleles.add(VAR); alleles.add(VAR); } - else if(genotype == Genotype.Type.NO_CALL){ - alleles.add(NO_CALL); - alleles.add(NO_CALL); - } else{ return null; } return alleles; } + private boolean isPhasable(Genotype.Type genotype){ + return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR; + } + //Create a new Genotype based on information from a single individual //Homozygous genotypes will be set as phased, heterozygous won't be private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ @@ -271,21 +271,21 @@ public class PhaseByTransmission extends RodWalker, HashMa public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ //Take care of cases where one or more family members are no call - if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){ + if(!isPhasable(child)){ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } - else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){ + else if(!isPhasable(mother)){ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); - if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + if(!isPhasable(father)){ phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(child, FamilyMember.CHILD); } else phasePairAlleles(father, child, FamilyMember.FATHER); } - else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){ + else if(!isPhasable(father)){ phasePairAlleles(mother, child, FamilyMember.MOTHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER); } @@ -327,7 +327,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable //genotype so it is safe to return the original genotype in this case. //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. - if(phredScoreTransmission ==0 || genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall()) + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) return genotype; //Add the transmission probability From 7ac5cf8430e82822d008c35b99d37f35b151230e Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Wed, 16 Nov 2011 09:53:59 -0500 Subject: [PATCH 26/27] Getting rid of unsupported CountReadPairs walker in stable. Removal of remainder of pairs processing framework to follow in unstable. --- .../gatk/walkers/qc/CountPairsWalker.java | 140 ------------------ 1 file changed, 140 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java deleted file mode 100644 index e770418c1..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.walkers.ReadPairWalker; -import org.broadinstitute.sting.utils.collections.ExpandingArrayList; - -import java.io.PrintStream; -import java.util.Collection; -import java.util.List; - -/** - * Counts the number of read pairs encountered in a file sorted in - * query name order. Breaks counts down by total pairs and number - * of paired reads. - * - * - *

Input

- *

- * One or more bam files. - *

- * - *

Output

- *

- * Number of pairs seen. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T CountPairs \
- *   -o output.txt \
- *   -I input.bam
- * 
- * - * @author mhanna - */ -public class CountPairsWalker extends ReadPairWalker { - @Output - private PrintStream out; - - /** - * How many reads are the first in a pair, based on flag 0x0040 from the SAM spec. - */ - private long firstOfPair = 0; - - /** - * How many reads are the second in a pair, based on flag 0x0080 from the SAM spec. - */ - private long secondOfPair = 0; - - /** - * A breakdown of the total number of reads seen with exactly the same read name. - */ - private List pairCountsByType = new ExpandingArrayList(); - - /** - * Maps a read pair to a given reduce of type MapType. Semantics determined by subclasser. - * @param reads Collection of reads having the same name. - * @return Semantics defined by implementer. - */ - @Override - public Integer map(Collection reads) { - if(pairCountsByType.get(reads.size()) != null) - pairCountsByType.set(reads.size(),pairCountsByType.get(reads.size())+1); - else - pairCountsByType.set(reads.size(),1L); - - for(SAMRecord read: reads) { - if(read.getFirstOfPairFlag()) firstOfPair++; - if(read.getSecondOfPairFlag()) secondOfPair++; - } - - return 1; - } - - /** - * No pairs at the beginning of a traversal. - * @return 0 always. - */ - @Override - public Long reduceInit() { - return 0L; - } - - /** - * Combine number of pairs seen in this iteration (always 1) with total number of pairs - * seen in previous iterations. - * @param value Pairs in this iteration (1), from the map function. - * @param sum Count of all pairs in prior iterations. - * @return All pairs encountered in previous iterations + all pairs encountered in this iteration (sum + 1). - */ - @Override - public Long reduce(Integer value, Long sum) { - return value + sum; - } - - /** - * Print summary statistics over the entire traversal. - * @param sum A count of all read pairs viewed. - */ - @Override - public void onTraversalDone(Long sum) { - out.printf("Total number of pairs : %d%n",sum); - out.printf("Total number of first reads in pair : %d%n",firstOfPair); - out.printf("Total number of second reads in pair: %d%n",secondOfPair); - for(int i = 1; i < pairCountsByType.size(); i++) { - if(pairCountsByType.get(i) == null) - continue; - out.printf("Pairs of size %d: %d%n",i,pairCountsByType.get(i)); - } - } - -}