From 76dd816e709ef230231173b43076c4a423577a4a Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 12:47:27 +0200
Subject: [PATCH 01/44] Added getParents() -> returns an arrayList containing
the sample's parent(s) if available
---
.../broadinstitute/sting/gatk/samples/Sample.java | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
index b39fdd79d..a14d999ea 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
@@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
@@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial
return infoDB.getSample(paternalID);
}
+ public ArrayList getParents(){
+ ArrayList parents = new ArrayList(2);
+ Sample parent = getMother();
+ if(parent != null)
+ parents.add(parent);
+ parent = getFather();
+ if(parent != null)
+ parents.add(parent);
+ return parents;
+ }
+
/**
* Get gender of the sample
* @return property of key "gender" - must be of type Gender
From ef6a6fdfe499809fc3fd6282a460c2fd542963c5 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 12:49:18 +0200
Subject: [PATCH 02/44] Added getAsMap -> returns the likelihoods as an EnumMap
with Genotypes as keys and likelihoods as values.
---
.../utils/variantcontext/GenotypeLikelihoods.java | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index dba16cf86..292bc17cd 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -25,8 +25,12 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import java.util.EnumMap;
+import java.util.Map;
+
public class GenotypeLikelihoods {
public static final boolean CAP_PLS = false;
public static final int PL_CAP = 255;
@@ -94,6 +98,16 @@ public class GenotypeLikelihoods {
return likelihoodsAsString_PLs;
}
+ public EnumMap getAsMap(boolean normalizeFromLog10){
+ //Make sure that the log10likelihoods are set
+ double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
+ EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class);
+ likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
+ likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
+ likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
+ return likelihoodsMap;
+ }
+
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
From 1c61a573296f6df9a3b2af20255c0deff1ece565 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 13:06:44 +0200
Subject: [PATCH 03/44] Original rewrite of PhaseByTransmission: - Adapted to
get the trio information from the SampleDB (i.e. from Pedigree file (ped)) =>
Multiple trios can be passed as argument - Mendelian violations and trio
phasing possibilities are pre-calculated and stored in Maps. => Runtime is
~3x faster - Genotype combinations possible only given two MVs are now given
a squared MV prior (e.g. 0/0+0/0=>1/1 is given 10^-16 prior if the MV prior
is 10^-8) - Corrected bug: In case the best genotype combination is
Het/Het/Het, the genotypes are now set appropriately (before original
genotypes were left even if they weren't Het/Het/Het) - Basic reporting
added: -- mvf argument let the user specify a file to report remaining MVs --
When the walker ends, some basic stats about the genotype reconfiguration and
phasing are output
Known problems:
- GQ is not recalculated even if the genotype changes
Possible improvements:
- Phase partially typed trios
- Use standard Allele/Genotype Classes for the storage of the pre-calculated phase
---
.../walkers/phasing/PhaseByTransmission.java | 781 +++++++++++++-----
1 file changed, 569 insertions(+), 212 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 3eedc2a28..12b541526 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -7,18 +7,18 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
-import org.broadinstitute.sting.utils.text.XReadLines;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
-import java.io.File;
-import java.io.FileNotFoundException;
+import java.io.PrintStream;
import java.util.*;
/**
@@ -32,13 +32,13 @@ import java.util.*;
* the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to
* begin.
*/
-public class PhaseByTransmission extends RodWalker {
+public class PhaseByTransmission extends RodWalker, HashMap> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
- @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.")
- public ArrayList familySpecs = null;
+ @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.")
+ private PrintStream mvFile = null;
@Output
protected VCFWriter vcfWriter = null;
@@ -48,239 +48,523 @@ public class PhaseByTransmission extends RodWalker {
private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8;
- private class Trio {
- private String mother;
- private String father;
- private String child;
+ private ArrayList trios = new ArrayList();
- public Trio(String mother, String father, String child) {
- this.mother = mother;
- this.father = father;
- this.child = child;
- }
+ //Matrix of priors for all genotype combinations
+ private EnumMap>> mvCountMatrix;
- public Trio(String familySpec) {
- String[] pieces = familySpec.split("[\\+\\=]");
+ //Matrix of allele transmission
+ private EnumMap>> transmissionMatrix;
- this.mother = pieces[0];
- this.father = pieces[1];
- this.child = pieces[2];
- }
+ //Metrics counters hashkeys
+ private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
+ private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
+ private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
+ private final Byte NUM_HET = 3;
+ private final Byte NUM_HET_HET_HET = 4;
+ private final Byte NUM_VIOLATIONS = 5;
- public String getMother() { return mother; }
- public String getFather() { return father; }
- public String getChild() { return child; }
+ private enum AlleleType {
+ NO_CALL,
+ REF,
+ VAR,
+ UNPHASED_REF,
+ UNPHASED_VAR
}
- private ArrayList trios = new ArrayList();
+ //Stores a trio-genotype
+ private class TrioPhase {
- public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) {
- if (familySpecs != null) {
- // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
- // spec list set, and treat the entries as if they had been specified on the command line.
- ArrayList specs = new ArrayList();
- for (String familySpec : familySpecs) {
- File specFile = new File(familySpec);
+ private ArrayList trioAlleles = new ArrayList(6);
- try {
- XReadLines reader = new XReadLines(specFile);
+ private ArrayList getAlleles(Genotype.Type genotype){
+ ArrayList alleles = new ArrayList(2);
+ if(genotype == Genotype.Type.HOM_REF){
+ alleles.add(AlleleType.REF);
+ alleles.add(AlleleType.REF);
+ }
+ else if(genotype == Genotype.Type.HET){
+ alleles.add(AlleleType.REF);
+ alleles.add(AlleleType.VAR);
+ }
+ else if(genotype == Genotype.Type.HOM_VAR){
+ alleles.add(AlleleType.VAR);
+ alleles.add(AlleleType.VAR);
+ }
+ else{
+ alleles.add(AlleleType.NO_CALL);
+ alleles.add(AlleleType.NO_CALL);
+ }
+ return alleles;
+ }
- List lines = reader.readLines();
- for (String line : lines) {
- specs.add(new Trio(line));
- }
- } catch (FileNotFoundException e) {
- specs.add(new Trio(familySpec)); // not a file, so must be a family spec
- }
- }
+ private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){
+ if(genotype == Genotype.Type.HET){
+ ArrayList phasedAlleles = new ArrayList(2);
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ return phasedAlleles;
+ }
+ else
+ return getAlleles(genotype);
+ }
- return specs;
+ private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){
+ ArrayList phasedAlleles = new ArrayList(4);
+ //Special case for Het/Het as it is ambiguous
+ if(parent == Genotype.Type.HET && child == Genotype.Type.HET){
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
}
- return new ArrayList();
+ ArrayList parentAlleles = getAlleles(parent);
+ ArrayList childAlleles = getAlleles(child);
+
+ int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
+ if(childTransmittedAlleleIndex > -1){
+ phasedAlleles.add(parentAlleles.get(0));
+ phasedAlleles.add(parentAlleles.get(1));
+ phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ phasedAlleles.add(childAlleles.get(0));
+ }
+ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
+ phasedAlleles.add(parentAlleles.get(1));
+ phasedAlleles.add(parentAlleles.get(0));
+ phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ phasedAlleles.add(childAlleles.get(0));
+ }
+ else{
+ parentAlleles.addAll(childAlleles);
+ for(AlleleType allele : parentAlleles){
+ if(allele == AlleleType.REF){
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ }
+ else if(allele == AlleleType.VAR){
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ }
+ else{
+ phasedAlleles.add(AlleleType.NO_CALL);
+ }
+ }
+ }
+
+ return phasedAlleles;
+ }
+
+ private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+ ArrayList phasedAlleles = new ArrayList(6);
+
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
+
+ //Build all possible child genotypes for the given parent's genotypes
+ for (AlleleType momAllele : motherAlleles) {
+ for (AlleleType fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ possiblePhasedChildAlleles.add(momAllele);
+ possiblePhasedChildAlleles.add(fatherAllele);
+ possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
+ }
+ }
+
+ for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1));
+ if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
+ //Add mother's alleles
+ phasedAlleles.add(phasedChildGenotype.get(0));
+ if(motherAlleles.get(0) != phasedAlleles.get(0))
+ phasedAlleles.add(motherAlleles.get(0));
+ else
+ phasedAlleles.add(motherAlleles.get(1));
+
+ //Add father's alleles
+ phasedAlleles.add(phasedChildGenotype.get(1));
+ if(fatherAlleles.get(0) != phasedAlleles.get(2))
+ phasedAlleles.add(fatherAlleles.get(0));
+ else
+ phasedAlleles.add(fatherAlleles.get(1));
+
+ //Add child's alleles
+ phasedAlleles.addAll(phasedChildGenotype);
+ return phasedAlleles;
+ }
+ }
+
+ //If this is reached then no phasing could be found
+ motherAlleles.addAll(fatherAlleles);
+ motherAlleles.addAll(childAlleles);
+ for(AlleleType allele : motherAlleles){
+ if(allele == AlleleType.REF){
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ }
+ else if(allele == AlleleType.VAR){
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ }
+ else{
+ phasedAlleles.add(AlleleType.NO_CALL);
+ }
+ }
+ return phasedAlleles;
+ }
+
+
+ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ //Take care of cases where one or more family members are no call
+ if(child == Genotype.Type.NO_CALL){
+ trioAlleles.addAll(phaseSingleIndividualAlleles(mother));
+ trioAlleles.addAll(phaseSingleIndividualAlleles(father));
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.add(AlleleType.NO_CALL);
+ }
+ else if(mother == Genotype.Type.NO_CALL){
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.add(AlleleType.NO_CALL);
+ if(father == Genotype.Type.NO_CALL){
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.addAll(phaseSingleIndividualAlleles(child));
+ }
+ else
+ trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child));
+ }
+ else if(father == Genotype.Type.NO_CALL){
+ trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child));
+ trioAlleles.add(2, AlleleType.NO_CALL);
+ trioAlleles.add(3, AlleleType.NO_CALL);
+ }
+ //Special case for Het/Het/Het as it is ambiguous
+ else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
+ trioAlleles.add(AlleleType.UNPHASED_REF);
+ trioAlleles.add(AlleleType.UNPHASED_VAR);
+ trioAlleles.add(AlleleType.UNPHASED_REF);
+ trioAlleles.add(AlleleType.UNPHASED_VAR);
+ trioAlleles.add(AlleleType.UNPHASED_REF);
+ trioAlleles.add(AlleleType.UNPHASED_VAR);
+ }
+ //All family members have genotypes and at least one of them is not Het
+ else{
+ trioAlleles = phaseFamilyAlleles(mother, father, child);
+ }
+ }
+
+ public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6)));
+ return phasedGenotypes;
+ }
+
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){
+
+ //Add the transmission probability
+ Map genotypeAttributes = new HashMap();
+ genotypeAttributes.putAll(genotype.getAttributes());
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
+
+ boolean isPhased = true;
+
+ List alleles = new ArrayList(2);
+
+ //If unphased, return original genotype
+ for(AlleleType allele : phasedAlleles){
+ if(allele == AlleleType.NO_CALL){
+ genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
+ return genotype;
+ }
+ //Otherwise add the appropriate allele
+ else if(allele == AlleleType.UNPHASED_REF){
+ isPhased = false;
+ alleles.add(refAllele);
+ }
+ else if(allele == AlleleType.UNPHASED_VAR){
+ isPhased = false;
+ alleles.add(altAllele);
+ }
+ else if(allele == AlleleType.REF){
+ alleles.add(refAllele);
+ }
+ else if(allele == AlleleType.VAR){
+ alleles.add(altAllele);
+ }
+ }
+ //TODO: Recalculate GQ field for the new genotype
+ //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment
+ //double[] likelihoods = genotype.getLikelihoods().getAsVector();
+
+ //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]);
+ genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
+ return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased);
+ }
+
+
}
/**
* Parse the familial relationship specification, and initialize VCF writer
*/
public void initialize() {
- trios = getFamilySpecsFromCommandLineInput(familySpecs);
-
ArrayList rodNames = new ArrayList();
rodNames.add(variantCollection.variants.getName());
-
Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+ //Get the trios from the families passed as ped
+ setTrios();
+ if(trios.size()<1)
+ throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted.");
+
+
Set headerLines = new HashSet();
headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit()));
- headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct"));
+ headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct"));
headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
+
+ buildMatrices();
+
+ if(mvFile != null)
+ mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL");
+
}
- private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) {
- double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector());
- double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector());
- double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector());
+ /**
+ * Select Trios only
+ */
+ private void setTrios(){
- int momIndex = mom.getType().ordinal() - 1;
- int dadIndex = dad.getType().ordinal() - 1;
- int childIndex = child.getType().ordinal() - 1;
-
- return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex];
- }
-
- private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) {
- List homRefAlleles = new ArrayList();
- homRefAlleles.add(refAllele);
- homRefAlleles.add(refAllele);
- Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- List hetAlleles = new ArrayList();
- hetAlleles.add(refAllele);
- hetAlleles.add(altAllele);
- Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- List homVarAlleles = new ArrayList();
- homVarAlleles.add(altAllele);
- homVarAlleles.add(altAllele);
- Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- ArrayList genotypes = new ArrayList();
- genotypes.add(homRef);
- genotypes.add(het);
- genotypes.add(homVar);
-
- return genotypes;
- }
-
- private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) {
- List alleles = g.getAlleles();
- int matchingAlleles = 0;
-
- for (Allele a : alleles) {
- if (!alleleToMatch.equals(a)) {
- matchingAlleles++;
+ Map> families = this.getSampleDB().getFamilies();
+ Set family;
+ ArrayList parents;
+ for(String familyID : families.keySet()){
+ family = families.get(familyID);
+ if(family.size()!=3){
+ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size()));
}
- }
-
- return matchingAlleles;
- }
-
- private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) {
- int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0;
- int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0;
-
- int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0;
- int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0;
-
- int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child);
- int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child);
-
- return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles);
- }
-
- private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) {
- Set possiblePhasedChildGenotypes = new HashSet();
-
- for (Allele momAllele : mom.getAlleles()) {
- for (Allele dadAllele : dad.getAlleles()) {
- ArrayList possiblePhasedChildAlleles = new ArrayList();
- possiblePhasedChildAlleles.add(momAllele);
- possiblePhasedChildAlleles.add(dadAllele);
-
- Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true);
-
- possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype);
- }
- }
-
- ArrayList finalGenotypes = new ArrayList();
-
- for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) {
- if (child.sameGenotype(phasedChildGenotype, true)) {
- Allele momTransmittedAllele = phasedChildGenotype.getAllele(0);
- Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1);
-
- ArrayList phasedMomAlleles = new ArrayList();
- phasedMomAlleles.add(momTransmittedAllele);
- phasedMomAlleles.add(momUntransmittedAllele);
-
- Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true);
-
- Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1);
- Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1);
-
- ArrayList phasedDadAlleles = new ArrayList();
- phasedDadAlleles.add(dadTransmittedAllele);
- phasedDadAlleles.add(dadUntransmittedAllele);
-
- Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true);
-
- finalGenotypes.add(phasedMomGenotype);
- finalGenotypes.add(phasedDadGenotype);
- finalGenotypes.add(phasedChildGenotype);
-
- return finalGenotypes;
- }
- }
-
- finalGenotypes.add(mom);
- finalGenotypes.add(dad);
- finalGenotypes.add(child);
-
- return finalGenotypes;
- }
-
- private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) {
- ArrayList finalGenotypes = new ArrayList();
- finalGenotypes.add(mother);
- finalGenotypes.add(father);
- finalGenotypes.add(child);
-
- if (mother.isCalled() && father.isCalled() && child.isCalled()) {
- ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother);
- ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father);
- ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child);
-
- double bestConfigurationLikelihood = 0.0;
- double bestPrior = 0.0;
- Genotype bestMotherGenotype = mother;
- Genotype bestFatherGenotype = father;
- Genotype bestChildGenotype = child;
-
- double norm = 0.0;
-
- for (Genotype motherGenotype : possibleMotherGenotypes) {
- for (Genotype fatherGenotype : possibleFatherGenotypes) {
- for (Genotype childGenotype : possibleChildGenotypes) {
- double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
- double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype);
- norm += prior*configurationLikelihood;
-
- if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) {
- bestConfigurationLikelihood = configurationLikelihood;
- bestPrior = prior;
- bestMotherGenotype = motherGenotype;
- bestFatherGenotype = fatherGenotype;
- bestChildGenotype = childGenotype;
- }
- }
+ else{
+ for(Sample familyMember : family){
+ parents = familyMember.getParents();
+ if(parents.size()>0){
+ if(family.containsAll(parents))
+ this.trios.add(familyMember);
+ else
+ logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID));
+ break;
+ }
}
}
- if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) {
- Map attributes = new HashMap();
- attributes.putAll(bestChildGenotype.getAttributes());
- attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm);
- bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes);
+ }
- finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype);
+
+
+ }
+
+ private void buildMatrices(){
+ mvCountMatrix = new EnumMap>>(Genotype.Type.class);
+ transmissionMatrix = new EnumMap>>(Genotype.Type.class);
+ for(Genotype.Type mother : Genotype.Type.values()){
+ mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ for(Genotype.Type father : Genotype.Type.values()){
+ mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ for(Genotype.Type child : Genotype.Type.values()){
+ mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
+ transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
+ }
+ }
+ }
+ }
+
+ private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ //Child is no call => No MV
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
+ return 0;
+ //Add parents with genotypes for the evaluation
+ ArrayList parents = new ArrayList();
+ if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
+ parents.add(mother);
+ if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
+ parents.add(father);
+
+ //Both parents no calls => No MV
+ if (parents.isEmpty())
+ return 0;
+
+ //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
+ int parentsNumRefAlleles = 0;
+ int parentsNumAltAlleles = 0;
+
+ for(Genotype.Type parent : parents){
+ if(parent == Genotype.Type.HOM_REF){
+ parentsNumRefAlleles++;
+ }
+ else if(parent == Genotype.Type.HET){
+ parentsNumRefAlleles++;
+ parentsNumAltAlleles++;
+ }
+ else if(parent == Genotype.Type.HOM_VAR){
+ parentsNumAltAlleles++;
}
}
- return finalGenotypes;
+ //Case Child is HomRef
+ if(child == Genotype.Type.HOM_REF){
+ if(parentsNumRefAlleles == parents.size())
+ return 0;
+ else return (parents.size()-parentsNumRefAlleles);
+ }
+
+ //Case child is HomVar
+ if(child == Genotype.Type.HOM_VAR){
+ if(parentsNumAltAlleles == parents.size())
+ return 0;
+ else return parents.size()-parentsNumAltAlleles;
+ }
+
+ //Case child is Het
+ if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
+ return 0;
+
+ //MV
+ return 1;
+ }
+
+ private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
+
+ //Child is no call => No MV
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
+ return nonMVPrior;
+ //Add parents with genotypes for the evaluation
+ ArrayList parents = new ArrayList();
+ if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
+ parents.add(mother);
+ if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
+ parents.add(father);
+
+ //Both parents no calls => No MV
+ if (parents.isEmpty())
+ return nonMVPrior;
+
+ //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
+ int parentsNumRefAlleles = 0;
+ int parentsNumAltAlleles = 0;
+
+ for(Genotype.Type parent : parents){
+ if(parent == Genotype.Type.HOM_REF){
+ parentsNumRefAlleles++;
+ }
+ else if(parent == Genotype.Type.HET){
+ parentsNumRefAlleles++;
+ parentsNumAltAlleles++;
+ }
+ else if(parent == Genotype.Type.HOM_VAR){
+ parentsNumAltAlleles++;
+ }
+ }
+
+ //Case Child is HomRef
+ if(child == Genotype.Type.HOM_REF){
+ if(parentsNumRefAlleles == parents.size())
+ return nonMVPrior;
+ else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles);
+ }
+
+ //Case child is HomVar
+ if(child == Genotype.Type.HOM_VAR){
+ if(parentsNumAltAlleles == parents.size())
+ return nonMVPrior;
+ else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles);
+ }
+
+ //Case child is Het
+ if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
+ return nonMVPrior;
+
+ //MV
+ return MENDELIAN_VIOLATION_PRIOR;
+ }
+
+ private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
+ int count = 0;
+ if(motherOriginal!=motherNew)
+ count++;
+ if(fatherOriginal!=fatherNew)
+ count++;
+ if(childOriginal!=childNew)
+ count++;
+ return count;
+ }
+
+
+ private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
+
+ //For now, only consider trios with complete information
+ //TODO: Phasing of trios with missing information
+ if(!(mother.isCalled() && father.isCalled() && child.isCalled())) {
+ finalGenotypes.add(mother);
+ finalGenotypes.add(father);
+ finalGenotypes.add(child);
+ return false;
+ }
+
+ //Get the PL
+ Map motherLikelihoods = mother.getLikelihoods().getAsMap(true);
+ Map fatherLikelihoods = father.getLikelihoods().getAsMap(true);
+ Map childLikelihoods = child.getLikelihoods().getAsMap(true);
+
+ //Prior vars
+ double bestConfigurationLikelihood = 0.0;
+ double norm = 0.0;
+ boolean isMV = false;
+ int bestConfigurationGenotypeDiffs=4;
+ Genotype.Type bestMotherGenotype = mother.getType();
+ Genotype.Type bestFatherGenotype = father.getType();
+ Genotype.Type bestChildGenotype = child.getType();
+
+ //Get the most likely combination
+ int mvCount;
+ double configurationLikelihood;
+ int configurationGenotypeDiffs;
+ for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
+ for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
+ configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
+ //Keep this combination if
+ //It has a better likelihood
+ //Or it has the same likelihood but requires less changes from original genotypes
+ if ((configurationLikelihood > bestConfigurationLikelihood) ||
+ (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMotherGenotype = motherGenotype.getKey();
+ bestFatherGenotype = fatherGenotype.getKey();
+ bestChildGenotype = childGenotype.getKey();
+ isMV = mvCount>0;
+ bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ }
+ }
+ }
+ }
+
+ //Get the phased alleles for the genotype configuration
+ TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype);
+
+ //Return the phased genotypes
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes);
+ return isMV;
+
}
/**
@@ -292,18 +576,34 @@ public class PhaseByTransmission extends RodWalker {
* @return null
*/
@Override
- public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+
+ //Local cars to avoid lookups on increment
+ int numTrioGenotypesCalled = 0;
+ int numTrioGenotypesNoCall = 0;
+ int numTrioGenotypesPhased = 0;
+ int numHet = 0 ;
+ int numHetHetHet = 0;
+ int numMVs = 0;
+
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
Map genotypeMap = vc.getGenotypes();
- for (Trio trio : trios) {
- Genotype mother = vc.getGenotype(trio.getMother());
- Genotype father = vc.getGenotype(trio.getFather());
- Genotype child = vc.getGenotype(trio.getChild());
+ boolean isMV;
- ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child);
+ for (Sample sample : trios) {
+ Genotype mother = vc.getGenotype(sample.getMaternalID());
+ Genotype father = vc.getGenotype(sample.getPaternalID());
+ Genotype child = vc.getGenotype(sample.getID());
+
+ //Skip trios where any of the genotype is missing in the variant context
+ if(mother == null || father == null | child == null)
+ continue;
+
+ ArrayList trioGenotypes = new ArrayList(3);
+ isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
Genotype phasedMother = trioGenotypes.get(0);
Genotype phasedFather = trioGenotypes.get(1);
@@ -312,14 +612,47 @@ public class PhaseByTransmission extends RodWalker {
genotypeMap.put(phasedMother.getSampleName(), phasedMother);
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
genotypeMap.put(phasedChild.getSampleName(), phasedChild);
+
+ //Increment metrics counters
+ if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){
+ numTrioGenotypesCalled++;
+ if(phasedMother.isPhased())
+ numTrioGenotypesPhased++;
+
+ if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){
+ numHet++;
+ if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){
+ numHetHetHet++;
+ }else if(!phasedMother.isPhased()){
+ int x =9;
+ }
+ }
+
+ if(isMV){
+ numMVs++;
+ if(mvFile != null)
+ mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
+ }
+ }else{
+ numTrioGenotypesNoCall++;
+ }
+
}
+
VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap);
vcfWriter.add(newvc);
}
- return null;
+ HashMap metricsCounters = new HashMap(5);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased);
+ metricsCounters.put(NUM_HET,numHet);
+ metricsCounters.put(NUM_HET_HET_HET,numHetHetHet);
+ metricsCounters.put(NUM_VIOLATIONS,numMVs);
+ return metricsCounters;
}
/**
@@ -328,8 +661,15 @@ public class PhaseByTransmission extends RodWalker {
* @return Initial value of reduce.
*/
@Override
- public Integer reduceInit() {
- return null;
+ public HashMap reduceInit() {
+ HashMap metricsCounters = new HashMap(5);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_HET,0);
+ metricsCounters.put(NUM_HET_HET_HET,0);
+ metricsCounters.put(NUM_VIOLATIONS,0);
+ return metricsCounters;
}
/**
@@ -340,7 +680,24 @@ public class PhaseByTransmission extends RodWalker {
* @return accumulator with result of the map taken into account.
*/
@Override
- public Integer reduce(Integer value, Integer sum) {
- return null;
+ public HashMap reduce(HashMap value, HashMap sum) {
+ sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED));
+ sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL));
+ sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED));
+ sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET));
+ sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET));
+ sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS));
+ return sum;
+ }
+
+
+ @Override
+ public void onTraversalDone(HashMap result) {
+ logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
+ logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
+ logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
+ logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET));
+ logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET));
+ logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS));
}
}
From edea90786a4555ff665b07a3dc2f97ce7cbae24c Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 17:04:19 +0200
Subject: [PATCH 04/44] Genotype quality is now recalculated for each of the
phased Genotypes. Small problem is that we unnecessarily loose a little
precision on the genotypes that do not change after assignment.
---
.../walkers/phasing/PhaseByTransmission.java | 9 ++----
.../variantcontext/GenotypeLikelihoods.java | 29 +++++++++++++++++++
2 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 12b541526..52629277f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -266,6 +266,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
+ genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
boolean isPhased = true;
@@ -274,7 +275,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
//If unphased, return original genotype
for(AlleleType allele : phasedAlleles){
if(allele == AlleleType.NO_CALL){
- genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
return genotype;
}
//Otherwise add the appropriate allele
@@ -293,13 +293,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
alleles.add(altAllele);
}
}
- //TODO: Recalculate GQ field for the new genotype
- //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment
- //double[] likelihoods = genotype.getLikelihoods().getAsVector();
- //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]);
- genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
- return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased);
+ return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased);
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index 292bc17cd..9ed9b41cc 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -25,8 +25,10 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
+import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.jgrapht.util.MathUtil;
import java.util.EnumMap;
import java.util.Map;
@@ -98,6 +100,7 @@ public class GenotypeLikelihoods {
return likelihoodsAsString_PLs;
}
+ //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
public EnumMap getAsMap(boolean normalizeFromLog10){
//Make sure that the log10likelihoods are set
double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
@@ -108,6 +111,32 @@ public class GenotypeLikelihoods {
return likelihoodsMap;
}
+ //Return the log10 Genotype Quality (GQ) for the given genotype
+ public double getLog10GQ(Genotype.Type genotype){
+
+ double qual = Double.NEGATIVE_INFINITY;
+ EnumMap likelihoods = getAsMap(false);
+ for(Map.Entry likelihood : likelihoods.entrySet()){
+ if(likelihood.getKey() == genotype)
+ continue;
+ if(likelihood.getValue() > qual)
+ qual = likelihood.getValue();
+
+ }
+
+ qual = likelihoods.get(genotype) - qual;
+
+ if (qual < 0) {
+ // QUAL can be negative if the chosen genotype is not the most likely one individually.
+ // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ }
+
+ return qual;
+ }
+
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
From 01b16abc8dcc233e29258eaa4e1d1c0415180e91 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 10:24:41 +0200
Subject: [PATCH 05/44] Genotype quality calculation modified to handle all
genotypes the same way. This is inconsistent with GQ output by the UG but is
correct even for cases of poor quality genotypes.
---
.../variantcontext/GenotypeLikelihoods.java | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index 9ed9b41cc..b83ffa2fd 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -111,8 +111,8 @@ public class GenotypeLikelihoods {
return likelihoodsMap;
}
- //Return the log10 Genotype Quality (GQ) for the given genotype
- public double getLog10GQ(Genotype.Type genotype){
+ //Return the neg log10 Genotype Quality (GQ) for the given genotype
+ public double getNegLog10GQ(Genotype.Type genotype){
double qual = Double.NEGATIVE_INFINITY;
EnumMap likelihoods = getAsMap(false);
@@ -124,15 +124,9 @@ public class GenotypeLikelihoods {
}
- qual = likelihoods.get(genotype) - qual;
-
- if (qual < 0) {
- // QUAL can be negative if the chosen genotype is not the most likely one individually.
- // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
- double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
- double chosenGenotype = normalized[genotype.ordinal()-1];
- qual = -1.0 * Math.log10(1.0 - chosenGenotype);
- }
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
return qual;
}
From 7312e35c71167dee3510bf428204b3a4b75ab52f Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 10:25:53 +0200
Subject: [PATCH 06/44] Now makes use of standard Allele and Genotype classes.
This allowed quite some code cleaning.
---
.../walkers/phasing/PhaseByTransmission.java | 341 +++++++-----------
1 file changed, 135 insertions(+), 206 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 52629277f..62b7bfffa 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -56,7 +56,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
//Matrix of allele transmission
private EnumMap>> transmissionMatrix;
- //Metrics counters hashkeys
+ //Metrics counters hash keys
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
@@ -64,237 +64,223 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_HET_HET_HET = 4;
private final Byte NUM_VIOLATIONS = 5;
- private enum AlleleType {
- NO_CALL,
- REF,
- VAR,
- UNPHASED_REF,
- UNPHASED_VAR
+ private enum FamilyMember {
+ MOTHER,
+ FATHER,
+ CHILD
}
//Stores a trio-genotype
private class TrioPhase {
- private ArrayList trioAlleles = new ArrayList(6);
+ //Create 2 fake alleles
+ //The actual bases will never be used but the Genotypes created using the alleles will be.
+ private final Allele REF = Allele.create("A",true);
+ private final Allele VAR = Allele.create("A",false);
+ private final Allele NO_CALL = Allele.create(".",false);
+ private final String DUMMY_NAME = "DummySample";
- private ArrayList getAlleles(Genotype.Type genotype){
- ArrayList alleles = new ArrayList(2);
+ private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class);
+
+ private ArrayList getAlleles(Genotype.Type genotype){
+ ArrayList alleles = new ArrayList(2);
if(genotype == Genotype.Type.HOM_REF){
- alleles.add(AlleleType.REF);
- alleles.add(AlleleType.REF);
+ alleles.add(REF);
+ alleles.add(REF);
}
else if(genotype == Genotype.Type.HET){
- alleles.add(AlleleType.REF);
- alleles.add(AlleleType.VAR);
+ alleles.add(REF);
+ alleles.add(VAR);
}
else if(genotype == Genotype.Type.HOM_VAR){
- alleles.add(AlleleType.VAR);
- alleles.add(AlleleType.VAR);
+ alleles.add(VAR);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.NO_CALL){
+ alleles.add(NO_CALL);
+ alleles.add(NO_CALL);
}
else{
- alleles.add(AlleleType.NO_CALL);
- alleles.add(AlleleType.NO_CALL);
+ return null;
}
return alleles;
}
- private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){
- if(genotype == Genotype.Type.HET){
- ArrayList phasedAlleles = new ArrayList(2);
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- return phasedAlleles;
+ //Create a new Genotype based on information from a single individual
+ //Homozygous genotypes will be set as phased, heterozygous won't be
+ private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
+ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
}
else
- return getAlleles(genotype);
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){
- ArrayList phasedAlleles = new ArrayList(4);
+ private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
+
//Special case for Het/Het as it is ambiguous
- if(parent == Genotype.Type.HET && child == Genotype.Type.HET){
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- ArrayList parentAlleles = getAlleles(parent);
- ArrayList childAlleles = getAlleles(child);
+ ArrayList parentAlleles = getAlleles(parentGenotype);
+ ArrayList childAlleles = getAlleles(childGenotype);
+ ArrayList parentPhasedAlleles = new ArrayList(2);
+ ArrayList childPhasedAlleles = new ArrayList(2);
+ //If there is a possible phasing between the mother and child => phase
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
if(childTransmittedAlleleIndex > -1){
- phasedAlleles.add(parentAlleles.get(0));
- phasedAlleles.add(parentAlleles.get(1));
- phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- phasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
}
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
- phasedAlleles.add(parentAlleles.get(1));
- phasedAlleles.add(parentAlleles.get(0));
- phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- phasedAlleles.add(childAlleles.get(0));
+ parentPhasedAlleles.add(parentAlleles.get(1));
+ parentPhasedAlleles.add(parentAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
}
+ //This is a Mendelian Violation => Do not phase
else{
- parentAlleles.addAll(childAlleles);
- for(AlleleType allele : parentAlleles){
- if(allele == AlleleType.REF){
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- }
- else if(allele == AlleleType.VAR){
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- }
- else{
- phasedAlleles.add(AlleleType.NO_CALL);
- }
- }
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
-
- return phasedAlleles;
}
- private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- ArrayList phasedAlleles = new ArrayList(6);
+ private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- Set> possiblePhasedChildGenotypes = new HashSet>();
- ArrayList motherAlleles = getAlleles(mother);
- ArrayList fatherAlleles = getAlleles(father);
- ArrayList childAlleles = getAlleles(child);
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
//Build all possible child genotypes for the given parent's genotypes
- for (AlleleType momAllele : motherAlleles) {
- for (AlleleType fatherAllele : fatherAlleles) {
- ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ for (Allele momAllele : motherAlleles) {
+ for (Allele fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
possiblePhasedChildAlleles.add(momAllele);
possiblePhasedChildAlleles.add(fatherAllele);
possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
}
}
- for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) {
- int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0));
- int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1));
+ for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
+ //If a possible combination has been found, create the genotypes
if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
- //Add mother's alleles
- phasedAlleles.add(phasedChildGenotype.get(0));
- if(motherAlleles.get(0) != phasedAlleles.get(0))
- phasedAlleles.add(motherAlleles.get(0));
+ //Create mother's genotype
+ ArrayList motherPhasedAlleles = new ArrayList(2);
+ motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
+ if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
+ motherPhasedAlleles.add(motherAlleles.get(0));
else
- phasedAlleles.add(motherAlleles.get(1));
+ motherPhasedAlleles.add(motherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
- //Add father's alleles
- phasedAlleles.add(phasedChildGenotype.get(1));
- if(fatherAlleles.get(0) != phasedAlleles.get(2))
- phasedAlleles.add(fatherAlleles.get(0));
+ //Create father's genotype
+ ArrayList fatherPhasedAlleles = new ArrayList(2);
+ fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
+ if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
+ fatherPhasedAlleles.add(fatherAlleles.get(0));
else
- phasedAlleles.add(fatherAlleles.get(1));
+ fatherPhasedAlleles.add(fatherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
- //Add child's alleles
- phasedAlleles.addAll(phasedChildGenotype);
- return phasedAlleles;
+ //Create child's genotype
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Once a phased combination is found; exit
+ return;
}
}
//If this is reached then no phasing could be found
- motherAlleles.addAll(fatherAlleles);
- motherAlleles.addAll(childAlleles);
- for(AlleleType allele : motherAlleles){
- if(allele == AlleleType.REF){
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- }
- else if(allele == AlleleType.VAR){
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- }
- else{
- phasedAlleles.add(AlleleType.NO_CALL);
- }
- }
- return phasedAlleles;
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
//Take care of cases where one or more family members are no call
- if(child == Genotype.Type.NO_CALL){
- trioAlleles.addAll(phaseSingleIndividualAlleles(mother));
- trioAlleles.addAll(phaseSingleIndividualAlleles(father));
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.add(AlleleType.NO_CALL);
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
- else if(mother == Genotype.Type.NO_CALL){
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.add(AlleleType.NO_CALL);
- if(father == Genotype.Type.NO_CALL){
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.addAll(phaseSingleIndividualAlleles(child));
+ else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
else
- trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child));
+ phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER);
}
- else if(father == Genotype.Type.NO_CALL){
- trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child));
- trioAlleles.add(2, AlleleType.NO_CALL);
- trioAlleles.add(3, AlleleType.NO_CALL);
+ else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
}
//Special case for Het/Het/Het as it is ambiguous
else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
- trioAlleles.add(AlleleType.UNPHASED_REF);
- trioAlleles.add(AlleleType.UNPHASED_VAR);
- trioAlleles.add(AlleleType.UNPHASED_REF);
- trioAlleles.add(AlleleType.UNPHASED_VAR);
- trioAlleles.add(AlleleType.UNPHASED_REF);
- trioAlleles.add(AlleleType.UNPHASED_VAR);
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
//All family members have genotypes and at least one of them is not Het
else{
- trioAlleles = phaseFamilyAlleles(mother, father, child);
+ phaseFamilyAlleles(mother, father, child);
}
}
public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){
- phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
return phasedGenotypes;
}
- private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){
//Add the transmission probability
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
- genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
- boolean isPhased = true;
+ //Handle missing genotype
+ if(!phasedGenotype.isAvailable())
+ return new Genotype(genotype.getSampleName(), null);
+ //Handle NoCall genotype
+ else if(phasedGenotype.isNoCall())
+ return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles());
- List alleles = new ArrayList(2);
+ ArrayList phasedAlleles = new ArrayList(2);
+ for(Allele allele : phasedGenotype.getAlleles()){
+ if(allele.isReference())
+ phasedAlleles.add(refAllele);
+ else if(allele.isNonReference())
+ phasedAlleles.add(altAllele);
+ //At this point there should not be any other alleles left
+ else
+ throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
- //If unphased, return original genotype
- for(AlleleType allele : phasedAlleles){
- if(allele == AlleleType.NO_CALL){
- return genotype;
- }
- //Otherwise add the appropriate allele
- else if(allele == AlleleType.UNPHASED_REF){
- isPhased = false;
- alleles.add(refAllele);
- }
- else if(allele == AlleleType.UNPHASED_VAR){
- isPhased = false;
- alleles.add(altAllele);
- }
- else if(allele == AlleleType.REF){
- alleles.add(refAllele);
- }
- else if(allele == AlleleType.VAR){
- alleles.add(altAllele);
- }
- }
+ }
- return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased);
+ //Compute the new Log10Error if the genotype is different from the original genotype
+ double negLog10Error;
+ if(genotype.getType() == phasedGenotype.getType())
+ negLog10Error = genotype.getNegLog10PError();
+ else
+ negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
+
+ return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
}
@@ -432,63 +418,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
return 1;
}
- private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
-
- double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
-
- //Child is no call => No MV
- if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
- return nonMVPrior;
- //Add parents with genotypes for the evaluation
- ArrayList parents = new ArrayList();
- if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
- parents.add(mother);
- if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
- parents.add(father);
-
- //Both parents no calls => No MV
- if (parents.isEmpty())
- return nonMVPrior;
-
- //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
- int parentsNumRefAlleles = 0;
- int parentsNumAltAlleles = 0;
-
- for(Genotype.Type parent : parents){
- if(parent == Genotype.Type.HOM_REF){
- parentsNumRefAlleles++;
- }
- else if(parent == Genotype.Type.HET){
- parentsNumRefAlleles++;
- parentsNumAltAlleles++;
- }
- else if(parent == Genotype.Type.HOM_VAR){
- parentsNumAltAlleles++;
- }
- }
-
- //Case Child is HomRef
- if(child == Genotype.Type.HOM_REF){
- if(parentsNumRefAlleles == parents.size())
- return nonMVPrior;
- else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles);
- }
-
- //Case child is HomVar
- if(child == Genotype.Type.HOM_VAR){
- if(parentsNumAltAlleles == parents.size())
- return nonMVPrior;
- else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles);
- }
-
- //Case child is Het
- if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
- return nonMVPrior;
-
- //MV
- return MENDELIAN_VIOLATION_PRIOR;
- }
-
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
int count = 0;
if(motherOriginal!=motherNew)
@@ -534,7 +463,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
for(Map.Entry childGenotype : childLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
- configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
norm += configurationLikelihood;
configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
//Keep this combination if
From 38ebf3141a13ebafcc43b69c3364e0066ba17ea9 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 12:30:04 +0200
Subject: [PATCH 07/44] - Now supports parent/child pairs - Sites with missing
genotypes in pairs/trios are handled as follows: -- Missing child ->
Homozygous parents are phased, no transmission probability is emitted -- Two
individuals missing -> Phase if homozygous, no transmission probability is
emitted -- One parent missing -> Phased / transmission probability emitted -
Mutation prior set as argument
---
.../walkers/phasing/PhaseByTransmission.java | 134 ++++++++++--------
1 file changed, 77 insertions(+), 57 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 62b7bfffa..d128ff40e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -40,13 +40,16 @@ public class PhaseByTransmission extends RodWalker, HashMa
@Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.")
private PrintStream mvFile = null;
+ @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8")
+ private double deNovoPrior=1e-8;
+
@Output
protected VCFWriter vcfWriter = null;
private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP";
private final String SOURCE_NAME = "PhaseByTransmission";
- private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8;
+ public final double NO_TRANSMISSION_PROB = -1.0;
private ArrayList trios = new ArrayList();
@@ -240,26 +243,26 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
- public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){
+ public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
return phasedGenotypes;
}
- private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+
+ //Handle null, missing and unavailable genotypes
+ //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
+ //genotype so it is safe to return the original genotype in this case.
+ if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
+ return genotype;
//Add the transmission probability
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
- genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
-
- //Handle missing genotype
- if(!phasedGenotype.isAvailable())
- return new Genotype(genotype.getSampleName(), null);
- //Handle NoCall genotype
- else if(phasedGenotype.isNoCall())
- return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles());
+ if(transmissionProb>NO_TRANSMISSION_PROB)
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
ArrayList phasedAlleles = new ArrayList(2);
for(Allele allele : phasedGenotype.getAlleles()){
@@ -324,8 +327,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
ArrayList parents;
for(String familyID : families.keySet()){
family = families.get(familyID);
- if(family.size()!=3){
- logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size()));
+ if(family.size()<2 || family.size()>3){
+ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size()));
}
else{
for(Sample familyMember : family){
@@ -334,7 +337,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(family.containsAll(parents))
this.trios.add(familyMember);
else
- logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID));
+ logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
break;
}
}
@@ -429,64 +432,84 @@ public class PhaseByTransmission extends RodWalker, HashMa
return count;
}
+ private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
+ if(genotype == null || !genotype.isAvailable()){
+ EnumMap likelihoods = new EnumMap(Genotype.Type.class);
+ likelihoods.put(Genotype.Type.UNAVAILABLE,1.0);
+ return likelihoods;
+ }
+ else if(genotype.isNoCall()){
+ EnumMap likelihoods = new EnumMap(Genotype.Type.class);
+ likelihoods.put(Genotype.Type.NO_CALL,1.0);
+ return likelihoods;
+ }
+ return genotype.getLikelihoods().getAsMap(true);
+ }
+
+ private Genotype.Type getTypeSafeNull(Genotype genotype){
+ if(genotype == null)
+ return Genotype.Type.UNAVAILABLE;
+ return genotype.getType();
+ }
+
private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
- //For now, only consider trios with complete information
- //TODO: Phasing of trios with missing information
- if(!(mother.isCalled() && father.isCalled() && child.isCalled())) {
- finalGenotypes.add(mother);
- finalGenotypes.add(father);
- finalGenotypes.add(child);
- return false;
- }
-
//Get the PL
- Map motherLikelihoods = mother.getLikelihoods().getAsMap(true);
- Map fatherLikelihoods = father.getLikelihoods().getAsMap(true);
- Map childLikelihoods = child.getLikelihoods().getAsMap(true);
+ Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
//Prior vars
double bestConfigurationLikelihood = 0.0;
double norm = 0.0;
boolean isMV = false;
int bestConfigurationGenotypeDiffs=4;
- Genotype.Type bestMotherGenotype = mother.getType();
- Genotype.Type bestFatherGenotype = father.getType();
- Genotype.Type bestChildGenotype = child.getType();
+ Genotype.Type bestMotherGenotype = getTypeSafeNull(mother);
+ Genotype.Type bestFatherGenotype = getTypeSafeNull(father);
+ Genotype.Type bestChildGenotype = getTypeSafeNull(child);
//Get the most likely combination
- int mvCount;
- double configurationLikelihood;
- int configurationGenotypeDiffs;
- for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
- for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
- for(Map.Entry childGenotype : childLikelihoods.entrySet()){
- mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
- configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
- norm += configurationLikelihood;
- configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
- //Keep this combination if
- //It has a better likelihood
- //Or it has the same likelihood but requires less changes from original genotypes
- if ((configurationLikelihood > bestConfigurationLikelihood) ||
- (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
- bestConfigurationLikelihood = configurationLikelihood;
- bestMotherGenotype = motherGenotype.getKey();
- bestFatherGenotype = fatherGenotype.getKey();
- bestChildGenotype = childGenotype.getKey();
- isMV = mvCount>0;
- bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ //Only check for most likely combination if at least a parent and the child have genotypes
+ if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){
+ int mvCount;
+ double configurationLikelihood;
+ int configurationGenotypeDiffs;
+ for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
+ for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
+ configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
+ //Keep this combination if
+ //It has a better likelihood
+ //Or it has the same likelihood but requires less changes from original genotypes
+ if ((configurationLikelihood > bestConfigurationLikelihood) ||
+ (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMotherGenotype = motherGenotype.getKey();
+ bestFatherGenotype = fatherGenotype.getKey();
+ bestChildGenotype = childGenotype.getKey();
+ isMV = mvCount>0;
+ bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ }
}
}
}
+
+ //normalize the best configuration probability
+ bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
+ }
+ else{
+ bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
}
//Get the phased alleles for the genotype configuration
TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype);
//Return the phased genotypes
- phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes);
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
return isMV;
}
@@ -522,8 +545,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
Genotype father = vc.getGenotype(sample.getPaternalID());
Genotype child = vc.getGenotype(sample.getID());
- //Skip trios where any of the genotype is missing in the variant context
- if(mother == null || father == null | child == null)
+ //Keep only trios and parent/child pairs
+ if(mother == null && father == null || child == null)
continue;
ArrayList trioGenotypes = new ArrayList(3);
@@ -545,11 +568,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){
numHet++;
- if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){
+ if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet())
numHetHetHet++;
- }else if(!phasedMother.isPhased()){
- int x =9;
- }
}
if(isMV){
From 62477a08104c3a158ccbb547ab5f347625676809 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 13:45:21 +0200
Subject: [PATCH 08/44] Added documentation and comments
---
.../walkers/phasing/PhaseByTransmission.java | 127 ++++++++++++++----
1 file changed, 103 insertions(+), 24 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index d128ff40e..c4c6b69b7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -22,15 +22,57 @@ import java.io.PrintStream;
import java.util.*;
/**
- * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio,
- * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype
- * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents
- * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the
- * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to
- * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally
- * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or
- * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to
- * begin.
+ * Computes the most likely genotype combination and phases trios and parent/child pairs
+ *
+ *
+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases
+ * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability.
+ * Ambiguous sites are:
+ *
+ * - Sites where all individuals are heterozygous
+ * - Sites where there is a Mendelian violation
+ *
+ * Missing genotypes are handled as follows:
+ *
+ * - In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
+ * - In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
+ * - In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
+ * - In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
+ *
+ *
+ * Input
+ *
+ *
+ * - A VCF variant set containing trio(s) and/or parent/child pair(s).
+ * - A PED pedigree file containing the description of the individuals relationships.
+ *
+ *
+ *
+ * Options
+ *
+ *
+ * - MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype
+ * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability,
+ * and each individual genotype, depth, allelic depth and likelihoods.
+ * - DeNovoPrior: Mutation prio; default is 1e-8
+ *
+ *
+ *
+ * Output
+ *
+ * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous..
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PhaseByTransmission \
+ * -V input.vcf \
+ * -ped input.ped \
+ * -o output.vcf
+ *
+ *
*/
public class PhaseByTransmission extends RodWalker, HashMap> {
@@ -73,7 +115,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
CHILD
}
- //Stores a trio-genotype
+ //Stores a conceptual trio or parent/child pair genotype combination along with its phasing.
+ //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method.
private class TrioPhase {
//Create 2 fake alleles
@@ -119,7 +162,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
+ //Find the phase for a parent/child pair
+ private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
//Special case for Het/Het as it is ambiguous
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
@@ -155,6 +199,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
+ //Phases a family by transmission
private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
Set> possiblePhasedChildGenotypes = new HashSet>();
@@ -209,7 +254,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
-
+ /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
+ If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
+ or single individual.
+ */
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
//Take care of cases where one or more family members are no call
@@ -225,10 +273,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
else
- phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER);
+ phasePairAlleles(father, child, FamilyMember.FATHER);
}
else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
- phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER);
+ phasePairAlleles(mother, child, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
}
//Special case for Het/Het/Het as it is ambiguous
@@ -243,11 +291,20 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
- public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
+ /**
+ * Applies the trio genotype combination to the given trio.
+ * @param ref: Reference allele
+ * @param alt: Alternate allele
+ * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
+ * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
+ * @param childGenotype: Genotype of the child to phase using this trio genotype combination
+ * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
+ * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
+ */
+ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
- return phasedGenotypes;
}
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
@@ -290,7 +347,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Parse the familial relationship specification, and initialize VCF writer
+ * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer
*/
public void initialize() {
ArrayList rodNames = new ArrayList();
@@ -306,7 +363,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
Set headerLines = new HashSet();
headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit()));
- headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct"));
+ headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct"));
headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
@@ -318,7 +375,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Select Trios only
+ * Select trios and parent/child pairs only
*/
private void setTrios(){
@@ -349,6 +406,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+ //Create the transmission matrices
private void buildMatrices(){
mvCountMatrix = new EnumMap>>(Genotype.Type.class);
transmissionMatrix = new EnumMap>>(Genotype.Type.class);
@@ -366,6 +424,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
+ //Returns the number of Mendelian Violations for a given genotype combination.
+ //If one of the parents genotype is missing, it will consider it as a parent/child pair
+ //If the child genotype or both parents genotypes are missing, 0 is returned.
private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
//Child is no call => No MV
@@ -421,6 +482,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
return 1;
}
+ //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations.
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
int count = 0;
if(motherOriginal!=motherNew)
@@ -432,6 +494,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
return count;
}
+ //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single
+ //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0
private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
if(genotype == null || !genotype.isAvailable()){
EnumMap likelihoods = new EnumMap(Genotype.Type.class);
@@ -446,6 +510,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
return genotype.getLikelihoods().getAsMap(true);
}
+ //Returns the Genotype.Type; returns UNVAILABLE if given null
private Genotype.Type getTypeSafeNull(Genotype genotype){
if(genotype == null)
return Genotype.Type.UNAVAILABLE;
@@ -453,6 +518,16 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+ /**
+ * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair.
+ * @param ref: Reference allele
+ * @param alt: Alternative allele
+ * @param mother: Mother's genotype
+ * @param father: Father's genotype
+ * @param child: Child's genotype
+ * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child
+ * @return
+ */
private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
//Get the PL
@@ -600,9 +675,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Provide an initial value for reduce computations.
+ * Initializes the reporting counters.
*
- * @return Initial value of reduce.
+ * @return All counters initialized to 0
*/
@Override
public HashMap reduceInit() {
@@ -617,10 +692,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Reduces a single map with the accumulator provided as the ReduceType.
+ * Adds the value of the site phased to the reporting counters.
*
- * @param value result of the map.
- * @param sum accumulator for the reduce.
+ * @param value Site values
+ * @param sum accumulator for the reporting counters
* @return accumulator with result of the map taken into account.
*/
@Override
@@ -635,6 +710,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+ /**
+ * Reports statistics on the phasing by transmission process.
+ * @param result Accumulator with all counters.
+ */
@Override
public void onTraversalDone(HashMap result) {
logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
From 62cff266d4e20aa2a48f88c47f282d9a6bc25287 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 26 Oct 2011 14:40:04 +0200
Subject: [PATCH 09/44] GQ calculation corrected for most likely genotype
---
.../utils/variantcontext/GenotypeLikelihoods.java | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index b83ffa2fd..47c93bb1b 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -124,10 +124,15 @@ public class GenotypeLikelihoods {
}
- double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
- double chosenGenotype = normalized[genotype.ordinal()-1];
- qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best)
+ qual = likelihoods.get(genotype) - qual;
+ //Quality of other genotypes 1-P(G)
+ if (qual < 0) {
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ }
return qual;
}
From 81b163ff4d6a8203cb4cb155a92a751ee1b2349d Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 26 Oct 2011 14:49:12 +0200
Subject: [PATCH 10/44] Indentation
---
.../walkers/phasing/PhaseByTransmission.java | 370 +++++++++---------
1 file changed, 185 insertions(+), 185 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index c4c6b69b7..956cf7c89 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -129,220 +129,220 @@ public class PhaseByTransmission extends RodWalker, HashMa
private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class);
private ArrayList getAlleles(Genotype.Type genotype){
- ArrayList alleles = new ArrayList(2);
- if(genotype == Genotype.Type.HOM_REF){
- alleles.add(REF);
- alleles.add(REF);
- }
- else if(genotype == Genotype.Type.HET){
- alleles.add(REF);
- alleles.add(VAR);
- }
- else if(genotype == Genotype.Type.HOM_VAR){
- alleles.add(VAR);
- alleles.add(VAR);
- }
- else if(genotype == Genotype.Type.NO_CALL){
- alleles.add(NO_CALL);
- alleles.add(NO_CALL);
- }
- else{
- return null;
- }
- return alleles;
- }
-
- //Create a new Genotype based on information from a single individual
- //Homozygous genotypes will be set as phased, heterozygous won't be
- private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
- if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
- trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- }
- else
- trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- }
-
- //Find the phase for a parent/child pair
- private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
-
- //Special case for Het/Het as it is ambiguous
- if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ ArrayList alleles = new ArrayList(2);
+ if(genotype == Genotype.Type.HOM_REF){
+ alleles.add(REF);
+ alleles.add(REF);
+ }
+ else if(genotype == Genotype.Type.HET){
+ alleles.add(REF);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.HOM_VAR){
+ alleles.add(VAR);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.NO_CALL){
+ alleles.add(NO_CALL);
+ alleles.add(NO_CALL);
+ }
+ else{
+ return null;
+ }
+ return alleles;
}
- ArrayList parentAlleles = getAlleles(parentGenotype);
- ArrayList childAlleles = getAlleles(childGenotype);
- ArrayList parentPhasedAlleles = new ArrayList(2);
- ArrayList childPhasedAlleles = new ArrayList(2);
-
- //If there is a possible phasing between the mother and child => phase
- int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
- if(childTransmittedAlleleIndex > -1){
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- childPhasedAlleles.add(childAlleles.get(0));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ //Create a new Genotype based on information from a single individual
+ //Homozygous genotypes will be set as phased, heterozygous won't be
+ private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
+ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ else
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
- parentPhasedAlleles.add(parentAlleles.get(1));
- parentPhasedAlleles.add(parentAlleles.get(0));
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- childPhasedAlleles.add(childAlleles.get(0));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- }
- //This is a Mendelian Violation => Do not phase
- else{
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- }
- }
- //Phases a family by transmission
- private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+ //Find the phase for a parent/child pair
+ private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
- Set> possiblePhasedChildGenotypes = new HashSet>();
- ArrayList motherAlleles = getAlleles(mother);
- ArrayList fatherAlleles = getAlleles(father);
- ArrayList childAlleles = getAlleles(child);
+ //Special case for Het/Het as it is ambiguous
+ if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ }
- //Build all possible child genotypes for the given parent's genotypes
- for (Allele momAllele : motherAlleles) {
- for (Allele fatherAllele : fatherAlleles) {
- ArrayList possiblePhasedChildAlleles = new ArrayList(2);
- possiblePhasedChildAlleles.add(momAllele);
- possiblePhasedChildAlleles.add(fatherAllele);
- possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
+ ArrayList parentAlleles = getAlleles(parentGenotype);
+ ArrayList childAlleles = getAlleles(childGenotype);
+ ArrayList parentPhasedAlleles = new ArrayList(2);
+ ArrayList childPhasedAlleles = new ArrayList(2);
+
+ //If there is a possible phasing between the mother and child => phase
+ int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
+ if(childTransmittedAlleleIndex > -1){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
+ parentPhasedAlleles.add(parentAlleles.get(1));
+ parentPhasedAlleles.add(parentAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ //This is a Mendelian Violation => Do not phase
+ else{
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
}
- for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
- int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
- int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
- //If a possible combination has been found, create the genotypes
- if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
- //Create mother's genotype
- ArrayList motherPhasedAlleles = new ArrayList(2);
- motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
- if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
- motherPhasedAlleles.add(motherAlleles.get(0));
- else
- motherPhasedAlleles.add(motherAlleles.get(1));
- trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+ //Phases a family by transmission
+ private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- //Create father's genotype
- ArrayList fatherPhasedAlleles = new ArrayList(2);
- fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
- if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
- fatherPhasedAlleles.add(fatherAlleles.get(0));
- else
- fatherPhasedAlleles.add(fatherAlleles.get(1));
- trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
- //Create child's genotype
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
-
- //Once a phased combination is found; exit
- return;
+ //Build all possible child genotypes for the given parent's genotypes
+ for (Allele momAllele : motherAlleles) {
+ for (Allele fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ possiblePhasedChildAlleles.add(momAllele);
+ possiblePhasedChildAlleles.add(fatherAllele);
+ possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
+ }
}
+
+ for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
+ //If a possible combination has been found, create the genotypes
+ if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
+ //Create mother's genotype
+ ArrayList motherPhasedAlleles = new ArrayList(2);
+ motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
+ if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
+ motherPhasedAlleles.add(motherAlleles.get(0));
+ else
+ motherPhasedAlleles.add(motherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Create father's genotype
+ ArrayList fatherPhasedAlleles = new ArrayList(2);
+ fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
+ if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
+ fatherPhasedAlleles.add(fatherAlleles.get(0));
+ else
+ fatherPhasedAlleles.add(fatherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Create child's genotype
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Once a phased combination is found; exit
+ return;
+ }
+ }
+
+ //If this is reached then no phasing could be found
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- //If this is reached then no phasing could be found
- trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- }
+ /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
+ If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
+ or single individual.
+ */
+ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
- If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
- or single individual.
- */
- public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
-
- //Take care of cases where one or more family members are no call
- if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){
- phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
- phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
- phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
- }
- else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){
- phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
- if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ //Take care of cases where one or more family members are no call
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
- else
- phasePairAlleles(father, child, FamilyMember.FATHER);
+ else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ else
+ phasePairAlleles(father, child, FamilyMember.FATHER);
+ }
+ else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phasePairAlleles(mother, child, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ }
+ //Special case for Het/Het/Het as it is ambiguous
+ else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ //All family members have genotypes and at least one of them is not Het
+ else{
+ phaseFamilyAlleles(mother, father, child);
+ }
}
- else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
- phasePairAlleles(mother, child, FamilyMember.MOTHER);
- phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+
+ /**
+ * Applies the trio genotype combination to the given trio.
+ * @param ref: Reference allele
+ * @param alt: Alternate allele
+ * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
+ * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
+ * @param childGenotype: Genotype of the child to phase using this trio genotype combination
+ * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
+ * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
+ */
+ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
}
- //Special case for Het/Het/Het as it is ambiguous
- else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
- phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
- phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
- phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
- }
- //All family members have genotypes and at least one of them is not Het
- else{
- phaseFamilyAlleles(mother, father, child);
- }
- }
- /**
- * Applies the trio genotype combination to the given trio.
- * @param ref: Reference allele
- * @param alt: Alternate allele
- * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
- * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
- * @param childGenotype: Genotype of the child to phase using this trio genotype combination
- * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
- * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
- */
- public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
- phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
- }
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
- private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+ //Handle null, missing and unavailable genotypes
+ //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
+ //genotype so it is safe to return the original genotype in this case.
+ if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
+ return genotype;
- //Handle null, missing and unavailable genotypes
- //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
- //genotype so it is safe to return the original genotype in this case.
- if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
- return genotype;
+ //Add the transmission probability
+ Map genotypeAttributes = new HashMap();
+ genotypeAttributes.putAll(genotype.getAttributes());
+ if(transmissionProb>NO_TRANSMISSION_PROB)
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
- //Add the transmission probability
- Map genotypeAttributes = new HashMap();
- genotypeAttributes.putAll(genotype.getAttributes());
- if(transmissionProb>NO_TRANSMISSION_PROB)
- genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
+ ArrayList phasedAlleles = new ArrayList(2);
+ for(Allele allele : phasedGenotype.getAlleles()){
+ if(allele.isReference())
+ phasedAlleles.add(refAllele);
+ else if(allele.isNonReference())
+ phasedAlleles.add(altAllele);
+ //At this point there should not be any other alleles left
+ else
+ throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
- ArrayList phasedAlleles = new ArrayList(2);
- for(Allele allele : phasedGenotype.getAlleles()){
- if(allele.isReference())
- phasedAlleles.add(refAllele);
- else if(allele.isNonReference())
- phasedAlleles.add(altAllele);
- //At this point there should not be any other alleles left
+ }
+
+ //Compute the new Log10Error if the genotype is different from the original genotype
+ double negLog10Error;
+ if(genotype.getType() == phasedGenotype.getType())
+ negLog10Error = genotype.getNegLog10PError();
else
- throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
+ negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
+ return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
}
- //Compute the new Log10Error if the genotype is different from the original genotype
- double negLog10Error;
- if(genotype.getType() == phasedGenotype.getType())
- negLog10Error = genotype.getNegLog10PError();
- else
- negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
-
- return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
- }
-
}
From 1f044faedd01317b71c3abf91ac935beb6581223 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 26 Oct 2011 19:57:09 +0200
Subject: [PATCH 11/44] - Genotype assignment in case of equally likeli
combination is now random - Genotype combinations with 0 confidence are now
left unphased
---
.../walkers/phasing/PhaseByTransmission.java | 65 +++++++++++++------
1 file changed, 44 insertions(+), 21 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 956cf7c89..f5e9ce6ea 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -109,6 +109,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_HET_HET_HET = 4;
private final Byte NUM_VIOLATIONS = 5;
+ //Random number generator
+ private Random rand = new Random();
+
private enum FamilyMember {
MOTHER,
FATHER,
@@ -309,17 +312,22 @@ public class PhaseByTransmission extends RodWalker, HashMa
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+ int phredScoreTransmission = -1;
+ if(transmissionProb != NO_TRANSMISSION_PROB)
+ phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
+
//Handle null, missing and unavailable genotypes
//Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
//genotype so it is safe to return the original genotype in this case.
- if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
+ //In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
+ if(phredScoreTransmission ==0 || genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
return genotype;
//Add the transmission probability
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB)
- genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
ArrayList phasedAlleles = new ArrayList(2);
for(Allele allele : phasedGenotype.getAlleles()){
@@ -538,36 +546,46 @@ public class PhaseByTransmission extends RodWalker, HashMa
//Prior vars
double bestConfigurationLikelihood = 0.0;
double norm = 0.0;
- boolean isMV = false;
- int bestConfigurationGenotypeDiffs=4;
- Genotype.Type bestMotherGenotype = getTypeSafeNull(mother);
- Genotype.Type bestFatherGenotype = getTypeSafeNull(father);
- Genotype.Type bestChildGenotype = getTypeSafeNull(child);
+ int configuration_index =0;
+ ArrayList isMV = new ArrayList();
+ isMV.add(false);
+ ArrayList bestMotherGenotype = new ArrayList();
+ bestMotherGenotype.add(getTypeSafeNull(mother));
+ ArrayList bestFatherGenotype = new ArrayList();
+ bestFatherGenotype.add(getTypeSafeNull(father));
+ ArrayList bestChildGenotype = new ArrayList();
+ bestChildGenotype.add(getTypeSafeNull(child));
//Get the most likely combination
//Only check for most likely combination if at least a parent and the child have genotypes
if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){
int mvCount;
double configurationLikelihood;
- int configurationGenotypeDiffs;
for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
for(Map.Entry childGenotype : childLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
norm += configurationLikelihood;
- configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
//Keep this combination if
//It has a better likelihood
//Or it has the same likelihood but requires less changes from original genotypes
- if ((configurationLikelihood > bestConfigurationLikelihood) ||
- (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
+ if (configurationLikelihood > bestConfigurationLikelihood){
bestConfigurationLikelihood = configurationLikelihood;
- bestMotherGenotype = motherGenotype.getKey();
- bestFatherGenotype = fatherGenotype.getKey();
- bestChildGenotype = childGenotype.getKey();
- isMV = mvCount>0;
- bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ isMV.clear();
+ isMV.add(mvCount>0);
+ bestMotherGenotype.clear();
+ bestMotherGenotype.add(motherGenotype.getKey());
+ bestFatherGenotype.clear();
+ bestFatherGenotype.add(fatherGenotype.getKey());
+ bestChildGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestMotherGenotype.add(motherGenotype.getKey());
+ bestFatherGenotype.add(fatherGenotype.getKey());
+ bestChildGenotype.add(childGenotype.getKey());
+ isMV.add(mvCount>0);
}
}
}
@@ -575,17 +593,22 @@ public class PhaseByTransmission extends RodWalker, HashMa
//normalize the best configuration probability
bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
+
+ //In case of multiple equally likely combinations, take a random one
+ if(bestMotherGenotype.size()>1){
+ configuration_index = rand.nextInt(bestMotherGenotype.size()-1);
+ }
+
}
else{
bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
}
- //Get the phased alleles for the genotype configuration
- TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype);
+ TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
- //Return the phased genotypes
- phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
- return isMV;
+ //Return the phased genotypes
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
+ return isMV.get(configuration_index);
}
From b91a9c4711d60b361a226a517a97e68217fdd5cf Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 2 Nov 2011 08:04:01 +0100
Subject: [PATCH 12/44] - Fixed parent/child pairs handling (was crashing
before) - Added parent/child pair reporting
---
.../walkers/phasing/PhaseByTransmission.java | 151 ++++++++++++------
1 file changed, 101 insertions(+), 50 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index f5e9ce6ea..5b2024f96 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -105,9 +105,13 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
- private final Byte NUM_HET = 3;
- private final Byte NUM_HET_HET_HET = 4;
- private final Byte NUM_VIOLATIONS = 5;
+ private final Byte NUM_TRIO_HET_HET_HET = 3;
+ private final Byte NUM_TRIO_VIOLATIONS = 4;
+ private final Byte NUM_PAIR_GENOTYPES_CALLED = 5;
+ private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6;
+ private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
+ private final Byte NUM_PAIR_HET_HET = 8;
+ private final Byte NUM_PAIR_VIOLATIONS = 9;
//Random number generator
private Random rand = new Random();
@@ -172,6 +176,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ return;
}
ArrayList parentAlleles = getAlleles(parentGenotype);
@@ -612,6 +617,48 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+
+ private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){
+
+ //Increment metrics counters
+ if(parent.isCalled() && child.isCalled()){
+ counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
+ if(parent.isPhased())
+ counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
+ else{
+ if(parent.isHet() && child.isHet())
+ counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
+
+ else if(isMV)
+ counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1);
+ }
+ }else{
+ counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
+ }
+
+ }
+
+ private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){
+
+ //Increment metrics counters
+ if(mother.isCalled() && father.isCalled() && child.isCalled()){
+ counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
+ if(mother.isPhased())
+ counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
+
+ else{
+ if(mother.isHet() && father.isHet() && child.isHet())
+ counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
+
+ else if(isMV)
+ counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+
+ }
+ }else{
+ counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
+ }
+ }
+
/**
* For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype
*
@@ -623,13 +670,17 @@ public class PhaseByTransmission extends RodWalker, HashMa
@Override
public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
- //Local cars to avoid lookups on increment
- int numTrioGenotypesCalled = 0;
- int numTrioGenotypesNoCall = 0;
- int numTrioGenotypesPhased = 0;
- int numHet = 0 ;
- int numHetHetHet = 0;
- int numMVs = 0;
+ HashMap metricsCounters = new HashMap(10);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
@@ -654,30 +705,26 @@ public class PhaseByTransmission extends RodWalker, HashMa
Genotype phasedFather = trioGenotypes.get(1);
Genotype phasedChild = trioGenotypes.get(2);
- genotypeMap.put(phasedMother.getSampleName(), phasedMother);
- genotypeMap.put(phasedFather.getSampleName(), phasedFather);
- genotypeMap.put(phasedChild.getSampleName(), phasedChild);
-
- //Increment metrics counters
- if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){
- numTrioGenotypesCalled++;
- if(phasedMother.isPhased())
- numTrioGenotypesPhased++;
-
- if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){
- numHet++;
- if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet())
- numHetHetHet++;
+ //Fill the genotype map with the new genotypes and increment metrics counters
+ genotypeMap.put(phasedChild.getSampleName(),phasedChild);
+ if(mother != null){
+ genotypeMap.put(phasedMother.getSampleName(), phasedMother);
+ if(father != null){
+ genotypeMap.put(phasedFather.getSampleName(), phasedFather);
+ updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters);
}
-
- if(isMV){
- numMVs++;
- if(mvFile != null)
- mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
- }
- }else{
- numTrioGenotypesNoCall++;
+ else
+ updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters);
}
+ else{
+ genotypeMap.put(phasedFather.getSampleName(),phasedFather);
+ updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters);
+ }
+
+ //Report violation if set so
+ //TODO: ADAPT FOR PAIRS TOO!!
+ if(isMV && mvFile != null)
+ mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
}
@@ -686,14 +733,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
vcfWriter.add(newvc);
}
-
- HashMap metricsCounters = new HashMap(5);
- metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled);
- metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall);
- metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased);
- metricsCounters.put(NUM_HET,numHet);
- metricsCounters.put(NUM_HET_HET_HET,numHetHetHet);
- metricsCounters.put(NUM_VIOLATIONS,numMVs);
return metricsCounters;
}
@@ -704,13 +743,17 @@ public class PhaseByTransmission extends RodWalker, HashMa
*/
@Override
public HashMap reduceInit() {
- HashMap metricsCounters = new HashMap(5);
+ HashMap metricsCounters = new HashMap(10);
metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
- metricsCounters.put(NUM_HET,0);
- metricsCounters.put(NUM_HET_HET_HET,0);
- metricsCounters.put(NUM_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
return metricsCounters;
}
@@ -726,9 +769,13 @@ public class PhaseByTransmission extends RodWalker, HashMa
sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED));
sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL));
sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED));
- sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET));
- sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET));
- sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS));
+ sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET));
+ sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS));
+ sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED));
+ sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL));
+ sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED));
+ sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
+ sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
return sum;
}
@@ -742,8 +789,12 @@ public class PhaseByTransmission extends RodWalker, HashMa
logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
- logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET));
- logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET));
- logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS));
+ logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET));
+ logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED));
+ logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL));
+ logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED));
+ logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET));
+ logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS));
}
}
From 119ca7d742371423060bcc7150f908e1cb04de8a Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 2 Nov 2011 08:22:33 +0100
Subject: [PATCH 13/44] Fixed a bug in parent/child pairs reporting causing a
crash in case the -mvf option was used and mother was not provided
---
.../sting/gatk/walkers/phasing/PhaseByTransmission.java | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 5b2024f96..806a84e9d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -681,6 +681,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ String mvfLine = "";
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
@@ -712,19 +713,23 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(father != null){
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
- else
+ else{
updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ }
}
else{
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
//Report violation if set so
//TODO: ADAPT FOR PAIRS TOO!!
if(isMV && mvFile != null)
- mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
+ mvFile.println(mvfLine);
}
From 19ad5b635ad46f119ed1beccccf7bb2bc6a18aef Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 2 Nov 2011 18:35:31 +0100
Subject: [PATCH 14/44] - Calculation of parent/child pairs corrected -
Separated the reporting of single and double mendelian violations in trios
---
.../walkers/phasing/PhaseByTransmission.java | 177 ++++++++++++------
1 file changed, 120 insertions(+), 57 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 806a84e9d..3c39b58d3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -107,6 +107,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
private final Byte NUM_TRIO_HET_HET_HET = 3;
private final Byte NUM_TRIO_VIOLATIONS = 4;
+ private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10;
private final Byte NUM_PAIR_GENOTYPES_CALLED = 5;
private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6;
private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
@@ -507,17 +508,14 @@ public class PhaseByTransmission extends RodWalker, HashMa
return count;
}
- //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single
- //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0
+ //Get a Map of genotype likelihoods.
+ //In case of null, unavailable or no call, all likelihoods are 1/3.
private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
- if(genotype == null || !genotype.isAvailable()){
+ if(genotype == null || !genotype.isCalled()){
EnumMap likelihoods = new EnumMap(Genotype.Type.class);
- likelihoods.put(Genotype.Type.UNAVAILABLE,1.0);
- return likelihoods;
- }
- else if(genotype.isNoCall()){
- EnumMap likelihoods = new EnumMap(Genotype.Type.class);
- likelihoods.put(Genotype.Type.NO_CALL,1.0);
+ likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0);
+ likelihoods.put(Genotype.Type.HET,1.0/3.0);
+ likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0);
return likelihoods;
}
return genotype.getLikelihoods().getAsMap(true);
@@ -541,57 +539,113 @@ public class PhaseByTransmission extends RodWalker, HashMa
* @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child
* @return
*/
- private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
+ private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
- //Get the PL
- Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother);
- Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ //Check whether it is a pair or trio
+ //Always assign the first parent as the parent having genotype information in pairs
+ //Always assign the mother as the first parent in trios
+ int parentsCalled = 0;
+ Map firstParentLikelihoods;
+ Map secondParentLikelihoods;
+ Genotype.Type pairSecondParentGenotype = null;
+ if(mother == null || !mother.isCalled()){
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
+ if(father != null && father.isCalled())
+ parentsCalled = 1;
+ }
+ else{
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ if(father == null || !father.isCalled()){
+ parentsCalled = 1;
+ pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
+ }else{
+ parentsCalled = 2;
+ }
+ }
Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
//Prior vars
double bestConfigurationLikelihood = 0.0;
double norm = 0.0;
int configuration_index =0;
- ArrayList isMV = new ArrayList();
- isMV.add(false);
- ArrayList bestMotherGenotype = new ArrayList();
- bestMotherGenotype.add(getTypeSafeNull(mother));
- ArrayList bestFatherGenotype = new ArrayList();
- bestFatherGenotype.add(getTypeSafeNull(father));
+ ArrayList bestMVCount = new ArrayList();
+ bestMVCount.add(0);
+
+ ArrayList bestFirstParentGenotype = new ArrayList();
+ ArrayList bestSecondParentGenotype = new ArrayList();
ArrayList bestChildGenotype = new ArrayList();
+ bestFirstParentGenotype.add(getTypeSafeNull(mother));
+ bestSecondParentGenotype.add(getTypeSafeNull(father));
bestChildGenotype.add(getTypeSafeNull(child));
//Get the most likely combination
//Only check for most likely combination if at least a parent and the child have genotypes
- if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){
+ if(child.isCalled() && parentsCalled > 0){
int mvCount;
- double configurationLikelihood;
- for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
- for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
- for(Map.Entry childGenotype : childLikelihoods.entrySet()){
- mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
- configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ int cumulativeMVCount = 0;
+ double configurationLikelihood = 0;
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){
+ for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
+ //For parent/child pairs, sum over the possible genotype configurations of the missing parent
+ if(parentsCalled<2){
+ cumulativeMVCount += mvCount;
+ configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ }
+ //Evaluate configurations of trios
+ else{
+ configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ //Keep this combination if
+ //It has a better likelihood
+ //Or it has the same likelihood but requires less changes from original genotypes
+ if (configurationLikelihood > bestConfigurationLikelihood){
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMVCount.clear();
+ bestMVCount.add(mvCount);
+ bestFirstParentGenotype.clear();
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.clear();
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.add(childGenotype.getKey());
+ bestMVCount.add(mvCount);
+ }
+ }
+ }
+ //Evaluate configurations of parent/child pairs
+ if(parentsCalled<2){
norm += configurationLikelihood;
//Keep this combination if
//It has a better likelihood
//Or it has the same likelihood but requires less changes from original genotypes
if (configurationLikelihood > bestConfigurationLikelihood){
bestConfigurationLikelihood = configurationLikelihood;
- isMV.clear();
- isMV.add(mvCount>0);
- bestMotherGenotype.clear();
- bestMotherGenotype.add(motherGenotype.getKey());
- bestFatherGenotype.clear();
- bestFatherGenotype.add(fatherGenotype.getKey());
+ bestMVCount.clear();
+ bestMVCount.add(cumulativeMVCount/3);
bestChildGenotype.clear();
+ bestFirstParentGenotype.clear();
+ bestSecondParentGenotype.clear();
bestChildGenotype.add(childGenotype.getKey());
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
}
else if(configurationLikelihood == bestConfigurationLikelihood) {
- bestMotherGenotype.add(motherGenotype.getKey());
- bestFatherGenotype.add(fatherGenotype.getKey());
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
bestChildGenotype.add(childGenotype.getKey());
- isMV.add(mvCount>0);
+ bestMVCount.add(cumulativeMVCount/3);
}
+ configurationLikelihood = 0;
}
}
}
@@ -600,8 +654,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
//In case of multiple equally likely combinations, take a random one
- if(bestMotherGenotype.size()>1){
- configuration_index = rand.nextInt(bestMotherGenotype.size()-1);
+ if(bestFirstParentGenotype.size()>1){
+ configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1);
}
}
@@ -609,16 +663,20 @@ public class PhaseByTransmission extends RodWalker, HashMa
bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
}
- TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+ TrioPhase phasedTrioGenotypes;
+ if(parentsCalled < 2 && mother == null || !mother.isCalled())
+ phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+ else
+ phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
//Return the phased genotypes
phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
- return isMV.get(configuration_index);
+ return bestMVCount.get(configuration_index);
}
- private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){
+ private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){
//Increment metrics counters
if(parent.isCalled() && child.isCalled()){
@@ -626,11 +684,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(parent.isPhased())
counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
else{
+ counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
if(parent.isHet() && child.isHet())
counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
-
- else if(isMV)
- counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1);
}
}else{
counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
@@ -638,7 +694,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
- private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){
+ private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){
//Increment metrics counters
if(mother.isCalled() && father.isCalled() && child.isCalled()){
@@ -647,11 +703,14 @@ public class PhaseByTransmission extends RodWalker, HashMa
counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
else{
- if(mother.isHet() && father.isHet() && child.isHet())
- counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
-
- else if(isMV)
- counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+ if(mvCount > 0){
+ if(mvCount >1)
+ counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
+ else
+ counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+ }
+ else if(mother.isHet() && father.isHet() && child.isHet())
+ counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
}
}else{
@@ -681,14 +740,15 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
- String mvfLine = "";
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ String mvfLine;
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
Map genotypeMap = vc.getGenotypes();
- boolean isMV;
+ int mvCount;
for (Sample sample : trios) {
Genotype mother = vc.getGenotype(sample.getMaternalID());
@@ -700,7 +760,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
continue;
ArrayList trioGenotypes = new ArrayList(3);
- isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
+ mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
Genotype phasedMother = trioGenotypes.get(0);
Genotype phasedFather = trioGenotypes.get(1);
@@ -712,23 +772,23 @@ public class PhaseByTransmission extends RodWalker, HashMa
genotypeMap.put(phasedMother.getSampleName(), phasedMother);
if(father != null){
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
- updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters);
+ updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
else{
- updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters);
+ updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
}
else{
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
- updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters);
+ updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
//Report violation if set so
//TODO: ADAPT FOR PAIRS TOO!!
- if(isMV && mvFile != null)
+ if(mvCount>0 && mvFile != null)
mvFile.println(mvfLine);
}
@@ -759,6 +819,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
return metricsCounters;
}
@@ -781,6 +842,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED));
sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
+ sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS));
return sum;
}
@@ -795,7 +857,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET));
- logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS));
logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED));
logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL));
logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED));
From 893787de535fde77b56d02e5c006400e021f76c1 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 3 Nov 2011 13:04:11 +0100
Subject: [PATCH 15/44] Functions getAsMap and getNegLog10GQ now handle missing
genotype case.
---
.../sting/utils/variantcontext/GenotypeLikelihoods.java | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index 47c93bb1b..8c8e4f257 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -101,9 +101,12 @@ public class GenotypeLikelihoods {
}
//Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
+ //Returns null in case of missing likelihoods
public EnumMap getAsMap(boolean normalizeFromLog10){
//Make sure that the log10likelihoods are set
double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
+ if(likelihoods == null)
+ return null;
EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class);
likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
@@ -112,10 +115,13 @@ public class GenotypeLikelihoods {
}
//Return the neg log10 Genotype Quality (GQ) for the given genotype
+ //Returns Double.NEGATIVE_INFINITY in case of missing genotype
public double getNegLog10GQ(Genotype.Type genotype){
double qual = Double.NEGATIVE_INFINITY;
EnumMap likelihoods = getAsMap(false);
+ if(likelihoods == null)
+ return qual;
for(Map.Entry likelihood : likelihoods.entrySet()){
if(likelihood.getKey() == genotype)
continue;
From 385a6abec144fc72c726f88216cdcd9c3a4fa435 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 3 Nov 2011 13:04:53 +0100
Subject: [PATCH 16/44] Fixed a bug that wrongly swapped the mother and father
genotypes in case the child genotype missing.
---
.../gatk/walkers/phasing/PhaseByTransmission.java | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 3c39b58d3..244527212 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -547,10 +547,15 @@ public class PhaseByTransmission extends RodWalker, HashMa
int parentsCalled = 0;
Map firstParentLikelihoods;
Map secondParentLikelihoods;
+ ArrayList bestFirstParentGenotype = new ArrayList();
+ ArrayList bestSecondParentGenotype = new ArrayList();
+ ArrayList bestChildGenotype = new ArrayList();
Genotype.Type pairSecondParentGenotype = null;
if(mother == null || !mother.isCalled()){
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ bestFirstParentGenotype.add(getTypeSafeNull(father));
+ bestSecondParentGenotype.add(getTypeSafeNull(mother));
pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
if(father != null && father.isCalled())
parentsCalled = 1;
@@ -558,6 +563,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
else{
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ bestFirstParentGenotype.add(getTypeSafeNull(mother));
+ bestSecondParentGenotype.add(getTypeSafeNull(father));
if(father == null || !father.isCalled()){
parentsCalled = 1;
pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
@@ -566,6 +573,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
+ bestChildGenotype.add(getTypeSafeNull(child));
//Prior vars
double bestConfigurationLikelihood = 0.0;
@@ -574,13 +582,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
ArrayList bestMVCount = new ArrayList();
bestMVCount.add(0);
- ArrayList bestFirstParentGenotype = new ArrayList();
- ArrayList bestSecondParentGenotype = new ArrayList();
- ArrayList bestChildGenotype = new ArrayList();
- bestFirstParentGenotype.add(getTypeSafeNull(mother));
- bestSecondParentGenotype.add(getTypeSafeNull(father));
- bestChildGenotype.add(getTypeSafeNull(child));
-
//Get the most likely combination
//Only check for most likely combination if at least a parent and the child have genotypes
if(child.isCalled() && parentsCalled > 0){
From 571c724cfdc378dac05573146f64e3da7e6424ec Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Tue, 8 Nov 2011 15:15:51 +0100
Subject: [PATCH 17/44] Added reporting of the number of genotypes updated.
---
.../walkers/phasing/PhaseByTransmission.java | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 244527212..6394e0e24 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -113,6 +113,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
private final Byte NUM_PAIR_HET_HET = 8;
private final Byte NUM_PAIR_VIOLATIONS = 9;
+ private final Byte NUM_GENOTYPES_MODIFIED = 11;
//Random number generator
private Random rand = new Random();
@@ -742,6 +743,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,0);
+
String mvfLine;
if (tracker != null) {
@@ -775,15 +778,21 @@ public class PhaseByTransmission extends RodWalker, HashMa
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
}
else{
updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters);
+ if(!(phasedMother.getType()==mother.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
}
else{
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
+ if(!(phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
@@ -820,7 +829,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
- metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ metricsCounters.put(NUM_GENOTYPES_MODIFIED,0);
+
return metricsCounters;
}
@@ -844,6 +855,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS));
+ sum.put(NUM_GENOTYPES_MODIFIED,value.get(NUM_GENOTYPES_MODIFIED)+sum.get(NUM_GENOTYPES_MODIFIED));
+
return sum;
}
@@ -865,5 +878,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED));
logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET));
logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS));
+ logger.info("Number of genotypes updated: " + result.get(NUM_GENOTYPES_MODIFIED));
+
}
}
From 1202a809cb8ac10193a4fb2cd49f3b843e18f82e Mon Sep 17 00:00:00 2001
From: Roger Zurawicki
Date: Sun, 13 Nov 2011 22:27:49 -0500
Subject: [PATCH 18/44] Added Basic Unit Tests for ReadClipper
Tests some but not all functions
Some tests have been disabled because they are not working
---
.../utils/clipreads/ReadClipperUnitTest.java | 152 +++++++++++-------
1 file changed, 94 insertions(+), 58 deletions(-)
diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
index f625af23c..0c71a845e 100644
--- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
@@ -30,9 +30,12 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
-import org.testng.annotations.BeforeClass;
+import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
+import java.util.LinkedList;
+import java.util.List;
+
/**
* Created by IntelliJ IDEA.
* User: roger
@@ -44,44 +47,57 @@ public class ReadClipperUnitTest extends BaseTest {
// TODO: Add error messages on failed tests
+
+ //int debug = 0;
+
GATKSAMRecord read, expected;
ReadClipper readClipper;
final static String BASES = "ACTG";
final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63
- @BeforeClass
+ // What the test read looks like
+ // Ref: 1 2 3 4 5 6 7 8
+ // Read: 0 1 2 3 - - - -
+ // -----------------------------
+ // Bases: A C T G - - - -
+ // Quals: ! + 5 ? - - - -
+
+ @BeforeTest
public void init() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
- read.setReadUnmappedFlag(true);
read.setReadBases(new String(BASES).getBytes());
read.setBaseQualityString(new String(QUALS));
readClipper = new ReadClipper(read);
+ //logger.warn(read.getCigarString());
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipBothEndsByReferenceCoordinates() {
+ init();
logger.warn("Executing testHardClipBothEndsByReferenceCoordinates");
-
+ //int debug = 1;
//Clip whole read
- Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader()));
+ Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader()));
//clip 1 base
- expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3);
+ expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4);
Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes());
Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3));
Assert.assertEquals(expected.getCigarString(), "1H2M1H");
}
- @Test ( enabled = false )
+ @Test ( enabled = false ) // TODO This fails at hardClipCigar and returns a NullPointerException
public void testHardClipByReadCoordinates() {
+ init();
logger.warn("Executing testHardClipByReadCoordinates");
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader()));
//clip 1 base at start
+ System.out.println(readClipper.read.getCigarString());
expected = readClipper.hardClipByReadCoordinates(0,0);
Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
@@ -107,83 +123,101 @@ public class ReadClipperUnitTest extends BaseTest {
}
- @Test ( enabled = false )
+ public void testIfEqual( GATKSAMRecord read, byte[] readBases, String baseQuals, String cigar) {
+ Assert.assertEquals(read.getReadBases(), readBases);
+ Assert.assertEquals(read.getBaseQualityString(), baseQuals);
+ Assert.assertEquals(read.getCigarString(), cigar);
+ }
+
+ public class testParameter {
+ int inputStart;
+ int inputStop;
+ int substringStart;
+ int substringStop;
+ String cigar;
+
+ public testParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) {
+ inputStart = InputStart;
+ inputStop = InputStop;
+ substringStart = SubstringStart;
+ substringStop = SubstringStop;
+ cigar = Cigar;
+ }
+ }
+
+ @Test ( enabled = true )
public void testHardClipByReferenceCoordinates() {
logger.warn("Executing testHardClipByReferenceCoordinates");
-
+ //logger.warn(debug);
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipByReferenceCoordinates(-1,1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
+ List testList = new LinkedList();
+ testList.add(new testParameter(-1,1,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(4,-1,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(-1,2,2,4,"2H2M"));//clip 2 bases at start
+ testList.add(new testParameter(3,-1,0,2,"2M2H"));//clip 2 bases at end
- //clip 1 base at end
- expected = readClipper.hardClipByReferenceCoordinates(3,-1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
-
- //clip 2 bases at start
- expected = readClipper.hardClipByReferenceCoordinates(-1,2);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
-
- //clip 2 bases at end
- expected = readClipper.hardClipByReferenceCoordinates(2,-1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
+ for ( testParameter p : testList ) {
+ init();
+ //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReferenceCoordinates(p.inputStart,p.inputStop),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipByReferenceCoordinatesLeftTail() {
+ init();
logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail");
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
+ List testList = new LinkedList();
+ testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(2,-1,2,4,"2H2M"));//clip 2 bases at start
- //clip 2 bases at start
- expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
+ for ( testParameter p : testList ) {
+ init();
+ //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = true )
public void testHardClipByReferenceCoordinatesRightTail() {
+ init();
logger.warn("Executing testHardClipByReferenceCoordinatesRightTail");
//Clip whole read
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at end
- expected = readClipper.hardClipByReferenceCoordinatesRightTail(3);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
+ List testList = new LinkedList();
+ testList.add(new testParameter(-1,4,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(-1,3,0,2,"2M2H"));//clip 2 bases at end
- //clip 2 bases at end
- expected = readClipper.hardClipByReferenceCoordinatesRightTail(2);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
+ for ( testParameter p : testList ) {
+ init();
+ //logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
}
- @Test ( enabled = false )
+ @Test ( enabled = false ) // TODO This function is returning null reads
public void testHardClipLowQualEnds() {
+ init();
logger.warn("Executing testHardClipByReferenceCoordinates");
@@ -192,6 +226,7 @@ public class ReadClipperUnitTest extends BaseTest {
//clip 1 base at start
expected = readClipper.hardClipLowQualEnds((byte)34);
+ logger.warn(expected.getBaseQualities().toString()+","+expected.getBaseQualityString());
Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
Assert.assertEquals(expected.getCigarString(), "1H3M");
@@ -203,10 +238,11 @@ public class ReadClipperUnitTest extends BaseTest {
Assert.assertEquals(expected.getCigarString(), "2H2M");
// Reverse Quals sequence
- readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
+ //readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
//clip 1 base at end
- expected = readClipper.hardClipLowQualEnds((byte)34);
+ expected = readClipper.hardClipLowQualEnds((byte)'!');
+ logger.warn(expected.getBaseQualities().toString()+","+expected.getBaseQualityString());
Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
Assert.assertEquals(expected.getCigarString(), "3M1H");
@@ -220,4 +256,4 @@ public class ReadClipperUnitTest extends BaseTest {
// revert Qual sequence
readClipper.getRead().setBaseQualityString(QUALS);
}
-}
+}
\ No newline at end of file
From 34acf8b9789a263b991b65477e28b8fe3f25881d Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 14 Nov 2011 10:47:02 +0100
Subject: [PATCH 19/44] Added Unit tests for new methods in GenotypeLikelihoods
---
.../GenotypeLikelihoodsUnitTest.java | 47 +++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
index 9243588ab..f3d0dedcd 100755
--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java
@@ -29,10 +29,13 @@ package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing.
+import org.broadinstitute.sting.utils.MathUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import java.util.EnumMap;
+
/**
* Basic unit test for Genotype likelihoods objects
@@ -69,6 +72,50 @@ public class GenotypeLikelihoodsUnitTest {
gl.getAsVector();
}
+ @Test
+ public void testGetAsMap(){
+ GenotypeLikelihoods gl = new GenotypeLikelihoods(v);
+ //Log scale
+ EnumMap glMap = gl.getAsMap(false);
+ Assert.assertEquals(v[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF));
+ Assert.assertEquals(v[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET));
+ Assert.assertEquals(v[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR));
+
+ //Linear scale
+ glMap = gl.getAsMap(true);
+ double [] vl = MathUtils.normalizeFromLog10(v);
+ Assert.assertEquals(vl[Genotype.Type.HOM_REF.ordinal()-1],glMap.get(Genotype.Type.HOM_REF));
+ Assert.assertEquals(vl[Genotype.Type.HET.ordinal()-1],glMap.get(Genotype.Type.HET));
+ Assert.assertEquals(vl[Genotype.Type.HOM_VAR.ordinal()-1],glMap.get(Genotype.Type.HOM_VAR));
+
+ //Test missing likelihoods
+ gl = new GenotypeLikelihoods(".");
+ glMap = gl.getAsMap(false);
+ Assert.assertNull(glMap);
+
+ }
+
+ @Test
+ public void testGetNegLog10GQ(){
+ GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString);
+
+ //GQ for the best guess genotype
+ Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),3.9);
+
+ double[] test = MathUtils.normalizeFromLog10(gl.getAsVector());
+
+ //GQ for the other genotypes
+ Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1]));
+ Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR), -1.0 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1]));
+
+ //Test missing likelihoods
+ gl = new GenotypeLikelihoods(".");
+ Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_REF),Double.NEGATIVE_INFINITY);
+ Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HET),Double.NEGATIVE_INFINITY);
+ Assert.assertEquals(gl.getNegLog10GQ(Genotype.Type.HOM_VAR),Double.NEGATIVE_INFINITY);
+
+ }
+
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
Assert.assertEquals(v1.length, v2.length);
for ( int i = 0; i < v1.length; i++ ) {
From 6881d4800c3f782255152f63d453f12451a96509 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 14 Nov 2011 10:47:51 +0100
Subject: [PATCH 20/44] Added Integration tests for Phasing by Transmission
---
.../PhaseByTransmissionIntegrationTest.java | 122 +++++++++++++++++-
1 file changed, 115 insertions(+), 7 deletions(-)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
index c663c1dd7..2cd76e7a5 100644
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java
@@ -6,23 +6,131 @@ import org.testng.annotations.Test;
import java.util.Arrays;
public class PhaseByTransmissionIntegrationTest extends WalkerTest {
- private static String phaseByTransmissionTestDataRoot = validationDataLocation + "/PhaseByTransmission";
- private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf";
+ private static String phaseByTransmissionTestDataRoot = validationDataLocation + "PhaseByTransmission/";
+ private static String goodFamilyFile = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.goodFamilies.ped";
+ private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf";
+ private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf";
+ private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf";
+ private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf";
+ //Tests using PbT on all genotypes with default parameters
+ //And all reporting options
@Test
- public void testBasicFunctionality() {
+ public void testTrueNegativeMV() {
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
"-T PhaseByTransmission",
"-NO_HEADER",
"-R " + b37KGReference,
- "--variant " + fundamentalTestVCF,
- "-f NA12892+NA12891=NA12878",
+ "--variant " + TNTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2,
+ Arrays.asList("16fefda693156eadf1481fd9de23facb","9418a7a6405b78179ca13a67b8bfcc14")
+ );
+ executeTest("testTrueNegativeMV", spec);
+ }
+
+ @Test
+ public void testTruePositiveMV() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + TPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2,
+ Arrays.asList("14cf1d21a54d8b9fb506df178b634c56","efc66ae3d036715b721f9bd35b65d556")
+ );
+ executeTest("testTruePositiveMV", spec);
+ }
+
+ @Test
+ public void testFalsePositiveMV() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2,
+ Arrays.asList("f9b0fae9fe1e0f09b883a292b0e70a12","398724bc1e65314cc5ee92706e05a3ee")
+ );
+ executeTest("testFalsePositiveMV", spec);
+ }
+
+ @Test
+ public void testSpecialCases() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + SpecialTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2,
+ Arrays.asList("b8d1aa3789ce77b45430c62d13ee3006","a1a333e08fafb288cda0e7711909e1c3")
+ );
+ executeTest("testSpecialCases", spec);
+ }
+
+ //Test using a different prior
+ //Here the FP file is used but as the prior is lowered, 3 turn to TP
+ @Test
+ public void testPriorOption() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
+ "-prior 1e-4",
+ "-mvf %s",
+ "-o %s"
+ ),
+ 2,
+ Arrays.asList("7201ce7cc47db5840ac6b647709f7c33","c11b5e7cd7459d90d0160f917eff3b1e")
+ );
+ executeTest("testPriorOption", spec);
+ }
+
+ //Test when running without MV reporting option
+ //This is the exact same test file as FP but should not generate a .mvf file
+ @Test
+ public void testMVFileOption() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ buildCommandLine(
+ "-T PhaseByTransmission",
+ "-NO_HEADER",
+ "-R " + b37KGReference,
+ "--variant " + FPTest,
+ "-ped "+ goodFamilyFile,
+ "-L 1:10109-10315",
"-o %s"
),
1,
- Arrays.asList("")
+ Arrays.asList("398724bc1e65314cc5ee92706e05a3ee")
);
- executeTest("testBasicFunctionality", spec);
+ executeTest("testMVFileOption", spec);
}
+
}
From 284430d61dafcdf47177040bbf416b7501e47168 Mon Sep 17 00:00:00 2001
From: Roger Zurawicki
Date: Tue, 15 Nov 2011 00:13:52 -0500
Subject: [PATCH 21/44] Added more basic UnitTests for ReadClipper
hardClipByReadCoordinatesWorks
hardClipLowQualTailsWorks
---
.../utils/clipreads/ReadClipperUnitTest.java | 206 +++++++++---------
1 file changed, 103 insertions(+), 103 deletions(-)
diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
index 0c71a845e..ecb5a6d33 100644
--- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java
@@ -30,8 +30,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
-import org.testng.annotations.BeforeTest;
-import org.testng.annotations.Test;
+import org.testng.annotations.*;
import java.util.LinkedList;
import java.util.List;
@@ -47,7 +46,6 @@ public class ReadClipperUnitTest extends BaseTest {
// TODO: Add error messages on failed tests
-
//int debug = 0;
GATKSAMRecord read, expected;
@@ -55,73 +53,6 @@ public class ReadClipperUnitTest extends BaseTest {
final static String BASES = "ACTG";
final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63
- // What the test read looks like
- // Ref: 1 2 3 4 5 6 7 8
- // Read: 0 1 2 3 - - - -
- // -----------------------------
- // Bases: A C T G - - - -
- // Quals: ! + 5 ? - - - -
-
- @BeforeTest
- public void init() {
- SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
- read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
- read.setReadBases(new String(BASES).getBytes());
- read.setBaseQualityString(new String(QUALS));
-
- readClipper = new ReadClipper(read);
- //logger.warn(read.getCigarString());
- }
-
- @Test ( enabled = true )
- public void testHardClipBothEndsByReferenceCoordinates() {
- init();
- logger.warn("Executing testHardClipBothEndsByReferenceCoordinates");
- //int debug = 1;
- //Clip whole read
- Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader()));
- //clip 1 base
- expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3));
- Assert.assertEquals(expected.getCigarString(), "1H2M1H");
-
- }
-
- @Test ( enabled = false ) // TODO This fails at hardClipCigar and returns a NullPointerException
- public void testHardClipByReadCoordinates() {
- init();
- logger.warn("Executing testHardClipByReadCoordinates");
-
- //Clip whole read
- Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader()));
-
- //clip 1 base at start
- System.out.println(readClipper.read.getCigarString());
- expected = readClipper.hardClipByReadCoordinates(0,0);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
-
- //clip 1 base at end
- expected = readClipper.hardClipByReadCoordinates(3,3);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
-
- //clip 2 bases at start
- expected = readClipper.hardClipByReadCoordinates(0,1);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
-
- //clip 2 bases at end
- expected = readClipper.hardClipByReadCoordinates(2,3);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
-
- }
public void testIfEqual( GATKSAMRecord read, byte[] readBases, String baseQuals, String cigar) {
Assert.assertEquals(read.getReadBases(), readBases);
@@ -145,6 +76,65 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
+ // What the test read looks like
+ // Ref: 1 2 3 4 5 6 7 8
+ // Read: 0 1 2 3 - - - -
+ // -----------------------------
+ // Bases: A C T G - - - -
+ // Quals: ! + 5 ? - - - -
+
+ @BeforeMethod
+ public void init() {
+ SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
+ read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
+ read.setReadBases(new String(BASES).getBytes());
+ read.setBaseQualityString(new String(QUALS));
+
+ readClipper = new ReadClipper(read);
+ //logger.warn(read.getCigarString());
+ }
+
+ @Test ( enabled = true )
+ public void testHardClipBothEndsByReferenceCoordinates() {
+
+ logger.warn("Executing testHardClipBothEndsByReferenceCoordinates");
+ //int debug = 1;
+ //Clip whole read
+ Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(1,1), new GATKSAMRecord(read.getHeader()));
+
+ //clip 1 base
+ expected = readClipper.hardClipBothEndsByReferenceCoordinates(1,4);
+ Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes());
+ Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3));
+ Assert.assertEquals(expected.getCigarString(), "1H2M1H");
+
+ }
+
+ @Test ( enabled = true )
+ public void testHardClipByReadCoordinates() {
+
+ logger.warn("Executing testHardClipByReadCoordinates");
+
+ //Clip whole read
+ Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader()));
+
+ List testList = new LinkedList();
+ testList.add(new testParameter(0,0,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(3,3,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(0,1,2,4,"2H2M"));//clip 2 bases at start
+ testList.add(new testParameter(2,3,0,2,"2M2H"));//clip 2 bases at end
+
+ for ( testParameter p : testList ) {
+ init();
+ //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
+
+ }
+
@Test ( enabled = true )
public void testHardClipByReferenceCoordinates() {
logger.warn("Executing testHardClipByReferenceCoordinates");
@@ -178,8 +168,8 @@ public class ReadClipperUnitTest extends BaseTest {
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader()));
List testList = new LinkedList();
- testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start
- testList.add(new testParameter(2,-1,2,4,"2H2M"));//clip 2 bases at start
+ testList.add(new testParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start
+ testList.add(new testParameter(2, -1, 2, 4, "2H2M"));//clip 2 bases at start
for ( testParameter p : testList ) {
init();
@@ -201,8 +191,8 @@ public class ReadClipperUnitTest extends BaseTest {
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader()));
List testList = new LinkedList();
- testList.add(new testParameter(-1,4,0,3,"3M1H"));//clip 1 base at end
- testList.add(new testParameter(-1,3,0,2,"2M2H"));//clip 2 bases at end
+ testList.add(new testParameter(-1, 4, 0, 3, "3M1H"));//clip 1 base at end
+ testList.add(new testParameter(-1, 3, 0, 2, "2M2H"));//clip 2 bases at end
for ( testParameter p : testList ) {
init();
@@ -215,45 +205,55 @@ public class ReadClipperUnitTest extends BaseTest {
}
- @Test ( enabled = false ) // TODO This function is returning null reads
+ @Test ( enabled = true ) // TODO This function is returning null reads
public void testHardClipLowQualEnds() {
- init();
- logger.warn("Executing testHardClipByReferenceCoordinates");
+ logger.warn("Executing testHardClipByReferenceCoordinates");
//Clip whole read
Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader()));
- //clip 1 base at start
- expected = readClipper.hardClipLowQualEnds((byte)34);
- logger.warn(expected.getBaseQualities().toString()+","+expected.getBaseQualityString());
- Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4));
- Assert.assertEquals(expected.getCigarString(), "1H3M");
-
- //clip 2 bases at start
- expected = readClipper.hardClipLowQualEnds((byte)44);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4));
- Assert.assertEquals(expected.getCigarString(), "2H2M");
+ List testList = new LinkedList();
+ testList.add(new testParameter(1,-1,1,4,"1H3M"));//clip 1 base at start
+ testList.add(new testParameter(11,-1,2,4,"2H2M"));//clip 2 bases at start
+ for ( testParameter p : testList ) {
+ init();
+ //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
+ /* todo find a better way to test lowqual tail clipping on both sides
// Reverse Quals sequence
- //readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
+ readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
- //clip 1 base at end
- expected = readClipper.hardClipLowQualEnds((byte)'!');
- logger.warn(expected.getBaseQualities().toString()+","+expected.getBaseQualityString());
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3));
- Assert.assertEquals(expected.getCigarString(), "3M1H");
+ testList = new LinkedList();
+ testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end
+ testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end
- //clip 2 bases at end
- expected = readClipper.hardClipLowQualEnds((byte)44);
- Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes());
- Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2));
- Assert.assertEquals(expected.getCigarString(), "2M2H");
+ for ( testParameter p : testList ) {
+ init();
+ readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
+ //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
+ testIfEqual( readClipper.hardClipLowQualEnds( (byte)p.inputStart ),
+ BASES.substring(p.substringStart,p.substringStop).getBytes(),
+ QUALS.substring(p.substringStart,p.substringStop),
+ p.cigar );
+ }
+ */
+ }
- // revert Qual sequence
- readClipper.getRead().setBaseQualityString(QUALS);
+ public class CigarReadMaker {
+
+ }
+
+ @Test ( enabled = false )
+ public void testHardClipSoftClippedBases() {
+
+ // Generate a list of cigars to test
+ // We will use testParameter in the following way
+ // Right tail, left tail,
}
}
\ No newline at end of file
From b66556f4a0faf3036275741b07e0bace4df34943 Mon Sep 17 00:00:00 2001
From: Eric Banks