From 76dd816e709ef230231173b43076c4a423577a4a Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 12:47:27 +0200
Subject: [PATCH 001/380] Added getParents() -> returns an arrayList containing
the sample's parent(s) if available
---
.../broadinstitute/sting/gatk/samples/Sample.java | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
index b39fdd79d..a14d999ea 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java
@@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.samples;
import org.broadinstitute.sting.utils.exceptions.UserException;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
@@ -110,6 +111,17 @@ public class Sample implements Comparable { // implements java.io.Serial
return infoDB.getSample(paternalID);
}
+ public ArrayList getParents(){
+ ArrayList parents = new ArrayList(2);
+ Sample parent = getMother();
+ if(parent != null)
+ parents.add(parent);
+ parent = getFather();
+ if(parent != null)
+ parents.add(parent);
+ return parents;
+ }
+
/**
* Get gender of the sample
* @return property of key "gender" - must be of type Gender
From ef6a6fdfe499809fc3fd6282a460c2fd542963c5 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 12:49:18 +0200
Subject: [PATCH 002/380] Added getAsMap -> returns the likelihoods as an
EnumMap with Genotypes as keys and likelihoods as values.
---
.../utils/variantcontext/GenotypeLikelihoods.java | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index dba16cf86..292bc17cd 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -25,8 +25,12 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import java.util.EnumMap;
+import java.util.Map;
+
public class GenotypeLikelihoods {
public static final boolean CAP_PLS = false;
public static final int PL_CAP = 255;
@@ -94,6 +98,16 @@ public class GenotypeLikelihoods {
return likelihoodsAsString_PLs;
}
+ public EnumMap getAsMap(boolean normalizeFromLog10){
+ //Make sure that the log10likelihoods are set
+ double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
+ EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class);
+ likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
+ likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
+ likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
+ return likelihoodsMap;
+ }
+
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
From 1c61a573296f6df9a3b2af20255c0deff1ece565 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 13:06:44 +0200
Subject: [PATCH 003/380] Original rewrite of PhaseByTransmission: - Adapted to
get the trio information from the SampleDB (i.e. from Pedigree file (ped)) =>
Multiple trios can be passed as argument - Mendelian violations and trio
phasing possibilities are pre-calculated and stored in Maps. => Runtime is
~3x faster - Genotype combinations possible only given two MVs are now given
a squared MV prior (e.g. 0/0+0/0=>1/1 is given 10^-16 prior if the MV prior
is 10^-8) - Corrected bug: In case the best genotype combination is
Het/Het/Het, the genotypes are now set appropriately (before original
genotypes were left even if they weren't Het/Het/Het) - Basic reporting
added: -- mvf argument let the user specify a file to report remaining MVs --
When the walker ends, some basic stats about the genotype reconfiguration and
phasing are output
Known problems:
- GQ is not recalculated even if the genotype changes
Possible improvements:
- Phase partially typed trios
- Use standard Allele/Genotype Classes for the storage of the pre-calculated phase
---
.../walkers/phasing/PhaseByTransmission.java | 781 +++++++++++++-----
1 file changed, 569 insertions(+), 212 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 3eedc2a28..12b541526 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -7,18 +7,18 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
-import org.broadinstitute.sting.utils.text.XReadLines;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
-import java.io.File;
-import java.io.FileNotFoundException;
+import java.io.PrintStream;
import java.util.*;
/**
@@ -32,13 +32,13 @@ import java.util.*;
* the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to
* begin.
*/
-public class PhaseByTransmission extends RodWalker {
+public class PhaseByTransmission extends RodWalker, HashMap> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
- @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). Specify several trios by supplying this argument many times and/or a file containing many patterns.")
- public ArrayList familySpecs = null;
+ @Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.")
+ private PrintStream mvFile = null;
@Output
protected VCFWriter vcfWriter = null;
@@ -48,239 +48,523 @@ public class PhaseByTransmission extends RodWalker {
private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8;
- private class Trio {
- private String mother;
- private String father;
- private String child;
+ private ArrayList trios = new ArrayList();
- public Trio(String mother, String father, String child) {
- this.mother = mother;
- this.father = father;
- this.child = child;
- }
+ //Matrix of priors for all genotype combinations
+ private EnumMap>> mvCountMatrix;
- public Trio(String familySpec) {
- String[] pieces = familySpec.split("[\\+\\=]");
+ //Matrix of allele transmission
+ private EnumMap>> transmissionMatrix;
- this.mother = pieces[0];
- this.father = pieces[1];
- this.child = pieces[2];
- }
+ //Metrics counters hashkeys
+ private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
+ private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
+ private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
+ private final Byte NUM_HET = 3;
+ private final Byte NUM_HET_HET_HET = 4;
+ private final Byte NUM_VIOLATIONS = 5;
- public String getMother() { return mother; }
- public String getFather() { return father; }
- public String getChild() { return child; }
+ private enum AlleleType {
+ NO_CALL,
+ REF,
+ VAR,
+ UNPHASED_REF,
+ UNPHASED_VAR
}
- private ArrayList trios = new ArrayList();
+ //Stores a trio-genotype
+ private class TrioPhase {
- public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) {
- if (familySpecs != null) {
- // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
- // spec list set, and treat the entries as if they had been specified on the command line.
- ArrayList specs = new ArrayList();
- for (String familySpec : familySpecs) {
- File specFile = new File(familySpec);
+ private ArrayList trioAlleles = new ArrayList(6);
- try {
- XReadLines reader = new XReadLines(specFile);
+ private ArrayList getAlleles(Genotype.Type genotype){
+ ArrayList alleles = new ArrayList(2);
+ if(genotype == Genotype.Type.HOM_REF){
+ alleles.add(AlleleType.REF);
+ alleles.add(AlleleType.REF);
+ }
+ else if(genotype == Genotype.Type.HET){
+ alleles.add(AlleleType.REF);
+ alleles.add(AlleleType.VAR);
+ }
+ else if(genotype == Genotype.Type.HOM_VAR){
+ alleles.add(AlleleType.VAR);
+ alleles.add(AlleleType.VAR);
+ }
+ else{
+ alleles.add(AlleleType.NO_CALL);
+ alleles.add(AlleleType.NO_CALL);
+ }
+ return alleles;
+ }
- List lines = reader.readLines();
- for (String line : lines) {
- specs.add(new Trio(line));
- }
- } catch (FileNotFoundException e) {
- specs.add(new Trio(familySpec)); // not a file, so must be a family spec
- }
- }
+ private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){
+ if(genotype == Genotype.Type.HET){
+ ArrayList phasedAlleles = new ArrayList(2);
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ return phasedAlleles;
+ }
+ else
+ return getAlleles(genotype);
+ }
- return specs;
+ private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){
+ ArrayList phasedAlleles = new ArrayList(4);
+ //Special case for Het/Het as it is ambiguous
+ if(parent == Genotype.Type.HET && child == Genotype.Type.HET){
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
}
- return new ArrayList();
+ ArrayList parentAlleles = getAlleles(parent);
+ ArrayList childAlleles = getAlleles(child);
+
+ int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
+ if(childTransmittedAlleleIndex > -1){
+ phasedAlleles.add(parentAlleles.get(0));
+ phasedAlleles.add(parentAlleles.get(1));
+ phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ phasedAlleles.add(childAlleles.get(0));
+ }
+ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
+ phasedAlleles.add(parentAlleles.get(1));
+ phasedAlleles.add(parentAlleles.get(0));
+ phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ phasedAlleles.add(childAlleles.get(0));
+ }
+ else{
+ parentAlleles.addAll(childAlleles);
+ for(AlleleType allele : parentAlleles){
+ if(allele == AlleleType.REF){
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ }
+ else if(allele == AlleleType.VAR){
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ }
+ else{
+ phasedAlleles.add(AlleleType.NO_CALL);
+ }
+ }
+ }
+
+ return phasedAlleles;
+ }
+
+ private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+ ArrayList phasedAlleles = new ArrayList(6);
+
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
+
+ //Build all possible child genotypes for the given parent's genotypes
+ for (AlleleType momAllele : motherAlleles) {
+ for (AlleleType fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ possiblePhasedChildAlleles.add(momAllele);
+ possiblePhasedChildAlleles.add(fatherAllele);
+ possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
+ }
+ }
+
+ for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1));
+ if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
+ //Add mother's alleles
+ phasedAlleles.add(phasedChildGenotype.get(0));
+ if(motherAlleles.get(0) != phasedAlleles.get(0))
+ phasedAlleles.add(motherAlleles.get(0));
+ else
+ phasedAlleles.add(motherAlleles.get(1));
+
+ //Add father's alleles
+ phasedAlleles.add(phasedChildGenotype.get(1));
+ if(fatherAlleles.get(0) != phasedAlleles.get(2))
+ phasedAlleles.add(fatherAlleles.get(0));
+ else
+ phasedAlleles.add(fatherAlleles.get(1));
+
+ //Add child's alleles
+ phasedAlleles.addAll(phasedChildGenotype);
+ return phasedAlleles;
+ }
+ }
+
+ //If this is reached then no phasing could be found
+ motherAlleles.addAll(fatherAlleles);
+ motherAlleles.addAll(childAlleles);
+ for(AlleleType allele : motherAlleles){
+ if(allele == AlleleType.REF){
+ phasedAlleles.add(AlleleType.UNPHASED_REF);
+ }
+ else if(allele == AlleleType.VAR){
+ phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ }
+ else{
+ phasedAlleles.add(AlleleType.NO_CALL);
+ }
+ }
+ return phasedAlleles;
+ }
+
+
+ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ //Take care of cases where one or more family members are no call
+ if(child == Genotype.Type.NO_CALL){
+ trioAlleles.addAll(phaseSingleIndividualAlleles(mother));
+ trioAlleles.addAll(phaseSingleIndividualAlleles(father));
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.add(AlleleType.NO_CALL);
+ }
+ else if(mother == Genotype.Type.NO_CALL){
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.add(AlleleType.NO_CALL);
+ if(father == Genotype.Type.NO_CALL){
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.add(AlleleType.NO_CALL);
+ trioAlleles.addAll(phaseSingleIndividualAlleles(child));
+ }
+ else
+ trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child));
+ }
+ else if(father == Genotype.Type.NO_CALL){
+ trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child));
+ trioAlleles.add(2, AlleleType.NO_CALL);
+ trioAlleles.add(3, AlleleType.NO_CALL);
+ }
+ //Special case for Het/Het/Het as it is ambiguous
+ else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
+ trioAlleles.add(AlleleType.UNPHASED_REF);
+ trioAlleles.add(AlleleType.UNPHASED_VAR);
+ trioAlleles.add(AlleleType.UNPHASED_REF);
+ trioAlleles.add(AlleleType.UNPHASED_VAR);
+ trioAlleles.add(AlleleType.UNPHASED_REF);
+ trioAlleles.add(AlleleType.UNPHASED_VAR);
+ }
+ //All family members have genotypes and at least one of them is not Het
+ else{
+ trioAlleles = phaseFamilyAlleles(mother, father, child);
+ }
+ }
+
+ public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6)));
+ return phasedGenotypes;
+ }
+
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){
+
+ //Add the transmission probability
+ Map genotypeAttributes = new HashMap();
+ genotypeAttributes.putAll(genotype.getAttributes());
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
+
+ boolean isPhased = true;
+
+ List alleles = new ArrayList(2);
+
+ //If unphased, return original genotype
+ for(AlleleType allele : phasedAlleles){
+ if(allele == AlleleType.NO_CALL){
+ genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
+ return genotype;
+ }
+ //Otherwise add the appropriate allele
+ else if(allele == AlleleType.UNPHASED_REF){
+ isPhased = false;
+ alleles.add(refAllele);
+ }
+ else if(allele == AlleleType.UNPHASED_VAR){
+ isPhased = false;
+ alleles.add(altAllele);
+ }
+ else if(allele == AlleleType.REF){
+ alleles.add(refAllele);
+ }
+ else if(allele == AlleleType.VAR){
+ alleles.add(altAllele);
+ }
+ }
+ //TODO: Recalculate GQ field for the new genotype
+ //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment
+ //double[] likelihoods = genotype.getLikelihoods().getAsVector();
+
+ //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]);
+ genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
+ return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased);
+ }
+
+
}
/**
* Parse the familial relationship specification, and initialize VCF writer
*/
public void initialize() {
- trios = getFamilySpecsFromCommandLineInput(familySpecs);
-
ArrayList rodNames = new ArrayList();
rodNames.add(variantCollection.variants.getName());
-
Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+ //Get the trios from the families passed as ped
+ setTrios();
+ if(trios.size()<1)
+ throw new UserException.BadInput("No PED file passed or no trios found in PED file. Aborted.");
+
+
Set headerLines = new HashSet();
headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit()));
- headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct"));
+ headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct"));
headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
+
+ buildMatrices();
+
+ if(mvFile != null)
+ mvFile.println("#CHROM\tPOS\tFILTER\tAC\tFAMILY\tTP\tMOTHER_GT\tMOTHER_DP\tMOTHER_RAD\tMOTHER_AAD\tMOTHER_HRPL\tMOTHER_HETPL\tMOTHER_HAPL\tFATHER_GT\tFATHER_DP\tFATHER_RAD\tFATHER_AAD\tFATHER_HRPL\tFATHER_HETPL\tFATHER_HAPL\tCHILD_GT\tCHILD_DP\tCHILD_RAD\tCHILD_AAD\tCHILD_HRPL\tCHILD_HETPL\tCHILD_HAPL");
+
}
- private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) {
- double[] momLikelihoods = MathUtils.normalizeFromLog10(mom.getLikelihoods().getAsVector());
- double[] dadLikelihoods = MathUtils.normalizeFromLog10(dad.getLikelihoods().getAsVector());
- double[] childLikelihoods = MathUtils.normalizeFromLog10(child.getLikelihoods().getAsVector());
+ /**
+ * Select Trios only
+ */
+ private void setTrios(){
- int momIndex = mom.getType().ordinal() - 1;
- int dadIndex = dad.getType().ordinal() - 1;
- int childIndex = child.getType().ordinal() - 1;
-
- return momLikelihoods[momIndex]*dadLikelihoods[dadIndex]*childLikelihoods[childIndex];
- }
-
- private ArrayList createAllThreeGenotypes(Allele refAllele, Allele altAllele, Genotype g) {
- List homRefAlleles = new ArrayList();
- homRefAlleles.add(refAllele);
- homRefAlleles.add(refAllele);
- Genotype homRef = new Genotype(g.getSampleName(), homRefAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- List hetAlleles = new ArrayList();
- hetAlleles.add(refAllele);
- hetAlleles.add(altAllele);
- Genotype het = new Genotype(g.getSampleName(), hetAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- List homVarAlleles = new ArrayList();
- homVarAlleles.add(altAllele);
- homVarAlleles.add(altAllele);
- Genotype homVar = new Genotype(g.getSampleName(), homVarAlleles, g.getNegLog10PError(), null, g.getAttributes(), false);
-
- ArrayList genotypes = new ArrayList();
- genotypes.add(homRef);
- genotypes.add(het);
- genotypes.add(homVar);
-
- return genotypes;
- }
-
- private int getNumberOfMatchingAlleles(Allele alleleToMatch, Genotype g) {
- List alleles = g.getAlleles();
- int matchingAlleles = 0;
-
- for (Allele a : alleles) {
- if (!alleleToMatch.equals(a)) {
- matchingAlleles++;
+ Map> families = this.getSampleDB().getFamilies();
+ Set family;
+ ArrayList parents;
+ for(String familyID : families.keySet()){
+ family = families.get(familyID);
+ if(family.size()!=3){
+ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size()));
}
- }
-
- return matchingAlleles;
- }
-
- private boolean isMendelianViolation(Allele refAllele, Allele altAllele, Genotype mom, Genotype dad, Genotype child) {
- int numMomRefAlleles = getNumberOfMatchingAlleles(refAllele, mom) > 0 ? 1 : 0;
- int numMomAltAlleles = getNumberOfMatchingAlleles(altAllele, mom) > 0 ? 1 : 0;
-
- int numDadRefAlleles = getNumberOfMatchingAlleles(refAllele, dad) > 0 ? 1 : 0;
- int numDadAltAlleles = getNumberOfMatchingAlleles(altAllele, dad) > 0 ? 1 : 0;
-
- int numChildRefAlleles = getNumberOfMatchingAlleles(refAllele, child);
- int numChildAltAlleles = getNumberOfMatchingAlleles(altAllele, child);
-
- return (numMomRefAlleles + numDadRefAlleles < numChildRefAlleles || numMomAltAlleles + numDadAltAlleles < numChildAltAlleles);
- }
-
- private ArrayList getPhasedGenotypes(Genotype mom, Genotype dad, Genotype child) {
- Set possiblePhasedChildGenotypes = new HashSet();
-
- for (Allele momAllele : mom.getAlleles()) {
- for (Allele dadAllele : dad.getAlleles()) {
- ArrayList possiblePhasedChildAlleles = new ArrayList();
- possiblePhasedChildAlleles.add(momAllele);
- possiblePhasedChildAlleles.add(dadAllele);
-
- Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), child.getFilters(), child.getAttributes(), true);
-
- possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype);
- }
- }
-
- ArrayList finalGenotypes = new ArrayList();
-
- for (Genotype phasedChildGenotype : possiblePhasedChildGenotypes) {
- if (child.sameGenotype(phasedChildGenotype, true)) {
- Allele momTransmittedAllele = phasedChildGenotype.getAllele(0);
- Allele momUntransmittedAllele = mom.getAllele(0) != momTransmittedAllele ? mom.getAllele(0) : mom.getAllele(1);
-
- ArrayList phasedMomAlleles = new ArrayList();
- phasedMomAlleles.add(momTransmittedAllele);
- phasedMomAlleles.add(momUntransmittedAllele);
-
- Genotype phasedMomGenotype = new Genotype(mom.getSampleName(), phasedMomAlleles, mom.getNegLog10PError(), mom.getFilters(), mom.getAttributes(), true);
-
- Allele dadTransmittedAllele = phasedChildGenotype.getAllele(1);
- Allele dadUntransmittedAllele = dad.getAllele(0) != dadTransmittedAllele ? dad.getAllele(0) : dad.getAllele(1);
-
- ArrayList phasedDadAlleles = new ArrayList();
- phasedDadAlleles.add(dadTransmittedAllele);
- phasedDadAlleles.add(dadUntransmittedAllele);
-
- Genotype phasedDadGenotype = new Genotype(dad.getSampleName(), phasedDadAlleles, dad.getNegLog10PError(), dad.getFilters(), dad.getAttributes(), true);
-
- finalGenotypes.add(phasedMomGenotype);
- finalGenotypes.add(phasedDadGenotype);
- finalGenotypes.add(phasedChildGenotype);
-
- return finalGenotypes;
- }
- }
-
- finalGenotypes.add(mom);
- finalGenotypes.add(dad);
- finalGenotypes.add(child);
-
- return finalGenotypes;
- }
-
- private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) {
- ArrayList finalGenotypes = new ArrayList();
- finalGenotypes.add(mother);
- finalGenotypes.add(father);
- finalGenotypes.add(child);
-
- if (mother.isCalled() && father.isCalled() && child.isCalled()) {
- ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother);
- ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father);
- ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child);
-
- double bestConfigurationLikelihood = 0.0;
- double bestPrior = 0.0;
- Genotype bestMotherGenotype = mother;
- Genotype bestFatherGenotype = father;
- Genotype bestChildGenotype = child;
-
- double norm = 0.0;
-
- for (Genotype motherGenotype : possibleMotherGenotypes) {
- for (Genotype fatherGenotype : possibleFatherGenotypes) {
- for (Genotype childGenotype : possibleChildGenotypes) {
- double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
- double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype);
- norm += prior*configurationLikelihood;
-
- if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) {
- bestConfigurationLikelihood = configurationLikelihood;
- bestPrior = prior;
- bestMotherGenotype = motherGenotype;
- bestFatherGenotype = fatherGenotype;
- bestChildGenotype = childGenotype;
- }
- }
+ else{
+ for(Sample familyMember : family){
+ parents = familyMember.getParents();
+ if(parents.size()>0){
+ if(family.containsAll(parents))
+ this.trios.add(familyMember);
+ else
+ logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID));
+ break;
+ }
}
}
- if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) {
- Map attributes = new HashMap();
- attributes.putAll(bestChildGenotype.getAttributes());
- attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm);
- bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes);
+ }
- finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype);
+
+
+ }
+
+ private void buildMatrices(){
+ mvCountMatrix = new EnumMap>>(Genotype.Type.class);
+ transmissionMatrix = new EnumMap>>(Genotype.Type.class);
+ for(Genotype.Type mother : Genotype.Type.values()){
+ mvCountMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ transmissionMatrix.put(mother,new EnumMap>(Genotype.Type.class));
+ for(Genotype.Type father : Genotype.Type.values()){
+ mvCountMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ transmissionMatrix.get(mother).put(father,new EnumMap(Genotype.Type.class));
+ for(Genotype.Type child : Genotype.Type.values()){
+ mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
+ transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
+ }
+ }
+ }
+ }
+
+ private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ //Child is no call => No MV
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
+ return 0;
+ //Add parents with genotypes for the evaluation
+ ArrayList parents = new ArrayList();
+ if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
+ parents.add(mother);
+ if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
+ parents.add(father);
+
+ //Both parents no calls => No MV
+ if (parents.isEmpty())
+ return 0;
+
+ //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
+ int parentsNumRefAlleles = 0;
+ int parentsNumAltAlleles = 0;
+
+ for(Genotype.Type parent : parents){
+ if(parent == Genotype.Type.HOM_REF){
+ parentsNumRefAlleles++;
+ }
+ else if(parent == Genotype.Type.HET){
+ parentsNumRefAlleles++;
+ parentsNumAltAlleles++;
+ }
+ else if(parent == Genotype.Type.HOM_VAR){
+ parentsNumAltAlleles++;
}
}
- return finalGenotypes;
+ //Case Child is HomRef
+ if(child == Genotype.Type.HOM_REF){
+ if(parentsNumRefAlleles == parents.size())
+ return 0;
+ else return (parents.size()-parentsNumRefAlleles);
+ }
+
+ //Case child is HomVar
+ if(child == Genotype.Type.HOM_VAR){
+ if(parentsNumAltAlleles == parents.size())
+ return 0;
+ else return parents.size()-parentsNumAltAlleles;
+ }
+
+ //Case child is Het
+ if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
+ return 0;
+
+ //MV
+ return 1;
+ }
+
+ private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+
+ double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
+
+ //Child is no call => No MV
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
+ return nonMVPrior;
+ //Add parents with genotypes for the evaluation
+ ArrayList parents = new ArrayList();
+ if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
+ parents.add(mother);
+ if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
+ parents.add(father);
+
+ //Both parents no calls => No MV
+ if (parents.isEmpty())
+ return nonMVPrior;
+
+ //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
+ int parentsNumRefAlleles = 0;
+ int parentsNumAltAlleles = 0;
+
+ for(Genotype.Type parent : parents){
+ if(parent == Genotype.Type.HOM_REF){
+ parentsNumRefAlleles++;
+ }
+ else if(parent == Genotype.Type.HET){
+ parentsNumRefAlleles++;
+ parentsNumAltAlleles++;
+ }
+ else if(parent == Genotype.Type.HOM_VAR){
+ parentsNumAltAlleles++;
+ }
+ }
+
+ //Case Child is HomRef
+ if(child == Genotype.Type.HOM_REF){
+ if(parentsNumRefAlleles == parents.size())
+ return nonMVPrior;
+ else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles);
+ }
+
+ //Case child is HomVar
+ if(child == Genotype.Type.HOM_VAR){
+ if(parentsNumAltAlleles == parents.size())
+ return nonMVPrior;
+ else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles);
+ }
+
+ //Case child is Het
+ if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
+ return nonMVPrior;
+
+ //MV
+ return MENDELIAN_VIOLATION_PRIOR;
+ }
+
+ private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
+ int count = 0;
+ if(motherOriginal!=motherNew)
+ count++;
+ if(fatherOriginal!=fatherNew)
+ count++;
+ if(childOriginal!=childNew)
+ count++;
+ return count;
+ }
+
+
+ private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
+
+ //For now, only consider trios with complete information
+ //TODO: Phasing of trios with missing information
+ if(!(mother.isCalled() && father.isCalled() && child.isCalled())) {
+ finalGenotypes.add(mother);
+ finalGenotypes.add(father);
+ finalGenotypes.add(child);
+ return false;
+ }
+
+ //Get the PL
+ Map motherLikelihoods = mother.getLikelihoods().getAsMap(true);
+ Map fatherLikelihoods = father.getLikelihoods().getAsMap(true);
+ Map childLikelihoods = child.getLikelihoods().getAsMap(true);
+
+ //Prior vars
+ double bestConfigurationLikelihood = 0.0;
+ double norm = 0.0;
+ boolean isMV = false;
+ int bestConfigurationGenotypeDiffs=4;
+ Genotype.Type bestMotherGenotype = mother.getType();
+ Genotype.Type bestFatherGenotype = father.getType();
+ Genotype.Type bestChildGenotype = child.getType();
+
+ //Get the most likely combination
+ int mvCount;
+ double configurationLikelihood;
+ int configurationGenotypeDiffs;
+ for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
+ for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
+ configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
+ //Keep this combination if
+ //It has a better likelihood
+ //Or it has the same likelihood but requires less changes from original genotypes
+ if ((configurationLikelihood > bestConfigurationLikelihood) ||
+ (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMotherGenotype = motherGenotype.getKey();
+ bestFatherGenotype = fatherGenotype.getKey();
+ bestChildGenotype = childGenotype.getKey();
+ isMV = mvCount>0;
+ bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ }
+ }
+ }
+ }
+
+ //Get the phased alleles for the genotype configuration
+ TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype);
+
+ //Return the phased genotypes
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes);
+ return isMV;
+
}
/**
@@ -292,18 +576,34 @@ public class PhaseByTransmission extends RodWalker {
* @return null
*/
@Override
- public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+
+ //Local cars to avoid lookups on increment
+ int numTrioGenotypesCalled = 0;
+ int numTrioGenotypesNoCall = 0;
+ int numTrioGenotypesPhased = 0;
+ int numHet = 0 ;
+ int numHetHetHet = 0;
+ int numMVs = 0;
+
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
Map genotypeMap = vc.getGenotypes();
- for (Trio trio : trios) {
- Genotype mother = vc.getGenotype(trio.getMother());
- Genotype father = vc.getGenotype(trio.getFather());
- Genotype child = vc.getGenotype(trio.getChild());
+ boolean isMV;
- ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child);
+ for (Sample sample : trios) {
+ Genotype mother = vc.getGenotype(sample.getMaternalID());
+ Genotype father = vc.getGenotype(sample.getPaternalID());
+ Genotype child = vc.getGenotype(sample.getID());
+
+ //Skip trios where any of the genotype is missing in the variant context
+ if(mother == null || father == null | child == null)
+ continue;
+
+ ArrayList trioGenotypes = new ArrayList(3);
+ isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
Genotype phasedMother = trioGenotypes.get(0);
Genotype phasedFather = trioGenotypes.get(1);
@@ -312,14 +612,47 @@ public class PhaseByTransmission extends RodWalker {
genotypeMap.put(phasedMother.getSampleName(), phasedMother);
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
genotypeMap.put(phasedChild.getSampleName(), phasedChild);
+
+ //Increment metrics counters
+ if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){
+ numTrioGenotypesCalled++;
+ if(phasedMother.isPhased())
+ numTrioGenotypesPhased++;
+
+ if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){
+ numHet++;
+ if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){
+ numHetHetHet++;
+ }else if(!phasedMother.isPhased()){
+ int x =9;
+ }
+ }
+
+ if(isMV){
+ numMVs++;
+ if(mvFile != null)
+ mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
+ }
+ }else{
+ numTrioGenotypesNoCall++;
+ }
+
}
+
VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap);
vcfWriter.add(newvc);
}
- return null;
+ HashMap metricsCounters = new HashMap(5);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased);
+ metricsCounters.put(NUM_HET,numHet);
+ metricsCounters.put(NUM_HET_HET_HET,numHetHetHet);
+ metricsCounters.put(NUM_VIOLATIONS,numMVs);
+ return metricsCounters;
}
/**
@@ -328,8 +661,15 @@ public class PhaseByTransmission extends RodWalker {
* @return Initial value of reduce.
*/
@Override
- public Integer reduceInit() {
- return null;
+ public HashMap reduceInit() {
+ HashMap metricsCounters = new HashMap(5);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_HET,0);
+ metricsCounters.put(NUM_HET_HET_HET,0);
+ metricsCounters.put(NUM_VIOLATIONS,0);
+ return metricsCounters;
}
/**
@@ -340,7 +680,24 @@ public class PhaseByTransmission extends RodWalker {
* @return accumulator with result of the map taken into account.
*/
@Override
- public Integer reduce(Integer value, Integer sum) {
- return null;
+ public HashMap reduce(HashMap value, HashMap sum) {
+ sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED));
+ sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL));
+ sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED));
+ sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET));
+ sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET));
+ sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS));
+ return sum;
+ }
+
+
+ @Override
+ public void onTraversalDone(HashMap result) {
+ logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
+ logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
+ logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
+ logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET));
+ logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET));
+ logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS));
}
}
From edea90786a4555ff665b07a3dc2f97ce7cbae24c Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 20 Oct 2011 17:04:19 +0200
Subject: [PATCH 004/380] Genotype quality is now recalculated for each of the
phased Genotypes. Small problem is that we unnecessarily loose a little
precision on the genotypes that do not change after assignment.
---
.../walkers/phasing/PhaseByTransmission.java | 9 ++----
.../variantcontext/GenotypeLikelihoods.java | 29 +++++++++++++++++++
2 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 12b541526..52629277f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -266,6 +266,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
+ genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
boolean isPhased = true;
@@ -274,7 +275,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
//If unphased, return original genotype
for(AlleleType allele : phasedAlleles){
if(allele == AlleleType.NO_CALL){
- genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
return genotype;
}
//Otherwise add the appropriate allele
@@ -293,13 +293,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
alleles.add(altAllele);
}
}
- //TODO: Recalculate GQ field for the new genotype
- //Remove the GQ attribute as it needs to be recalculated for the new genotype assignment
- //double[] likelihoods = genotype.getLikelihoods().getAsVector();
- //genotypeAttributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,likelihoods[1]);
- genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
- return new Genotype(genotype.getSampleName(), alleles, genotype.getNegLog10PError(), null, genotype.getAttributes(), isPhased);
+ return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased);
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index 292bc17cd..9ed9b41cc 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -25,8 +25,10 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broad.tribble.TribbleException;
+import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.jgrapht.util.MathUtil;
import java.util.EnumMap;
import java.util.Map;
@@ -98,6 +100,7 @@ public class GenotypeLikelihoods {
return likelihoodsAsString_PLs;
}
+ //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
public EnumMap getAsMap(boolean normalizeFromLog10){
//Make sure that the log10likelihoods are set
double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
@@ -108,6 +111,32 @@ public class GenotypeLikelihoods {
return likelihoodsMap;
}
+ //Return the log10 Genotype Quality (GQ) for the given genotype
+ public double getLog10GQ(Genotype.Type genotype){
+
+ double qual = Double.NEGATIVE_INFINITY;
+ EnumMap likelihoods = getAsMap(false);
+ for(Map.Entry likelihood : likelihoods.entrySet()){
+ if(likelihood.getKey() == genotype)
+ continue;
+ if(likelihood.getValue() > qual)
+ qual = likelihood.getValue();
+
+ }
+
+ qual = likelihoods.get(genotype) - qual;
+
+ if (qual < 0) {
+ // QUAL can be negative if the chosen genotype is not the most likely one individually.
+ // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ }
+
+ return qual;
+ }
+
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
From 01b16abc8dcc233e29258eaa4e1d1c0415180e91 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 10:24:41 +0200
Subject: [PATCH 005/380] Genotype quality calculation modified to handle all
genotypes the same way. This is inconsistent with GQ output by the UG but is
correct even for cases of poor quality genotypes.
---
.../variantcontext/GenotypeLikelihoods.java | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index 9ed9b41cc..b83ffa2fd 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -111,8 +111,8 @@ public class GenotypeLikelihoods {
return likelihoodsMap;
}
- //Return the log10 Genotype Quality (GQ) for the given genotype
- public double getLog10GQ(Genotype.Type genotype){
+ //Return the neg log10 Genotype Quality (GQ) for the given genotype
+ public double getNegLog10GQ(Genotype.Type genotype){
double qual = Double.NEGATIVE_INFINITY;
EnumMap likelihoods = getAsMap(false);
@@ -124,15 +124,9 @@ public class GenotypeLikelihoods {
}
- qual = likelihoods.get(genotype) - qual;
-
- if (qual < 0) {
- // QUAL can be negative if the chosen genotype is not the most likely one individually.
- // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
- double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
- double chosenGenotype = normalized[genotype.ordinal()-1];
- qual = -1.0 * Math.log10(1.0 - chosenGenotype);
- }
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
return qual;
}
From 7312e35c71167dee3510bf428204b3a4b75ab52f Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 10:25:53 +0200
Subject: [PATCH 006/380] Now makes use of standard Allele and Genotype
classes. This allowed quite some code cleaning.
---
.../walkers/phasing/PhaseByTransmission.java | 341 +++++++-----------
1 file changed, 135 insertions(+), 206 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 52629277f..62b7bfffa 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -56,7 +56,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
//Matrix of allele transmission
private EnumMap>> transmissionMatrix;
- //Metrics counters hashkeys
+ //Metrics counters hash keys
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
@@ -64,237 +64,223 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_HET_HET_HET = 4;
private final Byte NUM_VIOLATIONS = 5;
- private enum AlleleType {
- NO_CALL,
- REF,
- VAR,
- UNPHASED_REF,
- UNPHASED_VAR
+ private enum FamilyMember {
+ MOTHER,
+ FATHER,
+ CHILD
}
//Stores a trio-genotype
private class TrioPhase {
- private ArrayList trioAlleles = new ArrayList(6);
+ //Create 2 fake alleles
+ //The actual bases will never be used but the Genotypes created using the alleles will be.
+ private final Allele REF = Allele.create("A",true);
+ private final Allele VAR = Allele.create("A",false);
+ private final Allele NO_CALL = Allele.create(".",false);
+ private final String DUMMY_NAME = "DummySample";
- private ArrayList getAlleles(Genotype.Type genotype){
- ArrayList alleles = new ArrayList(2);
+ private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class);
+
+ private ArrayList getAlleles(Genotype.Type genotype){
+ ArrayList alleles = new ArrayList(2);
if(genotype == Genotype.Type.HOM_REF){
- alleles.add(AlleleType.REF);
- alleles.add(AlleleType.REF);
+ alleles.add(REF);
+ alleles.add(REF);
}
else if(genotype == Genotype.Type.HET){
- alleles.add(AlleleType.REF);
- alleles.add(AlleleType.VAR);
+ alleles.add(REF);
+ alleles.add(VAR);
}
else if(genotype == Genotype.Type.HOM_VAR){
- alleles.add(AlleleType.VAR);
- alleles.add(AlleleType.VAR);
+ alleles.add(VAR);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.NO_CALL){
+ alleles.add(NO_CALL);
+ alleles.add(NO_CALL);
}
else{
- alleles.add(AlleleType.NO_CALL);
- alleles.add(AlleleType.NO_CALL);
+ return null;
}
return alleles;
}
- private ArrayList phaseSingleIndividualAlleles(Genotype.Type genotype){
- if(genotype == Genotype.Type.HET){
- ArrayList phasedAlleles = new ArrayList(2);
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- return phasedAlleles;
+ //Create a new Genotype based on information from a single individual
+ //Homozygous genotypes will be set as phased, heterozygous won't be
+ private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
+ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
}
else
- return getAlleles(genotype);
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- private ArrayList phaseMonoParentFamilyAlleles(Genotype.Type parent, Genotype.Type child){
- ArrayList phasedAlleles = new ArrayList(4);
+ private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
+
//Special case for Het/Het as it is ambiguous
- if(parent == Genotype.Type.HET && child == Genotype.Type.HET){
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
+ if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- ArrayList parentAlleles = getAlleles(parent);
- ArrayList childAlleles = getAlleles(child);
+ ArrayList parentAlleles = getAlleles(parentGenotype);
+ ArrayList childAlleles = getAlleles(childGenotype);
+ ArrayList parentPhasedAlleles = new ArrayList(2);
+ ArrayList childPhasedAlleles = new ArrayList(2);
+ //If there is a possible phasing between the mother and child => phase
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
if(childTransmittedAlleleIndex > -1){
- phasedAlleles.add(parentAlleles.get(0));
- phasedAlleles.add(parentAlleles.get(1));
- phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- phasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
}
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
- phasedAlleles.add(parentAlleles.get(1));
- phasedAlleles.add(parentAlleles.get(0));
- phasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- phasedAlleles.add(childAlleles.get(0));
+ parentPhasedAlleles.add(parentAlleles.get(1));
+ parentPhasedAlleles.add(parentAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
}
+ //This is a Mendelian Violation => Do not phase
else{
- parentAlleles.addAll(childAlleles);
- for(AlleleType allele : parentAlleles){
- if(allele == AlleleType.REF){
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- }
- else if(allele == AlleleType.VAR){
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- }
- else{
- phasedAlleles.add(AlleleType.NO_CALL);
- }
- }
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
-
- return phasedAlleles;
}
- private ArrayList phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- ArrayList phasedAlleles = new ArrayList(6);
+ private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- Set> possiblePhasedChildGenotypes = new HashSet>();
- ArrayList motherAlleles = getAlleles(mother);
- ArrayList fatherAlleles = getAlleles(father);
- ArrayList childAlleles = getAlleles(child);
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
//Build all possible child genotypes for the given parent's genotypes
- for (AlleleType momAllele : motherAlleles) {
- for (AlleleType fatherAllele : fatherAlleles) {
- ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ for (Allele momAllele : motherAlleles) {
+ for (Allele fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
possiblePhasedChildAlleles.add(momAllele);
possiblePhasedChildAlleles.add(fatherAllele);
possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
}
}
- for (ArrayList phasedChildGenotype : possiblePhasedChildGenotypes) {
- int firstAlleleIndex = phasedChildGenotype.indexOf(childAlleles.get(0));
- int secondAlleleIndex = phasedChildGenotype.lastIndexOf(childAlleles.get(1));
+ for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
+ //If a possible combination has been found, create the genotypes
if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
- //Add mother's alleles
- phasedAlleles.add(phasedChildGenotype.get(0));
- if(motherAlleles.get(0) != phasedAlleles.get(0))
- phasedAlleles.add(motherAlleles.get(0));
+ //Create mother's genotype
+ ArrayList motherPhasedAlleles = new ArrayList(2);
+ motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
+ if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
+ motherPhasedAlleles.add(motherAlleles.get(0));
else
- phasedAlleles.add(motherAlleles.get(1));
+ motherPhasedAlleles.add(motherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
- //Add father's alleles
- phasedAlleles.add(phasedChildGenotype.get(1));
- if(fatherAlleles.get(0) != phasedAlleles.get(2))
- phasedAlleles.add(fatherAlleles.get(0));
+ //Create father's genotype
+ ArrayList fatherPhasedAlleles = new ArrayList(2);
+ fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
+ if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
+ fatherPhasedAlleles.add(fatherAlleles.get(0));
else
- phasedAlleles.add(fatherAlleles.get(1));
+ fatherPhasedAlleles.add(fatherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
- //Add child's alleles
- phasedAlleles.addAll(phasedChildGenotype);
- return phasedAlleles;
+ //Create child's genotype
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Once a phased combination is found; exit
+ return;
}
}
//If this is reached then no phasing could be found
- motherAlleles.addAll(fatherAlleles);
- motherAlleles.addAll(childAlleles);
- for(AlleleType allele : motherAlleles){
- if(allele == AlleleType.REF){
- phasedAlleles.add(AlleleType.UNPHASED_REF);
- }
- else if(allele == AlleleType.VAR){
- phasedAlleles.add(AlleleType.UNPHASED_VAR);
- }
- else{
- phasedAlleles.add(AlleleType.NO_CALL);
- }
- }
- return phasedAlleles;
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
//Take care of cases where one or more family members are no call
- if(child == Genotype.Type.NO_CALL){
- trioAlleles.addAll(phaseSingleIndividualAlleles(mother));
- trioAlleles.addAll(phaseSingleIndividualAlleles(father));
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.add(AlleleType.NO_CALL);
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
- else if(mother == Genotype.Type.NO_CALL){
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.add(AlleleType.NO_CALL);
- if(father == Genotype.Type.NO_CALL){
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.add(AlleleType.NO_CALL);
- trioAlleles.addAll(phaseSingleIndividualAlleles(child));
+ else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
else
- trioAlleles.addAll(phaseMonoParentFamilyAlleles(father, child));
+ phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER);
}
- else if(father == Genotype.Type.NO_CALL){
- trioAlleles.addAll(phaseMonoParentFamilyAlleles(mother, child));
- trioAlleles.add(2, AlleleType.NO_CALL);
- trioAlleles.add(3, AlleleType.NO_CALL);
+ else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
}
//Special case for Het/Het/Het as it is ambiguous
else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
- trioAlleles.add(AlleleType.UNPHASED_REF);
- trioAlleles.add(AlleleType.UNPHASED_VAR);
- trioAlleles.add(AlleleType.UNPHASED_REF);
- trioAlleles.add(AlleleType.UNPHASED_VAR);
- trioAlleles.add(AlleleType.UNPHASED_REF);
- trioAlleles.add(AlleleType.UNPHASED_VAR);
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
//All family members have genotypes and at least one of them is not Het
else{
- trioAlleles = phaseFamilyAlleles(mother, father, child);
+ phaseFamilyAlleles(mother, father, child);
}
}
public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){
- phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,trioAlleles.subList(0,2)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,trioAlleles.subList(2,4)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,trioAlleles.subList(4,6)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
return phasedGenotypes;
}
- private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, List phasedAlleles){
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){
//Add the transmission probability
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
- genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);
- boolean isPhased = true;
+ //Handle missing genotype
+ if(!phasedGenotype.isAvailable())
+ return new Genotype(genotype.getSampleName(), null);
+ //Handle NoCall genotype
+ else if(phasedGenotype.isNoCall())
+ return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles());
- List alleles = new ArrayList(2);
+ ArrayList phasedAlleles = new ArrayList(2);
+ for(Allele allele : phasedGenotype.getAlleles()){
+ if(allele.isReference())
+ phasedAlleles.add(refAllele);
+ else if(allele.isNonReference())
+ phasedAlleles.add(altAllele);
+ //At this point there should not be any other alleles left
+ else
+ throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
- //If unphased, return original genotype
- for(AlleleType allele : phasedAlleles){
- if(allele == AlleleType.NO_CALL){
- return genotype;
- }
- //Otherwise add the appropriate allele
- else if(allele == AlleleType.UNPHASED_REF){
- isPhased = false;
- alleles.add(refAllele);
- }
- else if(allele == AlleleType.UNPHASED_VAR){
- isPhased = false;
- alleles.add(altAllele);
- }
- else if(allele == AlleleType.REF){
- alleles.add(refAllele);
- }
- else if(allele == AlleleType.VAR){
- alleles.add(altAllele);
- }
- }
+ }
- return new Genotype(genotype.getSampleName(), alleles, genotype.getLikelihoods().getLog10GQ(genotype.getType()), null, genotype.getAttributes(), isPhased);
+ //Compute the new Log10Error if the genotype is different from the original genotype
+ double negLog10Error;
+ if(genotype.getType() == phasedGenotype.getType())
+ negLog10Error = genotype.getNegLog10PError();
+ else
+ negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
+
+ return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
}
@@ -432,63 +418,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
return 1;
}
- private Double getCombinationPrior(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
-
- double nonMVPrior = 1.0 - 12*MENDELIAN_VIOLATION_PRIOR;
-
- //Child is no call => No MV
- if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
- return nonMVPrior;
- //Add parents with genotypes for the evaluation
- ArrayList parents = new ArrayList();
- if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
- parents.add(mother);
- if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
- parents.add(father);
-
- //Both parents no calls => No MV
- if (parents.isEmpty())
- return nonMVPrior;
-
- //If at least one parent had a genotype, then count the number of ref and alt alleles that can be passed
- int parentsNumRefAlleles = 0;
- int parentsNumAltAlleles = 0;
-
- for(Genotype.Type parent : parents){
- if(parent == Genotype.Type.HOM_REF){
- parentsNumRefAlleles++;
- }
- else if(parent == Genotype.Type.HET){
- parentsNumRefAlleles++;
- parentsNumAltAlleles++;
- }
- else if(parent == Genotype.Type.HOM_VAR){
- parentsNumAltAlleles++;
- }
- }
-
- //Case Child is HomRef
- if(child == Genotype.Type.HOM_REF){
- if(parentsNumRefAlleles == parents.size())
- return nonMVPrior;
- else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumRefAlleles);
- }
-
- //Case child is HomVar
- if(child == Genotype.Type.HOM_VAR){
- if(parentsNumAltAlleles == parents.size())
- return nonMVPrior;
- else return Math.pow(MENDELIAN_VIOLATION_PRIOR, parents.size()-parentsNumAltAlleles);
- }
-
- //Case child is Het
- if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
- return nonMVPrior;
-
- //MV
- return MENDELIAN_VIOLATION_PRIOR;
- }
-
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
int count = 0;
if(motherOriginal!=motherNew)
@@ -534,7 +463,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
for(Map.Entry childGenotype : childLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
- configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-12*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
norm += configurationLikelihood;
configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
//Keep this combination if
From 38ebf3141a13ebafcc43b69c3364e0066ba17ea9 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 12:30:04 +0200
Subject: [PATCH 007/380] - Now supports parent/child pairs - Sites with
missing genotypes in pairs/trios are handled as follows: -- Missing child ->
Homozygous parents are phased, no transmission probability is emitted -- Two
individuals missing -> Phase if homozygous, no transmission probability is
emitted -- One parent missing -> Phased / transmission probability emitted -
Mutation prior set as argument
---
.../walkers/phasing/PhaseByTransmission.java | 134 ++++++++++--------
1 file changed, 77 insertions(+), 57 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 62b7bfffa..d128ff40e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -40,13 +40,16 @@ public class PhaseByTransmission extends RodWalker, HashMa
@Argument(shortName = "mvf",required = false,fullName = "MendelianViolationsFile", doc="File to output the mendelian violation details.")
private PrintStream mvFile = null;
+ @Argument(shortName = "prior",required = false,fullName = "DeNovoPrior", doc="Prior for de novo mutations. Default: 1e-8")
+ private double deNovoPrior=1e-8;
+
@Output
protected VCFWriter vcfWriter = null;
private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP";
private final String SOURCE_NAME = "PhaseByTransmission";
- private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8;
+ public final double NO_TRANSMISSION_PROB = -1.0;
private ArrayList trios = new ArrayList();
@@ -240,26 +243,26 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
- public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, int transmissionProb,ArrayList phasedGenotypes){
+ public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
return phasedGenotypes;
}
- private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, int transmissionProb, Genotype phasedGenotype){
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+
+ //Handle null, missing and unavailable genotypes
+ //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
+ //genotype so it is safe to return the original genotype in this case.
+ if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
+ return genotype;
//Add the transmission probability
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
- genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb);
-
- //Handle missing genotype
- if(!phasedGenotype.isAvailable())
- return new Genotype(genotype.getSampleName(), null);
- //Handle NoCall genotype
- else if(phasedGenotype.isNoCall())
- return new Genotype(genotype.getSampleName(), phasedGenotype.getAlleles());
+ if(transmissionProb>NO_TRANSMISSION_PROB)
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
ArrayList phasedAlleles = new ArrayList(2);
for(Allele allele : phasedGenotype.getAlleles()){
@@ -324,8 +327,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
ArrayList parents;
for(String familyID : families.keySet()){
family = families.get(familyID);
- if(family.size()!=3){
- logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios. Family skipped.",familyID,family.size()));
+ if(family.size()<2 || family.size()>3){
+ logger.info(String.format("Caution: Family %s has %d members; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID,family.size()));
}
else{
for(Sample familyMember : family){
@@ -334,7 +337,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(family.containsAll(parents))
this.trios.add(familyMember);
else
- logger.info(String.format("Caution: Family %s is not a trio; At the moment Phase By Transmission only supports trios. Family skipped.",familyID));
+ logger.info(String.format("Caution: Family %s skipped as it is not a trio nor a parent/child pair; At the moment Phase By Transmission only supports trios and parent/child pairs. Family skipped.",familyID));
break;
}
}
@@ -429,64 +432,84 @@ public class PhaseByTransmission extends RodWalker, HashMa
return count;
}
+ private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
+ if(genotype == null || !genotype.isAvailable()){
+ EnumMap likelihoods = new EnumMap(Genotype.Type.class);
+ likelihoods.put(Genotype.Type.UNAVAILABLE,1.0);
+ return likelihoods;
+ }
+ else if(genotype.isNoCall()){
+ EnumMap likelihoods = new EnumMap(Genotype.Type.class);
+ likelihoods.put(Genotype.Type.NO_CALL,1.0);
+ return likelihoods;
+ }
+ return genotype.getLikelihoods().getAsMap(true);
+ }
+
+ private Genotype.Type getTypeSafeNull(Genotype genotype){
+ if(genotype == null)
+ return Genotype.Type.UNAVAILABLE;
+ return genotype.getType();
+ }
+
private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
- //For now, only consider trios with complete information
- //TODO: Phasing of trios with missing information
- if(!(mother.isCalled() && father.isCalled() && child.isCalled())) {
- finalGenotypes.add(mother);
- finalGenotypes.add(father);
- finalGenotypes.add(child);
- return false;
- }
-
//Get the PL
- Map motherLikelihoods = mother.getLikelihoods().getAsMap(true);
- Map fatherLikelihoods = father.getLikelihoods().getAsMap(true);
- Map childLikelihoods = child.getLikelihoods().getAsMap(true);
+ Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
//Prior vars
double bestConfigurationLikelihood = 0.0;
double norm = 0.0;
boolean isMV = false;
int bestConfigurationGenotypeDiffs=4;
- Genotype.Type bestMotherGenotype = mother.getType();
- Genotype.Type bestFatherGenotype = father.getType();
- Genotype.Type bestChildGenotype = child.getType();
+ Genotype.Type bestMotherGenotype = getTypeSafeNull(mother);
+ Genotype.Type bestFatherGenotype = getTypeSafeNull(father);
+ Genotype.Type bestChildGenotype = getTypeSafeNull(child);
//Get the most likely combination
- int mvCount;
- double configurationLikelihood;
- int configurationGenotypeDiffs;
- for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
- for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
- for(Map.Entry childGenotype : childLikelihoods.entrySet()){
- mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
- configurationLikelihood = mvCount>0 ? Math.pow(MENDELIAN_VIOLATION_PRIOR,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*MENDELIAN_VIOLATION_PRIOR)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
- norm += configurationLikelihood;
- configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
- //Keep this combination if
- //It has a better likelihood
- //Or it has the same likelihood but requires less changes from original genotypes
- if ((configurationLikelihood > bestConfigurationLikelihood) ||
- (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
- bestConfigurationLikelihood = configurationLikelihood;
- bestMotherGenotype = motherGenotype.getKey();
- bestFatherGenotype = fatherGenotype.getKey();
- bestChildGenotype = childGenotype.getKey();
- isMV = mvCount>0;
- bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ //Only check for most likely combination if at least a parent and the child have genotypes
+ if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){
+ int mvCount;
+ double configurationLikelihood;
+ int configurationGenotypeDiffs;
+ for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
+ for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
+ configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
+ //Keep this combination if
+ //It has a better likelihood
+ //Or it has the same likelihood but requires less changes from original genotypes
+ if ((configurationLikelihood > bestConfigurationLikelihood) ||
+ (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMotherGenotype = motherGenotype.getKey();
+ bestFatherGenotype = fatherGenotype.getKey();
+ bestChildGenotype = childGenotype.getKey();
+ isMV = mvCount>0;
+ bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ }
}
}
}
+
+ //normalize the best configuration probability
+ bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
+ }
+ else{
+ bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
}
//Get the phased alleles for the genotype configuration
TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype);
//Return the phased genotypes
- phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,MathUtils.probabilityToPhredScale(1-(bestConfigurationLikelihood / norm)),finalGenotypes);
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
return isMV;
}
@@ -522,8 +545,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
Genotype father = vc.getGenotype(sample.getPaternalID());
Genotype child = vc.getGenotype(sample.getID());
- //Skip trios where any of the genotype is missing in the variant context
- if(mother == null || father == null | child == null)
+ //Keep only trios and parent/child pairs
+ if(mother == null && father == null || child == null)
continue;
ArrayList trioGenotypes = new ArrayList(3);
@@ -545,11 +568,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){
numHet++;
- if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet()){
+ if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet())
numHetHetHet++;
- }else if(!phasedMother.isPhased()){
- int x =9;
- }
}
if(isMV){
From 62477a08104c3a158ccbb547ab5f347625676809 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Mon, 24 Oct 2011 13:45:21 +0200
Subject: [PATCH 008/380] Added documentation and comments
---
.../walkers/phasing/PhaseByTransmission.java | 127 ++++++++++++++----
1 file changed, 103 insertions(+), 24 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index d128ff40e..c4c6b69b7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -22,15 +22,57 @@ import java.io.PrintStream;
import java.util.*;
/**
- * Phases a trio VCF (child phased by transmission, implied phase carried over to parents). Given genotypes for a trio,
- * this walker modifies the genotypes (if necessary) to reflect the most likely configuration given the genotype
- * likelihoods and inheritance constraints, phases child by transmission and carries over implied phase to the parents
- * (their alleles in their genotypes are ordered as transmitted|untransmitted). Computes probability that the
- * determined phase is correct given that the genotype configuration is correct (useful if you want to use this to
- * compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally
- * filters out sites where the phasing is indeterminate (site has no-calls), ambiguous (everyone is heterozygous), or
- * the genotypes exhibit a Mendelian violation. This walker assumes there are only three samples in the VCF file to
- * begin.
+ * Computes the most likely genotype combination and phases trios and parent/child pairs
+ *
+ *
+ * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases
+ * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability.
+ * Ambiguous sites are:
+ *
+ * - Sites where all individuals are heterozygous
+ * - Sites where there is a Mendelian violation
+ *
+ * Missing genotypes are handled as follows:
+ *
+ * - In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
+ * - In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
+ * - In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
+ * - In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
+ *
+ *
+ * Input
+ *
+ *
+ * - A VCF variant set containing trio(s) and/or parent/child pair(s).
+ * - A PED pedigree file containing the description of the individuals relationships.
+ *
+ *
+ *
+ * Options
+ *
+ *
+ * - MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype
+ * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability,
+ * and each individual genotype, depth, allelic depth and likelihoods.
+ * - DeNovoPrior: Mutation prio; default is 1e-8
+ *
+ *
+ *
+ * Output
+ *
+ * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous..
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PhaseByTransmission \
+ * -V input.vcf \
+ * -ped input.ped \
+ * -o output.vcf
+ *
+ *
*/
public class PhaseByTransmission extends RodWalker, HashMap> {
@@ -73,7 +115,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
CHILD
}
- //Stores a trio-genotype
+ //Stores a conceptual trio or parent/child pair genotype combination along with its phasing.
+ //This combination can then be "applied" to a given trio or pair using the getPhasedGenotypes method.
private class TrioPhase {
//Create 2 fake alleles
@@ -119,7 +162,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- private void phaseMonoParentFamilyAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
+ //Find the phase for a parent/child pair
+ private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
//Special case for Het/Het as it is ambiguous
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
@@ -155,6 +199,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
+ //Phases a family by transmission
private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
Set> possiblePhasedChildGenotypes = new HashSet>();
@@ -209,7 +254,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
-
+ /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
+ If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
+ or single individual.
+ */
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
//Take care of cases where one or more family members are no call
@@ -225,10 +273,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
else
- phaseMonoParentFamilyAlleles(father, child, FamilyMember.FATHER);
+ phasePairAlleles(father, child, FamilyMember.FATHER);
}
else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
- phaseMonoParentFamilyAlleles(mother, child, FamilyMember.MOTHER);
+ phasePairAlleles(mother, child, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
}
//Special case for Het/Het/Het as it is ambiguous
@@ -243,11 +291,20 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
- public ArrayList getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
+ /**
+ * Applies the trio genotype combination to the given trio.
+ * @param ref: Reference allele
+ * @param alt: Alternate allele
+ * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
+ * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
+ * @param childGenotype: Genotype of the child to phase using this trio genotype combination
+ * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
+ * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
+ */
+ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
- return phasedGenotypes;
}
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
@@ -290,7 +347,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Parse the familial relationship specification, and initialize VCF writer
+ * Parse the familial relationship specification, build the transmission matrices and initialize VCF writer
*/
public void initialize() {
ArrayList rodNames = new ArrayList();
@@ -306,7 +363,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
Set headerLines = new HashSet();
headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit()));
- headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the phase given that the genotypes are correct"));
+ headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct"));
headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
@@ -318,7 +375,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Select Trios only
+ * Select trios and parent/child pairs only
*/
private void setTrios(){
@@ -349,6 +406,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+ //Create the transmission matrices
private void buildMatrices(){
mvCountMatrix = new EnumMap>>(Genotype.Type.class);
transmissionMatrix = new EnumMap>>(Genotype.Type.class);
@@ -366,6 +424,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
+ //Returns the number of Mendelian Violations for a given genotype combination.
+ //If one of the parents genotype is missing, it will consider it as a parent/child pair
+ //If the child genotype or both parents genotypes are missing, 0 is returned.
private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
//Child is no call => No MV
@@ -421,6 +482,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
return 1;
}
+ //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations.
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
int count = 0;
if(motherOriginal!=motherNew)
@@ -432,6 +494,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
return count;
}
+ //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single
+ //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0
private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
if(genotype == null || !genotype.isAvailable()){
EnumMap likelihoods = new EnumMap(Genotype.Type.class);
@@ -446,6 +510,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
return genotype.getLikelihoods().getAsMap(true);
}
+ //Returns the Genotype.Type; returns UNVAILABLE if given null
private Genotype.Type getTypeSafeNull(Genotype genotype){
if(genotype == null)
return Genotype.Type.UNAVAILABLE;
@@ -453,6 +518,16 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+ /**
+ * Phases the genotypes of the given trio. If one of the parents is null, it is considered a parent/child pair.
+ * @param ref: Reference allele
+ * @param alt: Alternative allele
+ * @param mother: Mother's genotype
+ * @param father: Father's genotype
+ * @param child: Child's genotype
+ * @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child
+ * @return
+ */
private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
//Get the PL
@@ -600,9 +675,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Provide an initial value for reduce computations.
+ * Initializes the reporting counters.
*
- * @return Initial value of reduce.
+ * @return All counters initialized to 0
*/
@Override
public HashMap reduceInit() {
@@ -617,10 +692,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
/**
- * Reduces a single map with the accumulator provided as the ReduceType.
+ * Adds the value of the site phased to the reporting counters.
*
- * @param value result of the map.
- * @param sum accumulator for the reduce.
+ * @param value Site values
+ * @param sum accumulator for the reporting counters
* @return accumulator with result of the map taken into account.
*/
@Override
@@ -635,6 +710,10 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+ /**
+ * Reports statistics on the phasing by transmission process.
+ * @param result Accumulator with all counters.
+ */
@Override
public void onTraversalDone(HashMap result) {
logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
From 62cff266d4e20aa2a48f88c47f282d9a6bc25287 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 26 Oct 2011 14:40:04 +0200
Subject: [PATCH 009/380] GQ calculation corrected for most likely genotype
---
.../utils/variantcontext/GenotypeLikelihoods.java | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index b83ffa2fd..47c93bb1b 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -124,10 +124,15 @@ public class GenotypeLikelihoods {
}
- double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
- double chosenGenotype = normalized[genotype.ordinal()-1];
- qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best)
+ qual = likelihoods.get(genotype) - qual;
+ //Quality of other genotypes 1-P(G)
+ if (qual < 0) {
+ double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
+ double chosenGenotype = normalized[genotype.ordinal()-1];
+ qual = -1.0 * Math.log10(1.0 - chosenGenotype);
+ }
return qual;
}
From 81b163ff4d6a8203cb4cb155a92a751ee1b2349d Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 26 Oct 2011 14:49:12 +0200
Subject: [PATCH 010/380] Indentation
---
.../walkers/phasing/PhaseByTransmission.java | 370 +++++++++---------
1 file changed, 185 insertions(+), 185 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index c4c6b69b7..956cf7c89 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -129,220 +129,220 @@ public class PhaseByTransmission extends RodWalker, HashMa
private EnumMap trioPhasedGenotypes = new EnumMap(FamilyMember.class);
private ArrayList getAlleles(Genotype.Type genotype){
- ArrayList alleles = new ArrayList(2);
- if(genotype == Genotype.Type.HOM_REF){
- alleles.add(REF);
- alleles.add(REF);
- }
- else if(genotype == Genotype.Type.HET){
- alleles.add(REF);
- alleles.add(VAR);
- }
- else if(genotype == Genotype.Type.HOM_VAR){
- alleles.add(VAR);
- alleles.add(VAR);
- }
- else if(genotype == Genotype.Type.NO_CALL){
- alleles.add(NO_CALL);
- alleles.add(NO_CALL);
- }
- else{
- return null;
- }
- return alleles;
- }
-
- //Create a new Genotype based on information from a single individual
- //Homozygous genotypes will be set as phased, heterozygous won't be
- private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
- if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
- trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- }
- else
- trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- }
-
- //Find the phase for a parent/child pair
- private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
-
- //Special case for Het/Het as it is ambiguous
- if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ ArrayList alleles = new ArrayList(2);
+ if(genotype == Genotype.Type.HOM_REF){
+ alleles.add(REF);
+ alleles.add(REF);
+ }
+ else if(genotype == Genotype.Type.HET){
+ alleles.add(REF);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.HOM_VAR){
+ alleles.add(VAR);
+ alleles.add(VAR);
+ }
+ else if(genotype == Genotype.Type.NO_CALL){
+ alleles.add(NO_CALL);
+ alleles.add(NO_CALL);
+ }
+ else{
+ return null;
+ }
+ return alleles;
}
- ArrayList parentAlleles = getAlleles(parentGenotype);
- ArrayList childAlleles = getAlleles(childGenotype);
- ArrayList parentPhasedAlleles = new ArrayList(2);
- ArrayList childPhasedAlleles = new ArrayList(2);
-
- //If there is a possible phasing between the mother and child => phase
- int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
- if(childTransmittedAlleleIndex > -1){
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- childPhasedAlleles.add(childAlleles.get(0));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ //Create a new Genotype based on information from a single individual
+ //Homozygous genotypes will be set as phased, heterozygous won't be
+ private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
+ if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ else
+ trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
- parentPhasedAlleles.add(parentAlleles.get(1));
- parentPhasedAlleles.add(parentAlleles.get(0));
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
- childPhasedAlleles.add(childAlleles.get(0));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
- }
- //This is a Mendelian Violation => Do not phase
- else{
- trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- }
- }
- //Phases a family by transmission
- private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
+ //Find the phase for a parent/child pair
+ private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
- Set> possiblePhasedChildGenotypes = new HashSet>();
- ArrayList motherAlleles = getAlleles(mother);
- ArrayList fatherAlleles = getAlleles(father);
- ArrayList childAlleles = getAlleles(child);
+ //Special case for Het/Het as it is ambiguous
+ if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ }
- //Build all possible child genotypes for the given parent's genotypes
- for (Allele momAllele : motherAlleles) {
- for (Allele fatherAllele : fatherAlleles) {
- ArrayList possiblePhasedChildAlleles = new ArrayList(2);
- possiblePhasedChildAlleles.add(momAllele);
- possiblePhasedChildAlleles.add(fatherAllele);
- possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
+ ArrayList parentAlleles = getAlleles(parentGenotype);
+ ArrayList childAlleles = getAlleles(childGenotype);
+ ArrayList parentPhasedAlleles = new ArrayList(2);
+ ArrayList childPhasedAlleles = new ArrayList(2);
+
+ //If there is a possible phasing between the mother and child => phase
+ int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
+ if(childTransmittedAlleleIndex > -1){
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
+ parentPhasedAlleles.add(parentAlleles.get(1));
+ parentPhasedAlleles.add(parentAlleles.get(0));
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
+ childPhasedAlleles.add(childAlleles.get(0));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_NEG_LOG_10PERROR, null, null, true));
+ }
+ //This is a Mendelian Violation => Do not phase
+ else{
+ trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
}
- for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
- int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
- int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
- //If a possible combination has been found, create the genotypes
- if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
- //Create mother's genotype
- ArrayList motherPhasedAlleles = new ArrayList(2);
- motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
- if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
- motherPhasedAlleles.add(motherAlleles.get(0));
- else
- motherPhasedAlleles.add(motherAlleles.get(1));
- trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+ //Phases a family by transmission
+ private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- //Create father's genotype
- ArrayList fatherPhasedAlleles = new ArrayList(2);
- fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
- if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
- fatherPhasedAlleles.add(fatherAlleles.get(0));
- else
- fatherPhasedAlleles.add(fatherAlleles.get(1));
- trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+ Set> possiblePhasedChildGenotypes = new HashSet>();
+ ArrayList motherAlleles = getAlleles(mother);
+ ArrayList fatherAlleles = getAlleles(father);
+ ArrayList childAlleles = getAlleles(child);
- //Create child's genotype
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
-
- //Once a phased combination is found; exit
- return;
+ //Build all possible child genotypes for the given parent's genotypes
+ for (Allele momAllele : motherAlleles) {
+ for (Allele fatherAllele : fatherAlleles) {
+ ArrayList possiblePhasedChildAlleles = new ArrayList(2);
+ possiblePhasedChildAlleles.add(momAllele);
+ possiblePhasedChildAlleles.add(fatherAllele);
+ possiblePhasedChildGenotypes.add(possiblePhasedChildAlleles);
+ }
}
+
+ for (ArrayList childPhasedAllelesAlleles : possiblePhasedChildGenotypes) {
+ int firstAlleleIndex = childPhasedAllelesAlleles.indexOf(childAlleles.get(0));
+ int secondAlleleIndex = childPhasedAllelesAlleles.lastIndexOf(childAlleles.get(1));
+ //If a possible combination has been found, create the genotypes
+ if (firstAlleleIndex != secondAlleleIndex && firstAlleleIndex > -1 && secondAlleleIndex > -1) {
+ //Create mother's genotype
+ ArrayList motherPhasedAlleles = new ArrayList(2);
+ motherPhasedAlleles.add(childPhasedAllelesAlleles.get(0));
+ if(motherAlleles.get(0) != motherPhasedAlleles.get(0))
+ motherPhasedAlleles.add(motherAlleles.get(0));
+ else
+ motherPhasedAlleles.add(motherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Create father's genotype
+ ArrayList fatherPhasedAlleles = new ArrayList(2);
+ fatherPhasedAlleles.add(childPhasedAllelesAlleles.get(1));
+ if(fatherAlleles.get(0) != fatherPhasedAlleles.get(0))
+ fatherPhasedAlleles.add(fatherAlleles.get(0));
+ else
+ fatherPhasedAlleles.add(fatherAlleles.get(1));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Create child's genotype
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_NEG_LOG_10PERROR,null,null,true));
+
+ //Once a phased combination is found; exit
+ return;
+ }
+ }
+
+ //If this is reached then no phasing could be found
+ trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
}
- //If this is reached then no phasing could be found
- trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
- }
+ /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
+ If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
+ or single individual.
+ */
+ public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
- /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
- If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
- or single individual.
- */
- public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
-
- //Take care of cases where one or more family members are no call
- if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){
- phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
- phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
- phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
- }
- else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){
- phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
- if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ //Take care of cases where one or more family members are no call
+ if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
}
- else
- phasePairAlleles(father, child, FamilyMember.FATHER);
+ else if(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ else
+ phasePairAlleles(father, child, FamilyMember.FATHER);
+ }
+ else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
+ phasePairAlleles(mother, child, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ }
+ //Special case for Het/Het/Het as it is ambiguous
+ else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
+ phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
+ phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+ phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
+ }
+ //All family members have genotypes and at least one of them is not Het
+ else{
+ phaseFamilyAlleles(mother, father, child);
+ }
}
- else if(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE){
- phasePairAlleles(mother, child, FamilyMember.MOTHER);
- phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
+
+ /**
+ * Applies the trio genotype combination to the given trio.
+ * @param ref: Reference allele
+ * @param alt: Alternate allele
+ * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
+ * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
+ * @param childGenotype: Genotype of the child to phase using this trio genotype combination
+ * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
+ * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
+ */
+ public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
+ phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
}
- //Special case for Het/Het/Het as it is ambiguous
- else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
- phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
- phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
- phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
- }
- //All family members have genotypes and at least one of them is not Het
- else{
- phaseFamilyAlleles(mother, father, child);
- }
- }
- /**
- * Applies the trio genotype combination to the given trio.
- * @param ref: Reference allele
- * @param alt: Alternate allele
- * @param motherGenotype: Genotype of the mother to phase using this trio genotype combination
- * @param fatherGenotype: Genotype of the father to phase using this trio genotype combination
- * @param childGenotype: Genotype of the child to phase using this trio genotype combination
- * @param transmissionProb: Probability for this trio genotype combination to be correct (pass NO_TRANSMISSION_PROB if unavailable)
- * @param phasedGenotypes: An ArrayList to which the newly phased genotypes are added in the following order: Mother, Father, Child
- */
- public void getPhasedGenotypes(Allele ref, Allele alt, Genotype motherGenotype, Genotype fatherGenotype, Genotype childGenotype, double transmissionProb,ArrayList phasedGenotypes){
- phasedGenotypes.add(getPhasedGenotype(ref,alt,motherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.MOTHER)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,fatherGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.FATHER)));
- phasedGenotypes.add(getPhasedGenotype(ref,alt,childGenotype,transmissionProb,this.trioPhasedGenotypes.get(FamilyMember.CHILD)));
- }
+ private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
- private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+ //Handle null, missing and unavailable genotypes
+ //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
+ //genotype so it is safe to return the original genotype in this case.
+ if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
+ return genotype;
- //Handle null, missing and unavailable genotypes
- //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
- //genotype so it is safe to return the original genotype in this case.
- if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
- return genotype;
+ //Add the transmission probability
+ Map genotypeAttributes = new HashMap();
+ genotypeAttributes.putAll(genotype.getAttributes());
+ if(transmissionProb>NO_TRANSMISSION_PROB)
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
- //Add the transmission probability
- Map genotypeAttributes = new HashMap();
- genotypeAttributes.putAll(genotype.getAttributes());
- if(transmissionProb>NO_TRANSMISSION_PROB)
- genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
+ ArrayList phasedAlleles = new ArrayList(2);
+ for(Allele allele : phasedGenotype.getAlleles()){
+ if(allele.isReference())
+ phasedAlleles.add(refAllele);
+ else if(allele.isNonReference())
+ phasedAlleles.add(altAllele);
+ //At this point there should not be any other alleles left
+ else
+ throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
- ArrayList phasedAlleles = new ArrayList(2);
- for(Allele allele : phasedGenotype.getAlleles()){
- if(allele.isReference())
- phasedAlleles.add(refAllele);
- else if(allele.isNonReference())
- phasedAlleles.add(altAllele);
- //At this point there should not be any other alleles left
+ }
+
+ //Compute the new Log10Error if the genotype is different from the original genotype
+ double negLog10Error;
+ if(genotype.getType() == phasedGenotype.getType())
+ negLog10Error = genotype.getNegLog10PError();
else
- throw new UserException(String.format("BUG: Unexpected allele: %s. Please report.",allele.toString()));
+ negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
+ return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
}
- //Compute the new Log10Error if the genotype is different from the original genotype
- double negLog10Error;
- if(genotype.getType() == phasedGenotype.getType())
- negLog10Error = genotype.getNegLog10PError();
- else
- negLog10Error = genotype.getLikelihoods().getNegLog10GQ(phasedGenotype.getType());
-
- return new Genotype(genotype.getSampleName(), phasedAlleles, negLog10Error, null, genotypeAttributes, phasedGenotype.isPhased());
- }
-
}
From 1f044faedd01317b71c3abf91ac935beb6581223 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 26 Oct 2011 19:57:09 +0200
Subject: [PATCH 011/380] - Genotype assignment in case of equally likeli
combination is now random - Genotype combinations with 0 confidence are now
left unphased
---
.../walkers/phasing/PhaseByTransmission.java | 65 +++++++++++++------
1 file changed, 44 insertions(+), 21 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 956cf7c89..f5e9ce6ea 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -109,6 +109,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_HET_HET_HET = 4;
private final Byte NUM_VIOLATIONS = 5;
+ //Random number generator
+ private Random rand = new Random();
+
private enum FamilyMember {
MOTHER,
FATHER,
@@ -309,17 +312,22 @@ public class PhaseByTransmission extends RodWalker, HashMa
private Genotype getPhasedGenotype(Allele refAllele, Allele altAllele, Genotype genotype, double transmissionProb, Genotype phasedGenotype){
+ int phredScoreTransmission = -1;
+ if(transmissionProb != NO_TRANSMISSION_PROB)
+ phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb));
+
//Handle null, missing and unavailable genotypes
//Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable
//genotype so it is safe to return the original genotype in this case.
- if(genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
+ //In addition, if the phasing confidence is 0, then return the unphased, original genotypes.
+ if(phredScoreTransmission ==0 || genotype == null || !phasedGenotype.isAvailable() || phasedGenotype.isNoCall())
return genotype;
//Add the transmission probability
Map genotypeAttributes = new HashMap();
genotypeAttributes.putAll(genotype.getAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB)
- genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, MathUtils.probabilityToPhredScale(1-(transmissionProb)));
+ genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
ArrayList phasedAlleles = new ArrayList(2);
for(Allele allele : phasedGenotype.getAlleles()){
@@ -538,36 +546,46 @@ public class PhaseByTransmission extends RodWalker, HashMa
//Prior vars
double bestConfigurationLikelihood = 0.0;
double norm = 0.0;
- boolean isMV = false;
- int bestConfigurationGenotypeDiffs=4;
- Genotype.Type bestMotherGenotype = getTypeSafeNull(mother);
- Genotype.Type bestFatherGenotype = getTypeSafeNull(father);
- Genotype.Type bestChildGenotype = getTypeSafeNull(child);
+ int configuration_index =0;
+ ArrayList isMV = new ArrayList();
+ isMV.add(false);
+ ArrayList bestMotherGenotype = new ArrayList();
+ bestMotherGenotype.add(getTypeSafeNull(mother));
+ ArrayList bestFatherGenotype = new ArrayList();
+ bestFatherGenotype.add(getTypeSafeNull(father));
+ ArrayList bestChildGenotype = new ArrayList();
+ bestChildGenotype.add(getTypeSafeNull(child));
//Get the most likely combination
//Only check for most likely combination if at least a parent and the child have genotypes
if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){
int mvCount;
double configurationLikelihood;
- int configurationGenotypeDiffs;
for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
for(Map.Entry childGenotype : childLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
norm += configurationLikelihood;
- configurationGenotypeDiffs = countFamilyGenotypeDiff(mother.getType(),father.getType(),child.getType(),motherGenotype.getKey(),fatherGenotype.getKey(),childGenotype.getKey());
//Keep this combination if
//It has a better likelihood
//Or it has the same likelihood but requires less changes from original genotypes
- if ((configurationLikelihood > bestConfigurationLikelihood) ||
- (configurationLikelihood == bestConfigurationLikelihood && configurationGenotypeDiffs < bestConfigurationGenotypeDiffs)) {
+ if (configurationLikelihood > bestConfigurationLikelihood){
bestConfigurationLikelihood = configurationLikelihood;
- bestMotherGenotype = motherGenotype.getKey();
- bestFatherGenotype = fatherGenotype.getKey();
- bestChildGenotype = childGenotype.getKey();
- isMV = mvCount>0;
- bestConfigurationGenotypeDiffs=configurationGenotypeDiffs;
+ isMV.clear();
+ isMV.add(mvCount>0);
+ bestMotherGenotype.clear();
+ bestMotherGenotype.add(motherGenotype.getKey());
+ bestFatherGenotype.clear();
+ bestFatherGenotype.add(fatherGenotype.getKey());
+ bestChildGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestMotherGenotype.add(motherGenotype.getKey());
+ bestFatherGenotype.add(fatherGenotype.getKey());
+ bestChildGenotype.add(childGenotype.getKey());
+ isMV.add(mvCount>0);
}
}
}
@@ -575,17 +593,22 @@ public class PhaseByTransmission extends RodWalker, HashMa
//normalize the best configuration probability
bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
+
+ //In case of multiple equally likely combinations, take a random one
+ if(bestMotherGenotype.size()>1){
+ configuration_index = rand.nextInt(bestMotherGenotype.size()-1);
+ }
+
}
else{
bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
}
- //Get the phased alleles for the genotype configuration
- TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype).get(bestFatherGenotype).get(bestChildGenotype);
+ TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
- //Return the phased genotypes
- phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
- return isMV;
+ //Return the phased genotypes
+ phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
+ return isMV.get(configuration_index);
}
From b91a9c4711d60b361a226a517a97e68217fdd5cf Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 2 Nov 2011 08:04:01 +0100
Subject: [PATCH 012/380] - Fixed parent/child pairs handling (was crashing
before) - Added parent/child pair reporting
---
.../walkers/phasing/PhaseByTransmission.java | 151 ++++++++++++------
1 file changed, 101 insertions(+), 50 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index f5e9ce6ea..5b2024f96 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -105,9 +105,13 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
private final Byte NUM_TRIO_GENOTYPES_NOCALL = 1;
private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
- private final Byte NUM_HET = 3;
- private final Byte NUM_HET_HET_HET = 4;
- private final Byte NUM_VIOLATIONS = 5;
+ private final Byte NUM_TRIO_HET_HET_HET = 3;
+ private final Byte NUM_TRIO_VIOLATIONS = 4;
+ private final Byte NUM_PAIR_GENOTYPES_CALLED = 5;
+ private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6;
+ private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
+ private final Byte NUM_PAIR_HET_HET = 8;
+ private final Byte NUM_PAIR_VIOLATIONS = 9;
//Random number generator
private Random rand = new Random();
@@ -172,6 +176,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_NEG_LOG_10PERROR, null, null, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_NEG_LOG_10PERROR,null,null,false));
+ return;
}
ArrayList parentAlleles = getAlleles(parentGenotype);
@@ -612,6 +617,48 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
+
+ private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){
+
+ //Increment metrics counters
+ if(parent.isCalled() && child.isCalled()){
+ counters.put(NUM_PAIR_GENOTYPES_CALLED,counters.get(NUM_PAIR_GENOTYPES_CALLED)+1);
+ if(parent.isPhased())
+ counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
+ else{
+ if(parent.isHet() && child.isHet())
+ counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
+
+ else if(isMV)
+ counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1);
+ }
+ }else{
+ counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
+ }
+
+ }
+
+ private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){
+
+ //Increment metrics counters
+ if(mother.isCalled() && father.isCalled() && child.isCalled()){
+ counters.put(NUM_TRIO_GENOTYPES_CALLED,counters.get(NUM_TRIO_GENOTYPES_CALLED)+1);
+ if(mother.isPhased())
+ counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
+
+ else{
+ if(mother.isHet() && father.isHet() && child.isHet())
+ counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
+
+ else if(isMV)
+ counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+
+ }
+ }else{
+ counters.put(NUM_TRIO_GENOTYPES_NOCALL,counters.get(NUM_TRIO_GENOTYPES_NOCALL)+1);
+ }
+ }
+
/**
* For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype
*
@@ -623,13 +670,17 @@ public class PhaseByTransmission extends RodWalker, HashMa
@Override
public HashMap map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
- //Local cars to avoid lookups on increment
- int numTrioGenotypesCalled = 0;
- int numTrioGenotypesNoCall = 0;
- int numTrioGenotypesPhased = 0;
- int numHet = 0 ;
- int numHetHetHet = 0;
- int numMVs = 0;
+ HashMap metricsCounters = new HashMap(10);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
@@ -654,30 +705,26 @@ public class PhaseByTransmission extends RodWalker, HashMa
Genotype phasedFather = trioGenotypes.get(1);
Genotype phasedChild = trioGenotypes.get(2);
- genotypeMap.put(phasedMother.getSampleName(), phasedMother);
- genotypeMap.put(phasedFather.getSampleName(), phasedFather);
- genotypeMap.put(phasedChild.getSampleName(), phasedChild);
-
- //Increment metrics counters
- if(phasedMother.isCalled() && phasedFather.isCalled() && phasedChild.isCalled()){
- numTrioGenotypesCalled++;
- if(phasedMother.isPhased())
- numTrioGenotypesPhased++;
-
- if(phasedMother.isHet() || phasedFather.isHet() || phasedChild.isHet()){
- numHet++;
- if(phasedMother.isHet() && phasedFather.isHet() && phasedChild.isHet())
- numHetHetHet++;
+ //Fill the genotype map with the new genotypes and increment metrics counters
+ genotypeMap.put(phasedChild.getSampleName(),phasedChild);
+ if(mother != null){
+ genotypeMap.put(phasedMother.getSampleName(), phasedMother);
+ if(father != null){
+ genotypeMap.put(phasedFather.getSampleName(), phasedFather);
+ updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters);
}
-
- if(isMV){
- numMVs++;
- if(mvFile != null)
- mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
- }
- }else{
- numTrioGenotypesNoCall++;
+ else
+ updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters);
}
+ else{
+ genotypeMap.put(phasedFather.getSampleName(),phasedFather);
+ updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters);
+ }
+
+ //Report violation if set so
+ //TODO: ADAPT FOR PAIRS TOO!!
+ if(isMV && mvFile != null)
+ mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
}
@@ -686,14 +733,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
vcfWriter.add(newvc);
}
-
- HashMap metricsCounters = new HashMap(5);
- metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,numTrioGenotypesCalled);
- metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,numTrioGenotypesNoCall);
- metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,numTrioGenotypesPhased);
- metricsCounters.put(NUM_HET,numHet);
- metricsCounters.put(NUM_HET_HET_HET,numHetHetHet);
- metricsCounters.put(NUM_VIOLATIONS,numMVs);
return metricsCounters;
}
@@ -704,13 +743,17 @@ public class PhaseByTransmission extends RodWalker, HashMa
*/
@Override
public HashMap reduceInit() {
- HashMap metricsCounters = new HashMap(5);
+ HashMap metricsCounters = new HashMap(10);
metricsCounters.put(NUM_TRIO_GENOTYPES_CALLED,0);
metricsCounters.put(NUM_TRIO_GENOTYPES_NOCALL,0);
metricsCounters.put(NUM_TRIO_GENOTYPES_PHASED,0);
- metricsCounters.put(NUM_HET,0);
- metricsCounters.put(NUM_HET_HET_HET,0);
- metricsCounters.put(NUM_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_HET_HET_HET,0);
+ metricsCounters.put(NUM_TRIO_VIOLATIONS,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_CALLED,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_NOCALL,0);
+ metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
+ metricsCounters.put(NUM_PAIR_HET_HET,0);
+ metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
return metricsCounters;
}
@@ -726,9 +769,13 @@ public class PhaseByTransmission extends RodWalker, HashMa
sum.put(NUM_TRIO_GENOTYPES_CALLED,value.get(NUM_TRIO_GENOTYPES_CALLED)+sum.get(NUM_TRIO_GENOTYPES_CALLED));
sum.put(NUM_TRIO_GENOTYPES_NOCALL,value.get(NUM_TRIO_GENOTYPES_NOCALL)+sum.get(NUM_TRIO_GENOTYPES_NOCALL));
sum.put(NUM_TRIO_GENOTYPES_PHASED,value.get(NUM_TRIO_GENOTYPES_PHASED)+sum.get(NUM_TRIO_GENOTYPES_PHASED));
- sum.put(NUM_HET,value.get(NUM_HET)+sum.get(NUM_HET));
- sum.put(NUM_HET_HET_HET,value.get(NUM_HET_HET_HET)+sum.get(NUM_HET_HET_HET));
- sum.put(NUM_VIOLATIONS,value.get(NUM_VIOLATIONS)+sum.get(NUM_VIOLATIONS));
+ sum.put(NUM_TRIO_HET_HET_HET,value.get(NUM_TRIO_HET_HET_HET)+sum.get(NUM_TRIO_HET_HET_HET));
+ sum.put(NUM_TRIO_VIOLATIONS,value.get(NUM_TRIO_VIOLATIONS)+sum.get(NUM_TRIO_VIOLATIONS));
+ sum.put(NUM_PAIR_GENOTYPES_CALLED,value.get(NUM_PAIR_GENOTYPES_CALLED)+sum.get(NUM_PAIR_GENOTYPES_CALLED));
+ sum.put(NUM_PAIR_GENOTYPES_NOCALL,value.get(NUM_PAIR_GENOTYPES_NOCALL)+sum.get(NUM_PAIR_GENOTYPES_NOCALL));
+ sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED));
+ sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
+ sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
return sum;
}
@@ -742,8 +789,12 @@ public class PhaseByTransmission extends RodWalker, HashMa
logger.info("Number of complete trio-genotypes: " + result.get(NUM_TRIO_GENOTYPES_CALLED));
logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
- logger.info("Number of resulting Hom/Hom/Hom trios: " + result.get(NUM_HET));
- logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_HET_HET_HET));
- logger.info("Number of remaining mendelian violations: " + result.get(NUM_VIOLATIONS));
+ logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET));
+ logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED));
+ logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL));
+ logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED));
+ logger.info("Number of resulting Het/Het pairs: " + result.get(NUM_PAIR_HET_HET));
+ logger.info("Number of remaining mendelian violations in pairs: " + result.get(NUM_PAIR_VIOLATIONS));
}
}
From 119ca7d742371423060bcc7150f908e1cb04de8a Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 2 Nov 2011 08:22:33 +0100
Subject: [PATCH 013/380] Fixed a bug in parent/child pairs reporting causing a
crash in case the -mvf option was used and mother was not provided
---
.../sting/gatk/walkers/phasing/PhaseByTransmission.java | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 5b2024f96..806a84e9d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -681,6 +681,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ String mvfLine = "";
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
@@ -712,19 +713,23 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(father != null){
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
- else
+ else{
updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
+ }
}
else{
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters);
+ mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
//Report violation if set so
//TODO: ADAPT FOR PAIRS TOO!!
if(isMV && mvFile != null)
- mvFile.println(String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString()));
+ mvFile.println(mvfLine);
}
From 19ad5b635ad46f119ed1beccccf7bb2bc6a18aef Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Wed, 2 Nov 2011 18:35:31 +0100
Subject: [PATCH 014/380] - Calculation of parent/child pairs corrected -
Separated the reporting of single and double mendelian violations in trios
---
.../walkers/phasing/PhaseByTransmission.java | 177 ++++++++++++------
1 file changed, 120 insertions(+), 57 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 806a84e9d..3c39b58d3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -107,6 +107,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
private final Byte NUM_TRIO_GENOTYPES_PHASED = 2;
private final Byte NUM_TRIO_HET_HET_HET = 3;
private final Byte NUM_TRIO_VIOLATIONS = 4;
+ private final Byte NUM_TRIO_DOUBLE_VIOLATIONS = 10;
private final Byte NUM_PAIR_GENOTYPES_CALLED = 5;
private final Byte NUM_PAIR_GENOTYPES_NOCALL = 6;
private final Byte NUM_PAIR_GENOTYPES_PHASED = 7;
@@ -507,17 +508,14 @@ public class PhaseByTransmission extends RodWalker, HashMa
return count;
}
- //Get a Map of genotype likelihoods. If the genotype is NO_CALL or UNAVAILABLE, the Map will contain a single
- //NO_CALL resp. UNAVAILABLE element with a likelihood of 1.0
+ //Get a Map of genotype likelihoods.
+ //In case of null, unavailable or no call, all likelihoods are 1/3.
private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){
- if(genotype == null || !genotype.isAvailable()){
+ if(genotype == null || !genotype.isCalled()){
EnumMap likelihoods = new EnumMap(Genotype.Type.class);
- likelihoods.put(Genotype.Type.UNAVAILABLE,1.0);
- return likelihoods;
- }
- else if(genotype.isNoCall()){
- EnumMap likelihoods = new EnumMap(Genotype.Type.class);
- likelihoods.put(Genotype.Type.NO_CALL,1.0);
+ likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0);
+ likelihoods.put(Genotype.Type.HET,1.0/3.0);
+ likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0);
return likelihoods;
}
return genotype.getLikelihoods().getAsMap(true);
@@ -541,57 +539,113 @@ public class PhaseByTransmission extends RodWalker, HashMa
* @param finalGenotypes: An ArrayList that will be added the genotypes phased by transmission in the following order: Mother, Father, Child
* @return
*/
- private boolean phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
+ private int phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child,ArrayList finalGenotypes) {
- //Get the PL
- Map motherLikelihoods = getLikelihoodsAsMapSafeNull(mother);
- Map fatherLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ //Check whether it is a pair or trio
+ //Always assign the first parent as the parent having genotype information in pairs
+ //Always assign the mother as the first parent in trios
+ int parentsCalled = 0;
+ Map firstParentLikelihoods;
+ Map secondParentLikelihoods;
+ Genotype.Type pairSecondParentGenotype = null;
+ if(mother == null || !mother.isCalled()){
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
+ if(father != null && father.isCalled())
+ parentsCalled = 1;
+ }
+ else{
+ firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ if(father == null || !father.isCalled()){
+ parentsCalled = 1;
+ pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
+ }else{
+ parentsCalled = 2;
+ }
+ }
Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
//Prior vars
double bestConfigurationLikelihood = 0.0;
double norm = 0.0;
int configuration_index =0;
- ArrayList isMV = new ArrayList();
- isMV.add(false);
- ArrayList bestMotherGenotype = new ArrayList();
- bestMotherGenotype.add(getTypeSafeNull(mother));
- ArrayList bestFatherGenotype = new ArrayList();
- bestFatherGenotype.add(getTypeSafeNull(father));
+ ArrayList bestMVCount = new ArrayList();
+ bestMVCount.add(0);
+
+ ArrayList bestFirstParentGenotype = new ArrayList();
+ ArrayList bestSecondParentGenotype = new ArrayList();
ArrayList bestChildGenotype = new ArrayList();
+ bestFirstParentGenotype.add(getTypeSafeNull(mother));
+ bestSecondParentGenotype.add(getTypeSafeNull(father));
bestChildGenotype.add(getTypeSafeNull(child));
//Get the most likely combination
//Only check for most likely combination if at least a parent and the child have genotypes
- if(childLikelihoods.size()>2 && (motherLikelihoods.size() + fatherLikelihoods.size())>3){
+ if(child.isCalled() && parentsCalled > 0){
int mvCount;
- double configurationLikelihood;
- for(Map.Entry motherGenotype : motherLikelihoods.entrySet()){
- for(Map.Entry fatherGenotype : fatherLikelihoods.entrySet()){
- for(Map.Entry childGenotype : childLikelihoods.entrySet()){
- mvCount = mvCountMatrix.get(motherGenotype.getKey()).get(fatherGenotype.getKey()).get(childGenotype.getKey());
- configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*motherGenotype.getValue()*fatherGenotype.getValue()*childGenotype.getValue();
+ int cumulativeMVCount = 0;
+ double configurationLikelihood = 0;
+ for(Map.Entry childGenotype : childLikelihoods.entrySet()){
+ for(Map.Entry firstParentGenotype : firstParentLikelihoods.entrySet()){
+ for(Map.Entry secondParentGenotype : secondParentLikelihoods.entrySet()){
+ mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
+ //For parent/child pairs, sum over the possible genotype configurations of the missing parent
+ if(parentsCalled<2){
+ cumulativeMVCount += mvCount;
+ configurationLikelihood += mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ }
+ //Evaluate configurations of trios
+ else{
+ configurationLikelihood = mvCount>0 ? Math.pow(deNovoPrior,mvCount)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue() : (1.0-11*deNovoPrior)*firstParentGenotype.getValue()*secondParentGenotype.getValue()*childGenotype.getValue();
+ norm += configurationLikelihood;
+ //Keep this combination if
+ //It has a better likelihood
+ //Or it has the same likelihood but requires less changes from original genotypes
+ if (configurationLikelihood > bestConfigurationLikelihood){
+ bestConfigurationLikelihood = configurationLikelihood;
+ bestMVCount.clear();
+ bestMVCount.add(mvCount);
+ bestFirstParentGenotype.clear();
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.clear();
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.clear();
+ bestChildGenotype.add(childGenotype.getKey());
+ }
+ else if(configurationLikelihood == bestConfigurationLikelihood) {
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(secondParentGenotype.getKey());
+ bestChildGenotype.add(childGenotype.getKey());
+ bestMVCount.add(mvCount);
+ }
+ }
+ }
+ //Evaluate configurations of parent/child pairs
+ if(parentsCalled<2){
norm += configurationLikelihood;
//Keep this combination if
//It has a better likelihood
//Or it has the same likelihood but requires less changes from original genotypes
if (configurationLikelihood > bestConfigurationLikelihood){
bestConfigurationLikelihood = configurationLikelihood;
- isMV.clear();
- isMV.add(mvCount>0);
- bestMotherGenotype.clear();
- bestMotherGenotype.add(motherGenotype.getKey());
- bestFatherGenotype.clear();
- bestFatherGenotype.add(fatherGenotype.getKey());
+ bestMVCount.clear();
+ bestMVCount.add(cumulativeMVCount/3);
bestChildGenotype.clear();
+ bestFirstParentGenotype.clear();
+ bestSecondParentGenotype.clear();
bestChildGenotype.add(childGenotype.getKey());
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
}
else if(configurationLikelihood == bestConfigurationLikelihood) {
- bestMotherGenotype.add(motherGenotype.getKey());
- bestFatherGenotype.add(fatherGenotype.getKey());
+ bestFirstParentGenotype.add(firstParentGenotype.getKey());
+ bestSecondParentGenotype.add(pairSecondParentGenotype);
bestChildGenotype.add(childGenotype.getKey());
- isMV.add(mvCount>0);
+ bestMVCount.add(cumulativeMVCount/3);
}
+ configurationLikelihood = 0;
}
}
}
@@ -600,8 +654,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
bestConfigurationLikelihood = bestConfigurationLikelihood / norm;
//In case of multiple equally likely combinations, take a random one
- if(bestMotherGenotype.size()>1){
- configuration_index = rand.nextInt(bestMotherGenotype.size()-1);
+ if(bestFirstParentGenotype.size()>1){
+ configuration_index = rand.nextInt(bestFirstParentGenotype.size()-1);
}
}
@@ -609,16 +663,20 @@ public class PhaseByTransmission extends RodWalker, HashMa
bestConfigurationLikelihood = NO_TRANSMISSION_PROB;
}
- TrioPhase phasedTrioGenotypes = transmissionMatrix.get(bestMotherGenotype.get(configuration_index)).get(bestFatherGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+ TrioPhase phasedTrioGenotypes;
+ if(parentsCalled < 2 && mother == null || !mother.isCalled())
+ phasedTrioGenotypes = transmissionMatrix.get(bestSecondParentGenotype.get(configuration_index)).get(bestFirstParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
+ else
+ phasedTrioGenotypes = transmissionMatrix.get(bestFirstParentGenotype.get(configuration_index)).get(bestSecondParentGenotype.get(configuration_index)).get(bestChildGenotype.get(configuration_index));
//Return the phased genotypes
phasedTrioGenotypes.getPhasedGenotypes(ref,alt,mother,father,child,bestConfigurationLikelihood,finalGenotypes);
- return isMV.get(configuration_index);
+ return bestMVCount.get(configuration_index);
}
- private void updatePairMetricsCounters(Genotype parent, Genotype child, boolean isMV, HashMap counters){
+ private void updatePairMetricsCounters(Genotype parent, Genotype child, int mvCount, HashMap counters){
//Increment metrics counters
if(parent.isCalled() && child.isCalled()){
@@ -626,11 +684,9 @@ public class PhaseByTransmission extends RodWalker, HashMa
if(parent.isPhased())
counters.put(NUM_PAIR_GENOTYPES_PHASED,counters.get(NUM_PAIR_GENOTYPES_PHASED)+1);
else{
+ counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+mvCount);
if(parent.isHet() && child.isHet())
counters.put(NUM_PAIR_HET_HET,counters.get(NUM_PAIR_HET_HET)+1);
-
- else if(isMV)
- counters.put(NUM_PAIR_VIOLATIONS,counters.get(NUM_PAIR_VIOLATIONS)+1);
}
}else{
counters.put(NUM_PAIR_GENOTYPES_NOCALL,counters.get(NUM_PAIR_GENOTYPES_NOCALL)+1);
@@ -638,7 +694,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
- private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, boolean isMV, HashMap counters){
+ private void updateTrioMetricsCounters(Genotype mother, Genotype father, Genotype child, int mvCount, HashMap counters){
//Increment metrics counters
if(mother.isCalled() && father.isCalled() && child.isCalled()){
@@ -647,11 +703,14 @@ public class PhaseByTransmission extends RodWalker, HashMa
counters.put(NUM_TRIO_GENOTYPES_PHASED,counters.get(NUM_TRIO_GENOTYPES_PHASED)+1);
else{
- if(mother.isHet() && father.isHet() && child.isHet())
- counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
-
- else if(isMV)
- counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+ if(mvCount > 0){
+ if(mvCount >1)
+ counters.put(NUM_TRIO_DOUBLE_VIOLATIONS,counters.get(NUM_TRIO_DOUBLE_VIOLATIONS)+1);
+ else
+ counters.put(NUM_TRIO_VIOLATIONS,counters.get(NUM_TRIO_VIOLATIONS)+1);
+ }
+ else if(mother.isHet() && father.isHet() && child.isHet())
+ counters.put(NUM_TRIO_HET_HET_HET,counters.get(NUM_TRIO_HET_HET_HET)+1);
}
}else{
@@ -681,14 +740,15 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
- String mvfLine = "";
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
+ String mvfLine;
if (tracker != null) {
VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation());
Map genotypeMap = vc.getGenotypes();
- boolean isMV;
+ int mvCount;
for (Sample sample : trios) {
Genotype mother = vc.getGenotype(sample.getMaternalID());
@@ -700,7 +760,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
continue;
ArrayList trioGenotypes = new ArrayList(3);
- isMV = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
+ mvCount = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child,trioGenotypes);
Genotype phasedMother = trioGenotypes.get(0);
Genotype phasedFather = trioGenotypes.get(1);
@@ -712,23 +772,23 @@ public class PhaseByTransmission extends RodWalker, HashMa
genotypeMap.put(phasedMother.getSampleName(), phasedMother);
if(father != null){
genotypeMap.put(phasedFather.getSampleName(), phasedFather);
- updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,isMV,metricsCounters);
+ updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
else{
- updatePairMetricsCounters(phasedMother,phasedChild,isMV,metricsCounters);
+ updatePairMetricsCounters(phasedMother,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
}
else{
genotypeMap.put(phasedFather.getSampleName(),phasedFather);
- updatePairMetricsCounters(phasedFather,phasedChild,isMV,metricsCounters);
+ updatePairMetricsCounters(phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoods().toString(),phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoods().toString());
}
//Report violation if set so
//TODO: ADAPT FOR PAIRS TOO!!
- if(isMV && mvFile != null)
+ if(mvCount>0 && mvFile != null)
mvFile.println(mvfLine);
}
@@ -759,6 +819,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
metricsCounters.put(NUM_PAIR_GENOTYPES_PHASED,0);
metricsCounters.put(NUM_PAIR_HET_HET,0);
metricsCounters.put(NUM_PAIR_VIOLATIONS,0);
+ metricsCounters.put(NUM_TRIO_DOUBLE_VIOLATIONS,0);
return metricsCounters;
}
@@ -781,6 +842,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
sum.put(NUM_PAIR_GENOTYPES_PHASED,value.get(NUM_PAIR_GENOTYPES_PHASED)+sum.get(NUM_PAIR_GENOTYPES_PHASED));
sum.put(NUM_PAIR_HET_HET,value.get(NUM_PAIR_HET_HET)+sum.get(NUM_PAIR_HET_HET));
sum.put(NUM_PAIR_VIOLATIONS,value.get(NUM_PAIR_VIOLATIONS)+sum.get(NUM_PAIR_VIOLATIONS));
+ sum.put(NUM_TRIO_DOUBLE_VIOLATIONS,value.get(NUM_TRIO_DOUBLE_VIOLATIONS)+sum.get(NUM_TRIO_DOUBLE_VIOLATIONS));
return sum;
}
@@ -795,7 +857,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
logger.info("Number of trio-genotypes containing no call(s): " + result.get(NUM_TRIO_GENOTYPES_NOCALL));
logger.info("Number of trio-genotypes phased: " + result.get(NUM_TRIO_GENOTYPES_PHASED));
logger.info("Number of resulting Het/Het/Het trios: " + result.get(NUM_TRIO_HET_HET_HET));
- logger.info("Number of remaining mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of remaining single mendelian violations in trios: " + result.get(NUM_TRIO_VIOLATIONS));
+ logger.info("Number of remaining double mendelian violations in trios: " + result.get(NUM_TRIO_DOUBLE_VIOLATIONS));
logger.info("Number of complete pair-genotypes: " + result.get(NUM_PAIR_GENOTYPES_CALLED));
logger.info("Number of pair-genotypes containing no call(s): " + result.get(NUM_PAIR_GENOTYPES_NOCALL));
logger.info("Number of pair-genotypes phased: " + result.get(NUM_PAIR_GENOTYPES_PHASED));
From 893787de535fde77b56d02e5c006400e021f76c1 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 3 Nov 2011 13:04:11 +0100
Subject: [PATCH 015/380] Functions getAsMap and getNegLog10GQ now handle
missing genotype case.
---
.../sting/utils/variantcontext/GenotypeLikelihoods.java | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
index 47c93bb1b..8c8e4f257 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java
@@ -101,9 +101,12 @@ public class GenotypeLikelihoods {
}
//Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
+ //Returns null in case of missing likelihoods
public EnumMap getAsMap(boolean normalizeFromLog10){
//Make sure that the log10likelihoods are set
double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
+ if(likelihoods == null)
+ return null;
EnumMap likelihoodsMap = new EnumMap(Genotype.Type.class);
likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
@@ -112,10 +115,13 @@ public class GenotypeLikelihoods {
}
//Return the neg log10 Genotype Quality (GQ) for the given genotype
+ //Returns Double.NEGATIVE_INFINITY in case of missing genotype
public double getNegLog10GQ(Genotype.Type genotype){
double qual = Double.NEGATIVE_INFINITY;
EnumMap likelihoods = getAsMap(false);
+ if(likelihoods == null)
+ return qual;
for(Map.Entry likelihood : likelihoods.entrySet()){
if(likelihood.getKey() == genotype)
continue;
From 385a6abec144fc72c726f88216cdcd9c3a4fa435 Mon Sep 17 00:00:00 2001
From: Laurent Francioli
Date: Thu, 3 Nov 2011 13:04:53 +0100
Subject: [PATCH 016/380] Fixed a bug that wrongly swapped the mother and
father genotypes in case the child genotype missing.
---
.../gatk/walkers/phasing/PhaseByTransmission.java | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
index 3c39b58d3..244527212 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java
@@ -547,10 +547,15 @@ public class PhaseByTransmission extends RodWalker, HashMa
int parentsCalled = 0;
Map firstParentLikelihoods;
Map secondParentLikelihoods;
+ ArrayList bestFirstParentGenotype = new ArrayList();
+ ArrayList bestSecondParentGenotype = new ArrayList();
+ ArrayList bestChildGenotype = new ArrayList();
Genotype.Type pairSecondParentGenotype = null;
if(mother == null || !mother.isCalled()){
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
+ bestFirstParentGenotype.add(getTypeSafeNull(father));
+ bestSecondParentGenotype.add(getTypeSafeNull(mother));
pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
if(father != null && father.isCalled())
parentsCalled = 1;
@@ -558,6 +563,8 @@ public class PhaseByTransmission extends RodWalker, HashMa
else{
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
+ bestFirstParentGenotype.add(getTypeSafeNull(mother));
+ bestSecondParentGenotype.add(getTypeSafeNull(father));
if(father == null || !father.isCalled()){
parentsCalled = 1;
pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
@@ -566,6 +573,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
}
}
Map childLikelihoods = getLikelihoodsAsMapSafeNull(child);
+ bestChildGenotype.add(getTypeSafeNull(child));
//Prior vars
double bestConfigurationLikelihood = 0.0;
@@ -574,13 +582,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
ArrayList bestMVCount = new ArrayList();
bestMVCount.add(0);
- ArrayList bestFirstParentGenotype = new ArrayList();
- ArrayList bestSecondParentGenotype = new ArrayList();
- ArrayList bestChildGenotype = new ArrayList();
- bestFirstParentGenotype.add(getTypeSafeNull(mother));
- bestSecondParentGenotype.add(getTypeSafeNull(father));
- bestChildGenotype.add(getTypeSafeNull(child));
-
//Get the most likely combination
//Only check for most likely combination if at least a parent and the child have genotypes
if(child.isCalled() && parentsCalled > 0){
From 611a395783c861f70b25157368045ffb257e762b Mon Sep 17 00:00:00 2001
From: Ryan Poplin
Date: Sat, 5 Nov 2011 12:18:56 -0400
Subject: [PATCH 024/380] Now properly extending candidate haplotypes with
bases from the reference context instead of filling with padding bases.
Functionality in the private Haplotype class is no longer necessary so
removing it. No need to have four different Haplotype classes in the GATK.
---
.../gatk/walkers/annotator/HaplotypeScore.java | 10 +++++-----
.../indels/HaplotypeIndelErrorModel.java | 18 +++++++++---------
.../walkers/indels/PairHMMIndelErrorModel.java | 4 ++--
.../broadinstitute/sting/utils/Haplotype.java | 12 ++++++++----
4 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
index c142109fa..803bf514c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
@@ -180,12 +180,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
final Haplotype haplotype1 = consensusHaplotypeQueue.poll();
Listhlist = new ArrayList();
- hlist.add(new Haplotype(haplotype1.getBasesAsBytes(), 60));
+ hlist.add(new Haplotype(haplotype1.getBases(), 60));
for (int k=1; k < haplotypesToCompute; k++) {
Haplotype haplotype2 = consensusHaplotypeQueue.poll();
if(haplotype2 == null ) { haplotype2 = haplotype1; } // Sometimes only the reference haplotype can be found
- hlist.add(new Haplotype(haplotype2.getBasesAsBytes(), 20));
+ hlist.add(new Haplotype(haplotype2.getBases(), 20));
}
return hlist;
} else
@@ -229,8 +229,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
}
private Haplotype getConsensusHaplotype(final Haplotype haplotypeA, final Haplotype haplotypeB) {
- final byte[] a = haplotypeA.getBasesAsBytes();
- final byte[] b = haplotypeB.getBasesAsBytes();
+ final byte[] a = haplotypeA.getBases();
+ final byte[] b = haplotypeB.getBases();
if (a.length != b.length) {
throw new ReviewedStingException("Haplotypes a and b must be of same length");
@@ -313,7 +313,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
// actually be a miscall in a matching direction, which would happen at a e / 3 rate. If b != c, then
// the chance that it is actually a mismatch is 1 - e, since any of the other 3 options would be a mismatch.
// so the probability-weighted mismatch rate is sum_i ( matched ? e_i / 3 : 1 - e_i ) for i = 1 ... n
- final byte[] haplotypeBases = haplotype.getBasesAsBytes();
+ final byte[] haplotypeBases = haplotype.getBases();
final SAMRecord read = p.getRead();
byte[] readBases = read.getReadBases();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
index 3b3f54b05..200a250f2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
@@ -205,7 +205,7 @@ public class HaplotypeIndelErrorModel {
byte haplotypeBase;
if (haplotypeIndex < RIGHT_ALIGN_INDEX)
- haplotypeBase = haplotype.getBasesAsBytes()[haplotypeIndex];
+ haplotypeBase = haplotype.getBases()[haplotypeIndex];
else
haplotypeBase = (byte)0; // dummy
@@ -217,7 +217,7 @@ public class HaplotypeIndelErrorModel {
if (readQual > 3)
pRead += pBaseRead;
haplotypeIndex++;
- if (haplotypeIndex >= haplotype.getBasesAsBytes().length)
+ if (haplotypeIndex >= haplotype.getBases().length)
haplotypeIndex = RIGHT_ALIGN_INDEX;
//System.out.format("H:%c R:%c RQ:%d HI:%d %4.5f %4.5f\n", haplotypeBase, readBase, (int)readQual, haplotypeIndex, pBaseRead, pRead);
}
@@ -227,8 +227,8 @@ public class HaplotypeIndelErrorModel {
System.out.println(read.getReadName());
System.out.print("Haplotype:");
- for (int k=0; k LEFT_ALIGN_INDEX && indX < RIGHT_ALIGN_INDEX)
- haplotypeBase = haplotype.getBasesAsBytes()[indX-1];
+ haplotypeBase = haplotype.getBases()[indX-1];
else
haplotypeBase = readBase;
@@ -296,8 +296,8 @@ public class HaplotypeIndelErrorModel {
System.out.println(read.getReadName());
System.out.print("Haplotype:");
- for (int k=0; k
Date: Sat, 5 Nov 2011 23:53:15 -0400
Subject: [PATCH 025/380] Adding integration test to cover using expressions
with IDs (-E foo.ID)
---
.../annotator/VariantAnnotatorIntegrationTest.java | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index 8e887c32a..189f643d4 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -124,6 +124,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("using expression", spec);
}
+ @Test
+ public void testUsingExpressionWithID() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
+ Arrays.asList("4a6f0675242f685e9072c1da5ad9e715"));
+ executeTest("using expression with ID", spec);
+ }
+
@Test
public void testTabixAnnotations() {
final String MD5 = "13269d5a2e16f06fd755cc0fb9271acf";
From a12bc63e5c2b99e52baa9eebe85667a4d1abec3d Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Sat, 5 Nov 2011 23:54:28 -0400
Subject: [PATCH 026/380] Get rid of support for bams without sample
information in the read groups. This hidden option wasn't being used anyways
because it wasn't hooked up properly in the AlignmentContext.
---
.../gatk/contexts/AlignmentContextUtils.java | 2 +-
.../walkers/annotator/VariantAnnotator.java | 13 ++-----------
.../walkers/genotyper/UGCalcLikelihoods.java | 8 +-------
.../genotyper/UnifiedArgumentCollection.java | 6 ------
.../walkers/genotyper/UnifiedGenotyper.java | 7 +------
.../genotyper/UnifiedGenotyperEngine.java | 17 ++++++-----------
6 files changed, 11 insertions(+), 42 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
index 4e75f3ddb..d589f9029 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
@@ -131,7 +131,7 @@ public class AlignmentContextUtils {
}
}
- public static Map splitContextBySampleName(ReadBackedPileup pileup, String assumedSingleSample) {
+ public static Map splitContextBySampleName(ReadBackedPileup pileup) {
return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup));
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index c9937f3d6..8f4bc0abd 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -164,10 +164,6 @@ public class VariantAnnotator extends RodWalker implements Ann
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
protected Boolean LIST = false;
- @Hidden
- @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
- protected String ASSUME_SINGLE_SAMPLE = null;
-
@Hidden
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
@@ -213,11 +209,6 @@ public class VariantAnnotator extends RodWalker implements Ann
List rodName = Arrays.asList(variantCollection.variants.getName());
Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
- // if there are no valid samples, warn the user
- if ( samples.size() == 0 ) {
- logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
- }
-
if ( USE_ALL_ANNOTATIONS )
engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit());
else
@@ -301,9 +292,9 @@ public class VariantAnnotator extends RodWalker implements Ann
Map stratifiedContexts;
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
if ( ! context.hasExtendedEventPileup() ) {
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup(), ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
} else {
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup(), ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup());
}
if ( stratifiedContexts != null ) {
annotatedVCs = new ArrayList(VCs.size());
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java
index 503d87cbe..c7e577393 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java
@@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashSet;
import java.util.Set;
-import java.util.TreeSet;
/**
@@ -71,12 +70,7 @@ public class UGCalcLikelihoods extends LocusWalker
public void initialize() {
// get all of the unique sample names
- // if we're supposed to assume a single sample, do so
- Set samples = new TreeSet();
- if ( UAC.ASSUME_SINGLE_SAMPLE != null )
- samples.add(UAC.ASSUME_SINGLE_SAMPLE);
- else
- samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+ Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index 07d9892a1..62218416d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -96,11 +96,6 @@ public class UnifiedArgumentCollection {
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false)
public RodBinding alleles;
- // control the error modes
- @Hidden
- @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
- public String ASSUME_SINGLE_SAMPLE = null;
-
/**
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
* is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
@@ -170,7 +165,6 @@ public class UnifiedArgumentCollection {
uac.GenotypingMode = GenotypingMode;
uac.OutputMode = OutputMode;
uac.COMPUTE_SLOD = COMPUTE_SLOD;
- uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index 72dc217e1..bdd4e2c65 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -206,12 +206,7 @@ public class UnifiedGenotyper extends LocusWalker samples = new TreeSet();
- if ( UAC.ASSUME_SINGLE_SAMPLE != null )
- samples.add(UAC.ASSUME_SINGLE_SAMPLE);
- else
- samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+ Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the verbose writer
if ( verboseWriter != null )
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
index 993a434ac..cee128a6a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@@ -106,12 +106,7 @@ public class UnifiedGenotyperEngine {
// ---------------------------------------------------------------------------------------------------------
@Requires({"toolkit != null", "UAC != null"})
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
- this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null,
- // get the number of samples
- // if we're supposed to assume a single sample, do so
- UAC.ASSUME_SINGLE_SAMPLE != null ?
- new TreeSet(Arrays.asList(UAC.ASSUME_SINGLE_SAMPLE)) :
- SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()));
+ this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()));
}
@Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"})
@@ -253,7 +248,7 @@ public class UnifiedGenotyperEngine {
pileup = rawContext.getExtendedEventPileup();
else if (rawContext.hasBasePileup())
pileup = rawContext.getBasePileup();
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
}
@@ -435,7 +430,7 @@ public class UnifiedGenotyperEngine {
pileup = rawContext.getExtendedEventPileup();
else if (rawContext.hasBasePileup())
pileup = rawContext.getBasePileup();
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
}
@@ -569,7 +564,7 @@ public class UnifiedGenotyperEngine {
return null;
// stratify the AlignmentContext and cut by sample
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
} else {
@@ -586,12 +581,12 @@ public class UnifiedGenotyperEngine {
return null;
// stratify the AlignmentContext and cut by sample
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
}
} else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) {
// stratify the AlignmentContext and cut by sample
- stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup());
if( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) {
int numDeletions = 0;
From 3517489a22a3af3cad192a7c5378930c4f94bd26 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Sun, 6 Nov 2011 01:07:49 -0400
Subject: [PATCH 027/380] Better --sample selection integration test for VE.
The previous one would return true even if --sample was not working at all.
---
.../walkers/varianteval/VariantEvalIntegrationTest.java | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
index cd2493dde..cbfe0ceab 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@@ -9,7 +9,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval";
private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf";
private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf";
- private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf";
+ private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.NA12045.vcf";
private static String cmdRoot = "-T VariantEval" +
" -R " + b36KGReference;
@@ -359,7 +359,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
@Test
public void testPerSampleAndSubsettedSampleHaveSameResults() {
- String md5 = "b0565ac61b2860248e4abd478a177b5e";
+ String md5 = "7425ca5c439afd7bb33ed5cfea02c2b3";
WalkerTestSpec spec = new WalkerTestSpec(
buildCommandLine(
@@ -369,7 +369,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"--eval " + fundamentalTestSNPsVCF,
"-noEV",
"-EV CompOverlap",
- "-sn HG00625",
+ "-sn NA12045",
"-noST",
"-L " + fundamentalTestSNPsVCF,
"-o %s"
From c1986b63356db6e816ce864164e0b1b3b59d2ab5 Mon Sep 17 00:00:00 2001
From: Eric Banks
Date: Mon, 7 Nov 2011 11:06:19 -0500
Subject: [PATCH 031/380] Add notes to the GATKdocs as to when a particular
annotation can/cannot be calculated.
---
.../sting/gatk/walkers/annotator/BaseQualityRankSumTest.java | 3 ++-
.../sting/gatk/walkers/annotator/FisherStrand.java | 3 ++-
.../sting/gatk/walkers/annotator/HaplotypeScore.java | 2 ++
.../sting/gatk/walkers/annotator/InbreedingCoeff.java | 3 ++-
.../gatk/walkers/annotator/MappingQualityRankSumTest.java | 1 +
.../sting/gatk/walkers/annotator/QualByDepth.java | 3 ++-
.../sting/gatk/walkers/annotator/ReadPosRankSumTest.java | 1 +
7 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
index 6cab6d95f..312b505ec 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
@@ -14,7 +14,8 @@ import java.util.List;
/**
- * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele)
+ * The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
+ * Note that the base quality rank sum test can not be calculated for homozygous sites.
*/
public class BaseQualityRankSumTest extends RankSumTest {
public List getKeyNames() { return Arrays.asList("BaseQRankSum"); }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
index 2d1d1978c..c4025a25c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
@@ -46,7 +46,8 @@ import java.util.*;
/**
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
* being seen on only the forward or only the reverse strand) in the reads? More bias is
- * indicative of false positive calls.
+ * indicative of false positive calls. Note that the fisher strand test may not be
+ * calculated for certain complex indel cases or for multi-allelic sites.
*/
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation {
private static final String FS = "FS";
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
index 803bf514c..94b0636f4 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
@@ -52,6 +52,8 @@ import java.util.*;
/**
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
+ * Note that the Haplotype Score is only calculated for sites with read coverage; also, for SNPs, the
+ * site must be bi-allelic.
*/
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
private final static boolean DEBUG = false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
index a14007147..8728e5aa4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
@@ -23,7 +23,8 @@ import java.util.Map;
*
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
- * more information.
+ * more information. Note that the Inbreeding Coefficient will not be calculated for files
+ * with fewer than a minimum (generally 10) number of samples.
*/
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
index 157c615d7..9857c339f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
@@ -16,6 +16,7 @@ import java.util.List;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
+ * Note that the mapping quality rank sum test can not be calculated for homozygous sites.
*/
public class MappingQualityRankSumTest extends RankSumTest {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
index ffc852903..b942d9817 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
@@ -19,7 +19,8 @@ import java.util.Map;
/**
* Variant confidence (given as (AB+BB)/AA from the PLs) / unfiltered depth.
*
- * Low scores are indicative of false positive calls and artifacts.
+ * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing
+ * reads associated with the samples with polymorphic genotypes.
*/
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
index 27a9306d4..d762af428 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
@@ -20,6 +20,7 @@ import java.util.List;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
+ * Note that the read position rank sum test can not be calculated for homozygous sites.
*/
public class ReadPosRankSumTest extends RankSumTest {
From 759f4fe6b85c7341daf5a90af78d74f806ec330e Mon Sep 17 00:00:00 2001
From: Eric Banks