From d896a4a9d362adaf04ea803227ff21d77811fe74 Mon Sep 17 00:00:00 2001 From: kiran Date: Thu, 19 May 2011 21:27:37 +0000 Subject: [PATCH] Given genotypes for a trio, phases child by transmission. Computes probability that the determined phase is correct given that the genotypes for mom and dad are correct (useful if you want to use this to compare phasing accuracy, but want to break that comparison down by phasing confidence in the truth set). Optionally filters out sites where the phasing is indeterminate. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5824 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/phasing/PhaseByTransmission.java | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java new file mode 100755 index 000000000..7226b38a5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java @@ -0,0 +1,172 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.phasing; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.util.*; + +/** + * Given genotypes for a trio, phases child by transmission. Computes probability that the determined phase is correct + * given that the genotypes for mom and dad are correct (useful if you want to use this to compare phasing accuracy, but + * want to break that comparison down by phasing confidence in the truth set). Optionally filters out sites where the + * phasing is indeterminate. + */ +public class PhaseByTransmission extends RodWalker { + @Argument(shortName="f", fullName="familyPattern", required=true, doc="Pattern for the family structure (usage: mom+dad=child)") + public String familyStr = null; + + @Argument(shortName="filter", fullName="filterPhaseIndeterminateSites", required=false, doc="Filters out sites that are phase-indeterminate") + public Boolean filterPhaseIndeterminateSites = false; + + @Output + protected VCFWriter vcfWriter = null; + + private String SAMPLE_NAME_MOM; + private String SAMPLE_NAME_DAD; + private String SAMPLE_NAME_CHILD; + + /** + * Parse the familial relationship specification, and initialize VCF writer + */ + public void initialize() { + String[] pieces = familyStr.split("[\\+\\=]"); + + SAMPLE_NAME_MOM = pieces[0]; + SAMPLE_NAME_DAD = pieces[1]; + SAMPLE_NAME_CHILD = pieces[2]; + + Set samples = new HashSet(); + samples.add(SAMPLE_NAME_MOM); + samples.add(SAMPLE_NAME_DAD); + samples.add(SAMPLE_NAME_CHILD); + + Set headerLines = new HashSet(); + headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); + headerLines.add(new VCFInfoHeaderLine("TP", 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + } + + /** + * Given genotypes for mom, dad, and child, determine the phase for the child by examining the possible transmitted alleles. + * + * @param mom mom's genotype + * @param dad dad's genotype + * @param child child's genotype + * @return phased version of child's genotype, or null in the case that the phasing cannot be determined + */ + private Genotype getPhasedChildGenotype(Genotype mom, Genotype dad, Genotype child) { + List momAlleles = mom.getAlleles(); + List dadAlleles = dad.getAlleles(); + + Set possiblePhasedChildGenotypes = new HashSet(); + + double transmissionProb = QualityUtils.qualToProb(mom.getNegLog10PError())*QualityUtils.qualToProb(dad.getNegLog10PError()); + + Map attributes = new HashMap(); + attributes.putAll(child.getAttributes()); + attributes.put("TP", transmissionProb); + + for (Allele momAllele : momAlleles) { + for (Allele dadAllele : dadAlleles) { + if (momAllele.isCalled() && dadAllele.isCalled()) { + List possiblePhasedChildAlleles = new ArrayList(); + possiblePhasedChildAlleles.add(momAllele); + possiblePhasedChildAlleles.add(dadAllele); + + Genotype possiblePhasedChildGenotype = new Genotype(child.getSampleName(), possiblePhasedChildAlleles, child.getNegLog10PError(), null, attributes, true); + possiblePhasedChildGenotypes.add(possiblePhasedChildGenotype); + } + } + } + + for (Genotype possiblePhasedChildGenotype : possiblePhasedChildGenotypes) { + if (child.sameGenotype(possiblePhasedChildGenotype, true)) { + return possiblePhasedChildGenotype; + } + } + + return null; + } + + /** + * For each variant in the file, determine the phasing for the child and replace the child's genotype with the trio's genotype + * + * @param tracker the reference meta-data tracker + * @param ref the reference context + * @param context the alignment context + * @return null + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (tracker != null) { + Collection vcs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, true); + + for (VariantContext vc : vcs) { + Genotype mom = vc.getGenotype(SAMPLE_NAME_MOM); + Genotype dad = vc.getGenotype(SAMPLE_NAME_DAD); + Genotype child = vc.getGenotype(SAMPLE_NAME_CHILD); + + Genotype childPhased = getPhasedChildGenotype(mom, dad, child); + + Collection phasedGenotypes = new ArrayList(); + Collection alleles = vc.getAlleles(); + Set filters = new HashSet(); + Map attributes = new HashMap(); + + phasedGenotypes.add(mom); + phasedGenotypes.add(dad); + + filters.addAll(vc.getFilters()); + attributes.putAll(vc.getAttributes()); + + if (childPhased == null) { + phasedGenotypes.add(child); + + if (filterPhaseIndeterminateSites) { + filters.add("PHASE_INDETERMINATE"); + } + } else { + phasedGenotypes.add(childPhased); + } + + VariantContext newvc = new VariantContext("PhaseByTransmission", ref.getLocus().getContig(), vc.getStart(), vc.getStart(), alleles, phasedGenotypes, vc.getNegLog10PError(), filters, attributes); + + vcfWriter.add(newvc, ref.getBase()); + } + } + + return null; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + @Override + public Integer reduceInit() { + return null; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return null; + } +}