From f8f37a786d5560e462fec33f4ccc12d387dab7b3 Mon Sep 17 00:00:00 2001 From: kiran Date: Mon, 23 May 2011 17:50:59 +0000 Subject: [PATCH] Now emits much more informative filter names and includes all of other the proper VCF header details (filter description line, tag definitions, etc.). Currently rewriting the way the transmission probability is calculated. This is shaping up to be a lovely little piece of code... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5849 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/phasing/PhaseByTransmission.java | 76 ++++++++++++++----- 1 file changed, 57 insertions(+), 19 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java index 3f193733b..ef73c89f4 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/phasing/PhaseByTransmission.java @@ -8,9 +8,12 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.util.*; @@ -25,11 +28,8 @@ public class PhaseByTransmission extends RodWalker { @Argument(shortName="f", fullName="familyPattern", required=true, doc="Pattern for the family structure (usage: mom+dad=child)") public String familyStr = null; - @Argument(shortName="filter_ind", fullName="filterPhaseIndeterminateSites", required=false, doc="Filters out sites that are phase-indeterminate") - public Boolean filterPhaseIndeterminateSites = false; - - @Argument(shortName="filter_amb", fullName="filterAmbiguousAlleleOriginSites", required=false, doc="Filters out sites where the parental origin of the alleles is ambiguous (i.e. triple-het situations)") - public Boolean filterAmbiguousAlleleOriginSites = false; + @Argument(shortName="nofilters", fullName="disableFilters", required=false, doc="Disable filters for sites where the phase can't be determined, where the parental origin of the alleles is ambiguous (i.e. everyone is heterozygous), or Mendelian violations") + public Boolean noFilters = false; @Output protected VCFWriter vcfWriter = null; @@ -38,8 +38,16 @@ public class PhaseByTransmission extends RodWalker { private String SAMPLE_NAME_DAD; private String SAMPLE_NAME_CHILD; - private class AmbiguousAlleleOriginException extends Exception {}; - private class NoAppropriatePhasedGenotypeException extends Exception {}; + private class AmbiguousAlleleOriginException extends Exception {} + private class InsufficientInfoToPhaseGenotypeException extends Exception {} + private class MendelianViolationException extends Exception {} + + private final String ROD_NAME = "variant"; + private final String AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME = "AmbiguousAlleleOrigin"; + private final String PHASE_INDETERMINATE_FILTER_NAME = "PhaseIndeterminate"; + private final String MENDELIAN_VIOLATION_FILTER_NAME = "MendelianViolation"; + private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; + private final String SOURCE_NAME = "PhaseByTransmission"; /** * Parse the familial relationship specification, and initialize VCF writer @@ -51,6 +59,24 @@ public class PhaseByTransmission extends RodWalker { SAMPLE_NAME_DAD = pieces[1]; SAMPLE_NAME_CHILD = pieces[2]; + ArrayList rodNames = new ArrayList(); + rodNames.add(ROD_NAME); + + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + + if (vcfSamples.size() != 3) { + throw new UserException("File to phase by transmission contains more than three samples. This walker only" + + "accepts VCFs with three samples, so that the meaning of the applied filters is" + + "unambiguous."); + } + + if (!vcfSamples.contains(SAMPLE_NAME_MOM) || !vcfSamples.contains(SAMPLE_NAME_DAD) || !vcfSamples.contains(SAMPLE_NAME_CHILD)) { + throw new UserException("One or more of the samples specified in the familyPattern argument is not present" + + "in this file. Please supply a VCF file that contains only three samples: the" + + "mother, the father, and the child"); + } + Set samples = new HashSet(); samples.add(SAMPLE_NAME_MOM); samples.add(SAMPLE_NAME_DAD); @@ -58,7 +84,10 @@ public class PhaseByTransmission extends RodWalker { Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - headerLines.add(new VCFInfoHeaderLine("TP", 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFFilterHeaderLine(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME, "The parental origin of each of the child's allele cannot be determined (ie everyone is heterozygous)")); + headerLines.add(new VCFFilterHeaderLine(PHASE_INDETERMINATE_FILTER_NAME, "The phase of the child's genotype cannot be determined (ie someone is a no-call)")); + headerLines.add(new VCFFilterHeaderLine(MENDELIAN_VIOLATION_FILTER_NAME, "No combination of the parents' alleles can yield the child's genotype (ie a possible Mendelian violation)")); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); } @@ -70,9 +99,14 @@ public class PhaseByTransmission extends RodWalker { * @param child child's genotype * @return phased version of child's genotype * @throws AmbiguousAlleleOriginException if the parentage of the alleles can't be determined (i.e. a triple-het situation), - * @throws NoAppropriatePhasedGenotypeException if there is insufficient information to determine the phase + * @throws InsufficientInfoToPhaseGenotypeException if there is insufficient information to determine the phase + * @throws MendelianViolationException if no combination of the parental alleles can yield the child's genotype */ - private Genotype getPhasedChildGenotype(Genotype mom, Genotype dad, Genotype child) throws AmbiguousAlleleOriginException, NoAppropriatePhasedGenotypeException { + private Genotype getPhasedChildGenotype(Genotype mom, Genotype dad, Genotype child) throws AmbiguousAlleleOriginException, InsufficientInfoToPhaseGenotypeException, MendelianViolationException { + if (mom.isNoCall() || dad.isNoCall() || child.isNoCall()) { + throw new InsufficientInfoToPhaseGenotypeException(); + } + Set momAlleles = new HashSet(); momAlleles.addAll(mom.getAlleles()); @@ -83,7 +117,7 @@ public class PhaseByTransmission extends RodWalker { Map attributes = new HashMap(); attributes.putAll(child.getAttributes()); - attributes.put("TP", transmissionProb); + attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, transmissionProb); Set possiblePhasedChildGenotypes = new HashSet(); @@ -124,7 +158,7 @@ public class PhaseByTransmission extends RodWalker { } } - throw new NoAppropriatePhasedGenotypeException(); + throw new MendelianViolationException(); } /** @@ -138,7 +172,7 @@ public class PhaseByTransmission extends RodWalker { @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (tracker != null) { - Collection vcs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, true); + Collection vcs = tracker.getVariantContexts(ref, ROD_NAME, null, context.getLocation(), true, true); for (VariantContext vc : vcs) { Genotype mom = vc.getGenotype(SAMPLE_NAME_MOM); @@ -151,12 +185,16 @@ public class PhaseByTransmission extends RodWalker { try { child = getPhasedChildGenotype(mom, dad, child); } catch (AmbiguousAlleleOriginException e) { - if (filterAmbiguousAlleleOriginSites) { - filters.add("AMBIGUOUS_ALLELE_ORIGIN"); + if (!noFilters) { + filters.add(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME); } - } catch (NoAppropriatePhasedGenotypeException e) { - if (filterPhaseIndeterminateSites) { - filters.add("PHASE_INDETERMINATE"); + } catch (InsufficientInfoToPhaseGenotypeException e) { + if (!noFilters) { + filters.add(PHASE_INDETERMINATE_FILTER_NAME); + } + } catch (MendelianViolationException e) { + if (!noFilters) { + filters.add(MENDELIAN_VIOLATION_FILTER_NAME); } } @@ -170,7 +208,7 @@ public class PhaseByTransmission extends RodWalker { phasedGenotypes.add(dad); phasedGenotypes.add(child); - VariantContext newvc = new VariantContext("PhaseByTransmission", ref.getLocus().getContig(), vc.getStart(), vc.getStart(), alleles, phasedGenotypes, vc.getNegLog10PError(), filters, attributes); + VariantContext newvc = new VariantContext(SOURCE_NAME, ref.getLocus().getContig(), vc.getStart(), vc.getStart(), alleles, phasedGenotypes, vc.getNegLog10PError(), filters, attributes); vcfWriter.add(newvc, ref.getBase()); }