From 9c728979cc3267521770eafa550f9482f1440bbd Mon Sep 17 00:00:00 2001 From: fromer Date: Thu, 27 Jan 2011 20:17:48 +0000 Subject: [PATCH] In order to calculate haplotype lengths of trio+RBP, I implemented a simple trio phaser as an option to ComparePhasingToTrioPhasingNoRecombination, which already decides if the trio could theoretically phase git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5099 348d0f76-0448-11de-a6fe-93d51630548a --- ...ingToTrioPhasingNoRecombinationWalker.java | 282 ++++++++++++++++-- 1 file changed, 255 insertions(+), 27 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java index 70b521044..a5beda8d9 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/phasing/ComparePhasingToTrioPhasingNoRecombinationWalker.java @@ -27,6 +27,10 @@ package org.broadinstitute.sting.oneoffprojects.phasing; import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; +import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.vcf.VCFWriter; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -34,14 +38,17 @@ import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; +import org.broadinstitute.sting.gatk.walkers.phasing.WriteVCF; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.PrintStream; -import java.util.LinkedList; -import java.util.Map; -import java.util.TreeSet; +import java.util.*; + +import static org.broadinstitute.sting.utils.vcf.VCFUtils.getVCFHeadersFromRods; /** * Walks along all variant ROD loci and verifies the phasing from the reads for user-defined pairs of sites. @@ -52,15 +59,20 @@ import java.util.TreeSet; @ReadFilters({ZeroMappingQualityReadFilter.class}) // Filter out all reads with zero mapping quality -public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker { +public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker { public final static String TRIO_ROD_NAME = "trio"; public final static String PHASING_ROD_NAME = "phasing"; private final static int NUM_IN_TRIO = 3; + private final static int DIPLOID = 2; + @Output protected PrintStream out; + @Argument(fullName = "trioAugmentedPhasing", shortName = "trioAugmentedPhasing", doc = "File to which trio-phased variants should be written", required = false) + protected VCFWriter writer = null; + private String phasingSample = null; private enum TrioStatus { @@ -68,18 +80,38 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< } private GenomeLoc prevLoc = null; + private VariantContext prevTrioVc = null; private TrioStatus prevTrioStatus = TrioStatus.MISSING; + private Genotype prevPhasingGt = null; + public void initialize() { + initializeVcfWriter(); + } + + private void initializeVcfWriter() { + if (writer == null) + return; + + // setup the header fields: + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + List rodNames = new LinkedList(); + rodNames.add(PHASING_ROD_NAME); + Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); + Set samples = new TreeSet(rodNameToHeader.get(PHASING_ROD_NAME).getGenotypeSamples()); + writer.writeHeader(new VCFHeader(hInfo, samples)); } public boolean generateExtendedEvents() { return false; } - public Integer reduceInit() { - return 0; + public CompareToTrioPhasingStats reduceInit() { + return new CompareToTrioPhasingStats(); } /** @@ -88,14 +120,18 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< * @param context the context for the given locus * @return statistics of and list of all phased VariantContexts and their base pileup that have gone out of cacheWindow range. */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public CompareResult map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (tracker == null) return null; GenomeLoc curLoc = ref.getLocus(); VariantContext phasingVc = tracker.getVariantContext(ref, PHASING_ROD_NAME, curLoc); + + CompareToTrioPhasingStats stats = new CompareToTrioPhasingStats(); + CompareResult result = new CompareResult(phasingVc, stats); + if (phasingVc == null || phasingVc.isFiltered()) - return null; + return result; Map phasingSampleToGt = phasingVc.getGenotypes(); if (phasingSampleToGt.size() != 1) @@ -106,24 +142,24 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< phasingSample = sample; if (!sample.equals(phasingSample)) throw new UserException("Must provide EXACTLY one sample!"); - Genotype phasingGt = phasingSampGt.getValue(); - if (!phasingGt.isHet()) - return null; + Genotype curPhasingGt = phasingSampGt.getValue(); + if (!curPhasingGt.isHet()) + return result; - VariantContext trioVc = tracker.getVariantContext(ref, TRIO_ROD_NAME, curLoc); - boolean useTrioVc = (trioVc != null && !trioVc.isFiltered()); + VariantContext curTrioVc = tracker.getVariantContext(ref, TRIO_ROD_NAME, curLoc); + boolean useTrioVc = (curTrioVc != null && !curTrioVc.isFiltered()); - Genotype sampleGtInTrio = null; + Genotype sampleCurGtInTrio = null; if (useTrioVc) { - sampleGtInTrio = trioVc.getGenotype(phasingSample); + sampleCurGtInTrio = curTrioVc.getGenotype(phasingSample); - if (trioVc.getNSamples() > NUM_IN_TRIO || sampleGtInTrio == null) + if (curTrioVc.getNSamples() > NUM_IN_TRIO || sampleCurGtInTrio == null) throw new UserException("Must provide trio data for sample: " + phasingSample); - if (!new TreeSet(phasingGt.getAlleles()).equals(new TreeSet(sampleGtInTrio.getAlleles()))) { + if (!new TreeSet(curPhasingGt.getAlleles()).equals(new TreeSet(sampleCurGtInTrio.getAlleles()))) { logger.warn("Locus " + curLoc + " breaks phase, since " + PHASING_ROD_NAME + " and " + TRIO_ROD_NAME + " tracks have different genotypes for " + phasingSample + "!"); prevLoc = null; - return null; + return result; } } @@ -131,10 +167,10 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< int processed = 1; TrioStatus currentTrioStatus = TrioStatus.MISSING; - if (useTrioVc && trioVc.getNSamples() == NUM_IN_TRIO) { + if (useTrioVc && curTrioVc.getNSamples() == NUM_IN_TRIO) { boolean allHet = true; for (int i = 0; i < NUM_IN_TRIO; i++) { - if (!trioVc.getGenotype(i).isHet()) { + if (!curTrioVc.getGenotype(i).isHet()) { allHet = false; break; } @@ -148,6 +184,8 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< if (prevLoc != null && curLoc.onSameContig(prevLoc)) { String trioPhaseStatus; + stats.comparedSites++; + String addToOutput = ""; if (prevTrioStatus == TrioStatus.TRIPLE_HET || currentTrioStatus == TrioStatus.TRIPLE_HET) { trioPhaseStatus = "Het3"; @@ -160,28 +198,218 @@ public class ComparePhasingToTrioPhasingNoRecombinationWalker extends RodWalker< throw new ReviewedStingException("LOGICAL error: prevTrioStatus != TrioStatus.PRESENT || currentTrioStatus != TrioStatus.PRESENT"); trioPhaseStatus = "trio_phased"; + stats.trioPhaseableSites++; + + if (writer != null) { // Phase the genotypes using the trio information: + String parent1 = null; + String parent2 = null; + for (Map.Entry trioEntry : curTrioVc.getGenotypes().entrySet()) { + String trioSample = trioEntry.getKey(); + if (trioEntry.getValue().getPloidy() != DIPLOID) + throw new UserException("Each sample in trio must be diploid!"); + if (trioSample.equals(phasingSample)) + continue; + + if (parent1 == null) + parent1 = trioSample; + else if (parent2 == null) + parent2 = trioSample; + else + throw new ReviewedStingException("Cannot be more than 2 parents in TRIO!"); + } + if (parent1 == null || parent2 == null) + throw new ReviewedStingException("Must have 2 parents in TRIO!"); + + Genotype samplePrevGtInTrio = prevTrioVc.getGenotype(phasingSample); + + Genotype parent1PrevGt = prevTrioVc.getGenotype(parent1); + Genotype parent1CurGt = curTrioVc.getGenotype(parent1); + + Genotype parent2PrevGt = prevTrioVc.getGenotype(parent2); + Genotype parent2CurGt = curTrioVc.getGenotype(parent2); + + int prevHomIndex, prevOtherIndex; + Allele prevHomAllele; + Set prevOtherAlleles; + if (parent1PrevGt.isHom()) { + prevHomIndex = 1; + prevOtherIndex = 2; + prevHomAllele = parent1PrevGt.getAllele(0); + prevOtherAlleles = new TreeSet(parent2PrevGt.getAlleles()); + } + else if (parent2PrevGt.isHom()) { + prevHomIndex = 2; + prevOtherIndex = 1; + prevHomAllele = parent2PrevGt.getAllele(0); + prevOtherAlleles = new TreeSet(parent1PrevGt.getAlleles()); + } + else + throw new ReviewedStingException("LOGICAL ERROR: at least one parent is hom!"); + + int curHomIndex, curOtherIndex; + Allele curHomAllele; + Set curOtherAlleles; + if (parent1CurGt.isHom()) { + curHomIndex = 1; + curOtherIndex = 2; + curHomAllele = parent1CurGt.getAllele(0); + curOtherAlleles = new TreeSet(parent2CurGt.getAlleles()); + } + else if (parent2CurGt.isHom()) { + curHomIndex = 2; + curOtherIndex = 1; + curHomAllele = parent2CurGt.getAllele(0); + curOtherAlleles = new TreeSet(parent1CurGt.getAlleles()); + } + else + throw new ReviewedStingException("LOGICAL ERROR: at least one parent is hom!"); + + boolean phased = true; + + Map prevAlleleToParent = new TreeMap(); + for (Allele prevAllele : samplePrevGtInTrio.getAlleles()) { + if (prevAllele.equals(prevHomAllele)) + prevAlleleToParent.put(prevAllele, prevHomIndex); + else if (prevOtherAlleles.contains(prevAllele)) + prevAlleleToParent.put(prevAllele, prevOtherIndex); + else { + logger.warn("CANNOT phase, due to inconsistent inheritance of alleles!"); + phased = false; + break; + } + } + + Map parentToCurAllele = new HashMap(); + for (Allele curAllele : sampleCurGtInTrio.getAlleles()) { + if (curAllele.equals(curHomAllele)) + parentToCurAllele.put(curHomIndex, curAllele); + else if (curOtherAlleles.contains(curAllele)) + parentToCurAllele.put(curOtherIndex, curAllele); + else { + logger.warn("CANNOT phase, due to inconsistent inheritance of alleles!"); + phased = false; + break; + } + } + + if (phased) { + List phasedCurAlleles = new LinkedList(); + for (Allele prevAllele : prevPhasingGt.getAlleles()) { + Integer prevIndex = prevAlleleToParent.get(prevAllele); + if (prevIndex == null) + throw new ReviewedStingException("LOGICAL error: expecting to find prev allele in trio parents"); + Allele curAllele = parentToCurAllele.get(prevIndex); + if (curAllele == null) + throw new ReviewedStingException("LOGICAL error: expecting to find cur allele in trio parents"); + phasedCurAlleles.add(curAllele); + } + + boolean useTrioPhase = true; + Genotype phasedGt = new Genotype(phasingSample, phasedCurAlleles, curPhasingGt.getNegLog10PError(), curPhasingGt.getFilters(), curPhasingGt.getAttributes(), phased); + + if (curPhasingGt.isPhased()) { + stats.bothCanPhase++; + useTrioPhase = false; + + if (!phasedCurAlleles.equals(curPhasingGt.getAlleles())) { + String contradictMessage = "Phase from " + PHASING_ROD_NAME + " track at " + curLoc + " contradicts the trio-based phasing."; + stats.contradictoryPhaseSites++; + addToOutput += "\tcontradictory"; + + if (phasingVc.hasAttribute(ReadBackedPhasingWalker.PHASING_INCONSISTENT_KEY)) { + stats.contradictoryPhaseSitesWithPhaseInconsistency++; + addToOutput += "\tphaseInconsistent"; + useTrioPhase = true; + contradictMessage += " Ignoring " + PHASING_ROD_NAME + " phase due to phase-inconsistency."; + } + else { + contradictMessage += " Maintaining phase from " + PHASING_ROD_NAME + "."; + } + logger.warn(contradictMessage); + } + } + + if (useTrioPhase) { // trio phasing adds PREVIOUSLY UNKNOWN phase information: + Map genotypes = new HashMap(); + genotypes.put(phasingSample, phasedGt); + + phasingVc = VariantContext.modifyGenotypes(phasingVc, genotypes); + result.phasedVc = phasingVc; + } + } + } } - out.println(prevLoc + "\t" + curLoc + "\t" + trioPhaseStatus + "\t" + phasingGt.isPhased()); + out.println(prevLoc + "\t" + curLoc + "\t" + trioPhaseStatus + "\t" + curPhasingGt.isPhased() + addToOutput); } prevLoc = curLoc; + prevTrioVc = curTrioVc; prevTrioStatus = currentTrioStatus; + prevPhasingGt = curPhasingGt; - return processed; + return result; } - public Integer reduce(Integer addIn, Integer runningCount) { + public CompareToTrioPhasingStats reduce(CompareResult addIn, CompareToTrioPhasingStats runningCount) { if (addIn == null) - addIn = 0; + addIn = new CompareResult(); - return runningCount + addIn; + if (writer != null && addIn.phasedVc != null) + WriteVCF.writeVCF(addIn.phasedVc, writer, logger); + + return runningCount.addIn(addIn.stats); } /** * @param result the number of reads and VariantContexts seen. */ - public void onTraversalDone(Integer result) { - System.out.println("Processed " + result + " sites."); + public void onTraversalDone(CompareToTrioPhasingStats result) { + System.out.println("Compared " + result.comparedSites + " sites."); + System.out.println("Trio can phase " + result.trioPhaseableSites + " sites."); + System.out.println("Trio and " + PHASING_ROD_NAME + " track can both phase " + result.bothCanPhase + " sites."); + System.out.println("Contradiction between phase inferred from " + TRIO_ROD_NAME + " and phase present in " + PHASING_ROD_NAME + " tracks at " + result.contradictoryPhaseSites + " sites."); + System.out.println("Of those, " + PHASING_ROD_NAME + " track is phase-inconsistent at " + result.contradictoryPhaseSitesWithPhaseInconsistency + " sites."); + } +} + +class CompareToTrioPhasingStats { + public int comparedSites; + public int trioPhaseableSites; + public int contradictoryPhaseSites; + public int contradictoryPhaseSitesWithPhaseInconsistency; + public int bothCanPhase; + + public CompareToTrioPhasingStats() { + this.comparedSites = 0; + this.trioPhaseableSites = 0; + this.contradictoryPhaseSites = 0; + this.contradictoryPhaseSitesWithPhaseInconsistency = 0; + this.bothCanPhase = 0; + } + + public CompareToTrioPhasingStats addIn(CompareToTrioPhasingStats other) { + this.comparedSites += other.comparedSites; + this.trioPhaseableSites += other.trioPhaseableSites; + this.contradictoryPhaseSites += other.contradictoryPhaseSites; + this.contradictoryPhaseSitesWithPhaseInconsistency += other.contradictoryPhaseSitesWithPhaseInconsistency; + this.bothCanPhase += other.bothCanPhase; + + return this; + } +} + +class CompareResult { + public VariantContext phasedVc; + public CompareToTrioPhasingStats stats; + + public CompareResult() { + this.phasedVc = null; + this.stats = new CompareToTrioPhasingStats(); + } + + public CompareResult(VariantContext phasedVc, CompareToTrioPhasingStats stats) { + this.phasedVc = phasedVc; + this.stats = stats; } } \ No newline at end of file