From 20ffe484bccd405d57021414636e109cd8cfc8c2 Mon Sep 17 00:00:00 2001 From: fromer Date: Wed, 29 Sep 2010 19:28:56 +0000 Subject: [PATCH] Added detection and INFO field marking of phasing inconsistencies (and optional filtration using --filterInconsistentSites) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4384 348d0f76-0448-11de-a6fe-93d51630548a --- .../phasing/ReadBackedPhasingWalker.java | 215 ++++++++++-------- .../varianteval/GenotypePhasingEvaluator.java | 2 +- 2 files changed, 121 insertions(+), 96 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 33f1ebc18..c17212505 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -68,7 +68,7 @@ public class ReadBackedPhasingWalker extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9 @@ -84,6 +84,17 @@ public class ReadBackedPhasingWalker extends RodWalker rodNames = null; private PhasingQualityStatsWriter statsWriter = null; + public static final String PQ_KEY = "PQ"; + + // In order to detect phase inconsistencies: + private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.5; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads + private static final double MAX_FRACTION_OF_INCONSISTENT_READS = 0.1; // If there are more than this fraction of inconsistent reads, then flag this site + + @Argument(fullName = "filterInconsistentSites", shortName = "filterInconsistentSites", doc = "Mark sites with inconsistent phasing as filtered [default:false]", required = false) + protected boolean filterInconsistentSites = false; + + public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent"; + public void initialize() { if (maxPhaseSites <= 2) maxPhaseSites = 2; // by definition, must phase a site relative to previous site [thus, 2 in total] @@ -105,7 +116,10 @@ public class ReadBackedPhasingWalker extends RodWalker hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - hInfo.add(new VCFFormatHeaderLine("PQ", 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + + // Phasing-specific INFO fields: + hInfo.add(new VCFFormatHeaderLine(PQ_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + hInfo.add(new VCFInfoHeaderLine(PHASING_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?")); Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); writer.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); @@ -244,8 +258,8 @@ public class ReadBackedPhasingWalker extends RodWalker prevHetAndInteriorIt = phaseWindow.prevHetAndInteriorIt; /* Notes: @@ -258,24 +272,21 @@ public class ReadBackedPhasingWalker extends RodWalker gtAttribs = new HashMap(gt.getAttributes()); - gtAttribs.put("PQ", pr.phaseQuality); - Genotype phasedGt = new Genotype(gt.getSampleName(), biall.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); + gtAttribs.put(PQ_KEY, pr.phaseQuality); + Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getNegLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); uvc.setGenotype(samp, phasedGt); } @@ -290,10 +301,13 @@ public class ReadBackedPhasingWalker extends RodWalker handledGtAttribs = new HashMap(handledGt.getAttributes()); - handledGtAttribs.put("PQ", pr.phaseQuality); + handledGtAttribs.put(PQ_KEY, pr.phaseQuality); Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getNegLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); interiorUvc.setGenotype(samp, phasedHomGt); } @@ -557,10 +571,36 @@ public class ReadBackedPhasingWalker extends RodWalker cur] is sufficient for this path to exist. NOTE: - If we would use NON-UNIFORM priors for the various haplotypes, then this calculation would not be correct, since the equivalence of: + If we would use NON-UNIFORM priors for the various haplotypes consistent with a margnialized haplotype, then this calculation would not be correct, since the equivalence of: 1. The read affects the final marginal haplotype posterior probability (for general mapping and base quality values). 2. The read has edges involved in a path from prev ---> cur. DEPENDS STRONGLY on the fact that all haplotypes have the same EXACT prior. + + This is due to the following: + [We denote: + R = set of all reads + r = a single read + "AA + CC" = AA on top chromosome, CC on bottom chromosome] + + Note that since there are only two haplotype possibilities: + P(AA + CC | R) + P(AC + CA | R) = 1 + + Now, if we assume that all haplotypes consistent with AA + CC have the same prior probability [P(AA + CC | R)], then: + P(AA + CC | R) + = P(AAAA + CCCC | R) + ... + P(AACC + CCAA | R) + = [P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R)] / P(R) + \propto P(AAAA + CCCC , R) + ... + P(AACC + CCAA , R) + = P(R | AAAA + CCCC)*P(AAAA + CCCC) + ... + P(R | AACC + CCAA)*P(AACC + CCAA) + = P(AA + CC | R) * [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] + + Since we assume independence between reads given a particular haplotype [P(R | AAAA + CCCC) = \prod_r P(r | AAAA + CCCC)], + a new read r affects P(AA + CC | R) by multiplying each of the terms in the sum by, e.g., P(r | AAAA + CCCC). + Therefore, if these values do not affect the ratio of: + (I) [P(R | AAAA + CCCC) + ... + P(R | AACC + CCAA)] / [P(R | ACAA + CACC) + ... + P(R | ACCC + CAAA)] + then they do not affect the value of: + (II) P(AA + CC | R) / P(AC + CA | R) [which uniquely defines their values, since they sum to 1] + + And, the P(r | AAAA + CCCC), ..., P(r | ACCC + CAAA) do not affect ratio (I) iff r's edges do not take part in a path from prev to cur in combination with the other reads in R. */ int prev = phasingSiteIndex - 1; int cur = phasingSiteIndex; @@ -699,7 +739,7 @@ public class ReadBackedPhasingWalker extends RodWalker nameToReads : phaseWindow.readsAtHetSites.entrySet()) { Read rd = nameToReads.getValue(); - - logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey() + (rd.isGapped() ? "\tGAPPED" : "")); + logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey()); for (PhasingTable.PhasingTableEntry pte : sampleHaps) { PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass()); pte.getScore().integrateReadScore(score); - logger.debug("score(" + rd + ", " + pte.getHaplotypeClass() + ") = " + score); } - // - // - // - // Check the current best haplotype assignment: + // Check the current best haplotype assignment and compare it to the previous one: MaxHaplotypeAndQuality curMaxHapAndQual = new MaxHaplotypeAndQuality(sampleHaps, false); logger.debug("CUR MAX hap:\t" + curMaxHapAndQual.maxEntry.getHaplotypeClass() + "\tcurPhaseQuality:\t" + curMaxHapAndQual.phaseQuality); - if (prevMaxHapAndQual != null && passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) { - numHighQualityIterations++; - if (!curMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative().equals(prevMaxHapAndQual.maxEntry.getHaplotypeClass().getRepresentative()) || // switched phase - curMaxHapAndQual.phaseQuality < 0.9 * prevMaxHapAndQual.phaseQuality) { // a 10% ["significant"] decrease in PQ - logger.debug("Inconsistent read found!"); - numInconsistentIterations++; + if (prevMaxHapAndQual != null) { + double changeInPQ = prevMaxHapAndQual.phaseQuality - curMaxHapAndQual.phaseQuality; + + if (passesPhasingThreshold(prevMaxHapAndQual.phaseQuality)) { + numHighQualityIterations++; + if (!curMaxHapAndQual.hasSameRepresentativeHaplotype(prevMaxHapAndQual) || // switched phase + (numPQchangesObserved > 0 && changeInPQ > FRACTION_OF_MEAN_PQ_CHANGES * (totalAbsPQchange / numPQchangesObserved))) { // a "significant" decrease in PQ + logger.debug("Inconsistent read found!"); + numInconsistentIterations++; + } } + + totalAbsPQchange += Math.abs(changeInPQ); + numPQchangesObserved++; } prevMaxHapAndQual = curMaxHapAndQual; - // - // - // - } + logger.debug("\nPhasing table [AFTER CALCULATION]:\n" + sampleHaps + "\n"); MaxHaplotypeAndQuality maxHapQual = new MaxHaplotypeAndQuality(sampleHaps, true); double posteriorProb = maxHapQual.maxEntry.getScore().getValue(); @@ -755,22 +799,11 @@ public class ReadBackedPhasingWalker extends RodWalker MAX_FRACTION_OF_INCONSISTENT_READS) + phasingContainsInconsistencies = true; - // - // - if (numInconsistentIterations / (double) numHighQualityIterations >= 0.1) { - // - // ???? - // NEED TO CHANGE phaseSite() to always output PQ field, EVEN if it's LESS THAN threshold ?????? - // ???? - // - repPhaseQuality *= -1; - } - // - // - - return new PhaseResult(maxHapQual.maxEntry.getHaplotypeClass().getRepresentative(), repPhaseQuality); + return new PhaseResult(maxHapQual.maxEntry.getHaplotypeClass().getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies); } private static class MaxHaplotypeAndQuality { @@ -803,22 +836,30 @@ public class ReadBackedPhasingWalker extends RodWalker 2!"); byte prevBase = hap.getBase(0); // The 1st base in the haplotype byte curBase = hap.getBase(1); // The 2nd base in the haplotype - boolean chosePrevTopChrom = prevBiall.matchesTopBase(prevBase); - boolean choseCurTopChrom = curBiall.matchesTopBase(curBase); + boolean chosePrevTopChrom = prevAllelePair.matchesTopBase(prevBase); + boolean choseCurTopChrom = curAllelePair.matchesTopBase(curBase); if (chosePrevTopChrom != choseCurTopChrom) - curBiall.swapAlleles(); + curAllelePair.swapAlleles(); } private boolean isInWindowRange(VariantContext vc1, VariantContext vc2) { @@ -1002,7 +1043,7 @@ public class ReadBackedPhasingWalker extends RodWalker genotypes; private double negLog10PError; private Set filters; - private Map attributes; + private Map attributes; public UnfinishedVariantContext(VariantContext vc) { this.name = vc.getName(); @@ -1012,8 +1053,8 @@ public class ReadBackedPhasingWalker extends RodWalker(vc.getGenotypes()); // since vc.getGenotypes() is unmodifiable this.negLog10PError = vc.getNegLog10PError(); - this.filters = vc.getFilters(); - this.attributes = vc.getAttributes(); + this.filters = new HashSet(vc.getFilters()); + this.attributes = new HashMap(vc.getAttributes()); } public VariantContext toVariantContext() { @@ -1031,6 +1072,13 @@ public class ReadBackedPhasingWalker extends RodWalker { if (isFirst) isFirst = false; else - sb.append("|"); + sb.append(" + "); sb.append(h); } @@ -1488,31 +1538,6 @@ class Read extends BaseArray { return score; } - private enum ReadStage { - BEFORE_BASES, BASES_1, NO_BASES, BASES_2 - } - - public boolean isGapped() { - ReadStage s = ReadStage.BEFORE_BASES; - - for (int i = 0; i < bases.length; i++) { - if (getBase(i) != null) { // has a base at i - if (s == ReadStage.BEFORE_BASES) - s = ReadStage.BASES_1; - else if (s == ReadStage.NO_BASES) { - s = ReadStage.BASES_2; - break; - } - } - else { // no base at i - if (s == ReadStage.BASES_1) - s = ReadStage.NO_BASES; - } - } - - return (s == ReadStage.BASES_2); - } - public int[] getNonNullIndices() { List nonNull = new LinkedList(); for (int i = 0; i < bases.length; i++) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java index 1a6e66c15..1f7c9750c 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java @@ -205,7 +205,7 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { } public static Double getPQ(Genotype gt) { - Object pq = gt.getAttributes().get("PQ"); + Object pq = gt.getAttributes().get(ReadBackedPhasingWalker.PQ_KEY); if (pq == null) return null;