Fixed bug in phasing, where mapping probability was incorrectly raised to the power of number of non-null bases [instead, it is just multiplied into phasing probability once]

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4430 348d0f76-0448-11de-a6fe-93d51630548a
2010-10-05 17:07:31 +00:00 · 2010-10-05 17:07:31 +00:00 · c6668bd49c
parent 7f1e44b764
commit c6668bd49c
1 changed files with 11 additions and 5 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
@ -87,7 +87,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
    public static final String PQ_KEY = "PQ";

    // In order to detect phase inconsistencies:
-    private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.5; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads
+    private static final double FRACTION_OF_MEAN_PQ_CHANGES = 0.1; // If the PQ decreases by this fraction of the mean PQ changes (thus far), then this read is inconsistent with previous reads
    private static final double MAX_FRACTION_OF_INCONSISTENT_READS = 0.1; // If there are more than this fraction of inconsistent reads, then flag this site

    @Argument(fullName = "filterInconsistentSites", shortName = "filterInconsistentSites", doc = "Mark sites with inconsistent phasing as filtered [default:false]", required = false)
@ -763,7 +763,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph

        for (Map.Entry<String, Read> nameToReads : phaseWindow.readsAtHetSites.entrySet()) {
            Read rd = nameToReads.getValue();
-            logger.debug("rd = " + rd + "\tname = " + nameToReads.getKey());
+            logger.debug("\nrd = " + rd + "\tname = " + nameToReads.getKey());

            for (PhasingTable.PhasingTableEntry pte : sampleHaps) {
                PhasingScore score = rd.matchHaplotypeClassScore(pte.getHaplotypeClass());
@ -803,7 +803,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
        if (numInconsistentIterations / (double) numHighQualityIterations > MAX_FRACTION_OF_INCONSISTENT_READS)
            phasingContainsInconsistencies = true;

-        return new PhaseResult(maxHapQual.maxEntry.getHaplotypeClass().getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies);
+        return new PhaseResult(maxHapQual.getRepresentative(), maxHapQual.phaseQuality, phasingContainsInconsistencies);
    }

    private static class MaxHaplotypeAndQuality {
@ -1274,7 +1274,6 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
                normalizeBy.plusEqual(pte.getScore());

            if (!normalizeBy.equals(ZERO)) { // prevent precision problems
-                logger.debug("normalizeBy = " + normalizeBy);
                for (PhasingTableEntry pte : table)
                    pte.getScore().divEqual(normalizeBy);
            }
@ -1524,6 +1523,8 @@ class Read extends BaseArray {
        if (sz != hap.bases.length)
            throw new ReviewedStingException("Read and Haplotype should have same length to be compared!");

+        score.timesEqual(mappingProb);
+
        for (int i = 0; i < sz; i++) {
            Byte thisBase = this.getBase(i);
            Byte hapBase = hap.getBase(i);
@ -1532,9 +1533,14 @@ class Read extends BaseArray {
                    score.timesEqual(baseProbs[i]);
                else
                    score.timesEqual(baseErrorProbs[i]);
-                score.timesEqual(mappingProb);
+                /* The use of baseErrorProbs[i] as the probability of seeing ANY base but hapBase (i.e., a sequencing error) would SEEM to be OK, since we are dealing with biallelic sites.
+                  Nevertheless, since there could be reads that DON'T have either of the 2 alleles, then this would be incorrect
+                  [since both alleles would be scored with the same error probability -- baseErrorProbs[i] -- which is the error SUMMED OVER ALL non-read bases]!
+                  DESPITE this, we use the above model to CONSERVATIVELY give high scores to haplotype bases that differ from low-quality read bases.
+                */
            }
        }
+
        return score;
    }