1. Phasing now ignores bases without minimum base quality (BQ) and minimum mapping quality (MQ); 2. The probability of a non-called base is now divided by 3, to evenly split up the error probability over the non-called bases

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4450 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-10-07 17:40:59 +00:00
parent 6205910f9f
commit ee00dcb79d
2 changed files with 52 additions and 38 deletions

View File

@ -76,6 +76,12 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
@Argument(fullName = "variantStatsFilePrefix", shortName = "variantStats", doc = "The prefix of the VCF/phasing statistics files", required = false)
protected String variantStatsFilePrefix = null;
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing [default: 10]", required = false)
public int MIN_BASE_QUALITY_SCORE = 10;
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing [default: 10]", required = false)
public int MIN_MAPPING_QUALITY_SCORE = 10;
private LinkedList<VariantAndReads> unphasedSiteQueue = null;
private DoublyLinkedList<UnfinishedVariantAndReads> partiallyPhasedSites = null; // the phased VCs to be emitted, and the alignment bases at these positions
@ -323,8 +329,14 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
samplePhaseStats.put(samp, sampPhaseCounts);
}
sampPhaseCounts.numTestedSites++;
if (pr.phasingContainsInconsistencies)
sampPhaseCounts.numInconsistentSites++;
if (pr.phasingContainsInconsistencies) {
if (genotypesArePhased)
sampPhaseCounts.numInconsistentSitesPhased++;
else
sampPhaseCounts.numInconsistentSitesNotPhased++;
}
if (genotypesArePhased)
sampPhaseCounts.numPhased++;
}
@ -819,25 +831,25 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (printDebug)
logger.debug("\nPhasing table [AFTER MAPPING]:\n" + hapTable + "\n");
// Determine the phase at this position:
calculateMaxHapAndPhasingQuality(hapTable, printDebug);
}
// Calculates maxEntry and its PQ (within table hapTable):
private void calculateMaxHapAndPhasingQuality(PhasingTable hapTable, boolean printDebug) {
hapTable.normalizeScores();
if (printDebug)
logger.debug("\nPhasing table [AFTER NORMALIZATION]:\n" + hapTable + "\n");
// Determine the phase at this position:
this.maxEntry = hapTable.maxEntry();
// convert posteriorProb to PHRED scale, but do NOT cap the quality as in QualityUtils.probToQual(posteriorProb):
this.phaseQuality = getPhasingQuality(hapTable, maxEntry);
}
// Returns the PQ of entry (within table hapTable, which MUST be normalized):
public static double getPhasingQuality(PhasingTable hapTable, PhasingTable.PhasingTableEntry entry) {
PreciseNonNegativeDouble sumErrorProbs = new PreciseNonNegativeDouble(ZERO);
for (PhasingTable.PhasingTableEntry pte : hapTable) {
if (pte != entry)
if (pte != maxEntry)
sumErrorProbs.plusEqual(pte.getScore());
}
return -10.0 * (sumErrorProbs.getLog10Value());
this.phaseQuality = -10.0 * (sumErrorProbs.getLog10Value());
}
public boolean hasSameRepresentativeHaplotype(MaxHaplotypeAndQuality that) {
@ -933,25 +945,25 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
System.out.println("Number of variant sites observed: " + result.getNumVarSites());
System.out.println("Average coverage: " + ((double) result.getNumReads() / result.getNumVarSites()));
System.out.println("\n-- Phasing summary [minimal haplotype quality (PQ): " + phaseQualityThresh + ", maxPhaseSites= " + maxPhaseSites + "] --");
System.out.println("\n--- Phasing summary [minimal haplotype quality (PQ): " + phaseQualityThresh + ", maxPhaseSites: " + maxPhaseSites + ", cacheWindow: " + cacheWindow + "] ---");
for (Map.Entry<String, PhaseCounts> sampPhaseCountEntry : result.getPhaseCounts()) {
PhaseCounts pc = sampPhaseCountEntry.getValue();
System.out.println("Sample: " + sampPhaseCountEntry.getKey() + "\tNumber of tested sites [cacheWindow= " + cacheWindow + "]: " + pc.numTestedSites + "\tNumber of phased sites: " + pc.numPhased + "\tNumber of phase-inconsistent sites: " + pc.numInconsistentSites);
System.out.print("Sample: " + sampPhaseCountEntry.getKey() + "\tSites tested: " + pc.numTestedSites + "\tSites phased: " + pc.numPhased);
System.out.println("\tPhase-inconsistent sites: " + (pc.numInconsistentSitesPhased + pc.numInconsistentSitesNotPhased) + " [phased: " + pc.numInconsistentSitesPhased + ", unphased:" + pc.numInconsistentSitesNotPhased + "]");
}
System.out.println("");
}
protected void writeVarContList(List<VariantContext> varContList) {
for (VariantContext vc : varContList) {
for (VariantContext vc : varContList)
writeVCF(vc);
}
}
/*
Inner classes:
*/
private static class VariantAndReads {
private class VariantAndReads {
public VariantContext variant;
public HashMap<String, ReadBasesAtPosition> sampleReadBases;
public boolean processVariant;
@ -976,14 +988,18 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
pileup = alignment.getExtendedEventPileup();
}
if (pileup != null) {
for (String samp : pileup.getSamples()) {
ReadBackedPileup samplePileup = pileup.getPileupForSample(samp);
ReadBasesAtPosition readBases = new ReadBasesAtPosition();
for (PileupElement p : samplePileup) {
if (!p.isDeletion()) // IGNORE deletions for now
readBases.putReadBase(p);
// filter the read-base pileup based on min base and mapping qualities:
pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE);
if (pileup != null) {
for (String samp : pileup.getSamples()) {
ReadBackedPileup samplePileup = pileup.getPileupForSample(samp);
ReadBasesAtPosition readBases = new ReadBasesAtPosition();
for (PileupElement p : samplePileup) {
if (!p.isDeletion()) // IGNORE deletions for now
readBases.putReadBase(p);
}
sampleReadBases.put(samp, readBases);
}
sampleReadBases.put(samp, readBases);
}
}
}
@ -1512,11 +1528,11 @@ class Read extends BaseArray {
double errProb = QualityUtils.qualToErrorProb(baseQual);
// The base error should be AT LEAST AS HIGH as the mapping error:
// The base error should be AT LEAST AS HIGH as the mapping error [equivalent to capping the base quality (BQ) by the mapping quality (MQ)]:
errProb = Math.max(errProb, 1.0 - mappingProb.getValue());
baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb);
baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb);
baseProbs[index] = new PreciseNonNegativeDouble(1.0 - errProb); // The probability that the true base is the base called in the read
baseErrorProbs[index] = new PreciseNonNegativeDouble(errProb / 3.0); // DIVIDE up the error probability EQUALLY over the 3 non-called bases
}
public PhasingScore matchHaplotypeClassScore(HaplotypeClass hapClass) {
@ -1545,11 +1561,6 @@ class Read extends BaseArray {
score.timesEqual(baseProbs[i]);
else
score.timesEqual(baseErrorProbs[i]);
/* The use of baseErrorProbs[i] as the probability of seeing ANY base but hapBase (i.e., a sequencing error) would SEEM to be OK, since we are dealing with biallelic sites.
Nevertheless, since there could be reads that DON'T have either of the 2 alleles, then this would be incorrect
[since both alleles would be scored with the same error probability -- baseErrorProbs[i] -- which is the error SUMMED OVER ALL non-read bases]!
DESPITE this, we use the above model to CONSERVATIVELY give high scores to haplotype bases that differ from low-quality read bases.
*/
}
}
@ -1625,18 +1636,21 @@ class PhasingStats {
class PhaseCounts {
public int numTestedSites; // number of het sites directly succeeding het sites
public int numInconsistentSites;
public int numInconsistentSitesPhased;
public int numInconsistentSitesNotPhased;
public int numPhased;
public PhaseCounts() {
this.numTestedSites = 0;
this.numInconsistentSites = 0;
this.numInconsistentSitesPhased = 0;
this.numInconsistentSitesNotPhased = 0;
this.numPhased = 0;
}
public void addIn(PhaseCounts other) {
this.numTestedSites += other.numTestedSites;
this.numInconsistentSites += other.numInconsistentSites;
this.numInconsistentSitesPhased += other.numInconsistentSitesPhased;
this.numInconsistentSitesNotPhased += other.numInconsistentSitesNotPhased;
this.numPhased += other.numPhased;
}
}

View File

@ -25,7 +25,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:332341-382503",
1,
Arrays.asList("7ea69a898890b7bea53477547ae5580a"));
Arrays.asList("c59e8773efe79e1195b320d2735e1c16"));
executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec);
}
@ -35,7 +35,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:1232503-1332503",
1,
Arrays.asList("9a8f9beb94b204332d0a32a3af0d4118"));
Arrays.asList("d9cf20f506f2d0aa196fa89c051532c6"));
executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec);
}
@ -45,7 +45,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30)
+ " -L chr20:332341-382503",
1,
Arrays.asList("3d2973ca1f6d26062fa7f0d795faa216"));
Arrays.asList("e3ea64260f517830633b0060b3594178"));
executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec);
}
@ -55,7 +55,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100)
+ " -L chr20:332341-382503",
1,
Arrays.asList("13c66ff8448a8c3b1e3db4f7881f871d"));
Arrays.asList("435825a2a034ca27047fbd8a1dc1e0b2"));
executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec);
}
@ -65,7 +65,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10)
+ " -L chr20:332341-482503",
1,
Arrays.asList("3d35745e1cf40c892a8e793b141dbf20"));
Arrays.asList("d5897638dd317c3347b15ef05a13e53a"));
executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec);
}