diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
index a28fc83b5..7ed77b845 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java
@@ -82,6 +82,12 @@ import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersF
/**
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
*
+ * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites.
+ *
+ * The underlying algorithm is based on building up 2^n local haplotypes,
+ * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter).
+ * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag).
+ *
*
* Performs physical phasing of SNP calls, based on sequencing reads.
*
@@ -161,13 +167,6 @@ public class ReadBackedPhasing extends RodWalker unphasedSiteQueue = null;
@@ -327,6 +326,7 @@ public class ReadBackedPhasing extends RodWalker processQueue(PhasingStats phaseStats, boolean processAll) {
List oldPhasedList = new LinkedList();
@@ -362,6 +362,7 @@ public class ReadBackedPhasing extends RodWalker discardIrrelevantPhasedSites() {
List vcList = new LinkedList();
@@ -517,6 +518,7 @@ public class ReadBackedPhasing extends RodWalker= phaseQualityThresh;
}
+ // A genotype and the base pileup that supports it
private static class GenotypeAndReadBases {
public Genotype genotype;
public ReadBasesAtPosition readBases;
@@ -529,6 +531,7 @@ public class ReadBackedPhasing extends RodWalker listHetGenotypes, String sample, GenomeLoc phasingLoc) {
buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null);
}
@@ -650,6 +654,7 @@ public class ReadBackedPhasing extends RodWalker> edgeReads;
@@ -695,6 +700,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousReads(int numHetSites) {
PhasingGraph readGraph = new PhasingGraph(numHetSites);
EdgeToReads edgeToReads = new EdgeToReads();
@@ -840,6 +846,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousSites(List listHetGenotypes) {
Set sitesWithReads = new HashSet();
for (Map.Entry nameToReads : readsAtHetSites.entrySet()) {
@@ -879,6 +886,10 @@ public class ReadBackedPhasing extends RodWalker {
private Vector grb;
@@ -916,6 +927,7 @@ public class ReadBackedPhasing extends RodWalker trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) {
if (DEBUG)
logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes));
@@ -966,6 +978,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases;
@@ -1214,6 +1229,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases;
@@ -1313,6 +1329,9 @@ public class ReadBackedPhasing extends RodWalker hapMap = new TreeMap();
for (PhasingTable.PhasingTableEntry pte : table) {
@@ -1366,6 +1388,7 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds;
@@ -1412,6 +1435,9 @@ public class ReadBackedPhasing extends RodWalker {
private LinkedList table;
@@ -1464,6 +1491,7 @@ public class ReadBackedPhasing extends RodWalker {
private HaplotypeClass haplotypeClass;
private PhasingScore score;
@@ -1528,6 +1557,7 @@ public class ReadBackedPhasing extends RodWalker multipleBaseCounts = null;
@@ -1644,6 +1674,7 @@ class HaplotypeClass implements Iterable {
}
}
+// Summary statistics about phasing rates, for each sample
class PhasingStats {
private int numReads;
private int numVarSites;
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java
index 9759004a0..8c8817fe6 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java
@@ -72,7 +72,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:332341-382503",
1,
- Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c"));
+ Arrays.asList("1bb034bd54421fe4884e3142ed92d47e"));
executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec);
}
@@ -82,7 +82,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:1232503-1332503",
1,
- Arrays.asList("a3ca151145379e0d4bae64a91165ea0b"));
+ Arrays.asList("c12954252d4c8659b5ecf7517b277496"));
executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec);
}
@@ -92,7 +92,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30)
+ " -L chr20:332341-382503",
1,
- Arrays.asList("f685803333123a102ce1851d984cbd10"));
+ Arrays.asList("0b945e30504d04e9c6fa659ca5c25ed5"));
executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec);
}
@@ -102,7 +102,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100)
+ " -L chr20:332341-382503",
1,
- Arrays.asList("aaa7c25d118383639f273128d241e140"));
+ Arrays.asList("e9e8ef92d694ca71f29737fba26282f5"));
executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec);
}
@@ -112,7 +112,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10)
+ " -L chr20:332341-482503",
1,
- Arrays.asList("418e29400762972e77bae4f73e16befe"));
+ Arrays.asList("b9c9347c760a06db635952bf4920fb48"));
executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec);
}
@@ -122,7 +122,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:652810-681757",
1,
- Arrays.asList("4c8f6190ecc86766baba3aba08542991"));
+ Arrays.asList("02c3a903842aa035ae379f16bc3d64ae"));
executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec);
}
@@ -132,18 +132,8 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10)
+ " -L chr20:332341-802503",
1,
- Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34"));
- executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec);
- }
-
- @Test
- public void test8() {
- WalkerTestSpec spec = new WalkerTestSpec(
- baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10)
- + " -L chr20:332341-802503" + " -respectPhaseInInput",
- 1,
- Arrays.asList("e3549b89d49092e73cc6eb21f233471c"));
- executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec);
+ Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574"));
+ executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec);
}
}