From e33d3dafc6554c4fea705e62dabb93115d457240 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Fri, 3 Jan 2014 12:04:47 -0500 Subject: [PATCH] Add documentation for RBP, and also update the MD5 for the tests now that the output uses HP tags instead of '|', which is now reserved for trio-based phasing --- .../walkers/phasing/ReadBackedPhasing.java | 47 +++++++++++++++---- .../ReadBackedPhasingIntegrationTest.java | 26 ++++------ 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index a28fc83b5..7ed77b845 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -82,6 +82,12 @@ import static org.broadinstitute.sting.utils.variant.GATKVCFUtils.getVCFHeadersF /** * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). * + * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites. + * + * The underlying algorithm is based on building up 2^n local haplotypes, + * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). + * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag). + * *

* Performs physical phasing of SNP calls, based on sequencing reads. *

@@ -161,13 +167,6 @@ public class ReadBackedPhasing extends RodWalker unphasedSiteQueue = null; @@ -327,6 +326,7 @@ public class ReadBackedPhasing extends RodWalker processQueue(PhasingStats phaseStats, boolean processAll) { List oldPhasedList = new LinkedList(); @@ -362,6 +362,7 @@ public class ReadBackedPhasing extends RodWalker discardIrrelevantPhasedSites() { List vcList = new LinkedList(); @@ -517,6 +518,7 @@ public class ReadBackedPhasing extends RodWalker= phaseQualityThresh; } + // A genotype and the base pileup that supports it private static class GenotypeAndReadBases { public Genotype genotype; public ReadBasesAtPosition readBases; @@ -529,6 +531,7 @@ public class ReadBackedPhasing extends RodWalker listHetGenotypes, String sample, GenomeLoc phasingLoc) { buildReadsAtHetSites(listHetGenotypes, sample, phasingLoc, null); } @@ -650,6 +654,7 @@ public class ReadBackedPhasing extends RodWalker> edgeReads; @@ -695,6 +700,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousReads(int numHetSites) { PhasingGraph readGraph = new PhasingGraph(numHetSites); EdgeToReads edgeToReads = new EdgeToReads(); @@ -840,6 +846,7 @@ public class ReadBackedPhasing extends RodWalker removeExtraneousSites(List listHetGenotypes) { Set sitesWithReads = new HashSet(); for (Map.Entry nameToReads : readsAtHetSites.entrySet()) { @@ -879,6 +886,10 @@ public class ReadBackedPhasing extends RodWalker { private Vector grb; @@ -916,6 +927,7 @@ public class ReadBackedPhasing extends RodWalker trimWindow(List listHetGenotypes, String sample, GenomeLoc phaseLocus) { if (DEBUG) logger.warn("Trying to phase sample " + sample + " at locus " + phaseLocus + " within a window of " + cacheWindow + " bases yields " + listHetGenotypes.size() + " heterozygous sites to phase:\n" + toStringGRL(listHetGenotypes)); @@ -966,6 +978,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases; @@ -1214,6 +1229,7 @@ public class ReadBackedPhasing extends RodWalker sampleReadBases; @@ -1313,6 +1329,9 @@ public class ReadBackedPhasing extends RodWalker hapMap = new TreeMap(); for (PhasingTable.PhasingTableEntry pte : table) { @@ -1366,6 +1388,7 @@ public class ReadBackedPhasing extends RodWalker marginalizeInds; @@ -1412,6 +1435,9 @@ public class ReadBackedPhasing extends RodWalker { private LinkedList table; @@ -1464,6 +1491,7 @@ public class ReadBackedPhasing extends RodWalker { private HaplotypeClass haplotypeClass; private PhasingScore score; @@ -1528,6 +1557,7 @@ public class ReadBackedPhasing extends RodWalker multipleBaseCounts = null; @@ -1644,6 +1674,7 @@ class HaplotypeClass implements Iterable { } } +// Summary statistics about phasing rates, for each sample class PhasingStats { private int numReads; private int numVarSites; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java index 9759004a0..8c8817fe6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingIntegrationTest.java @@ -72,7 +72,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:332341-382503", 1, - Arrays.asList("1c9a7fe4db41864cd85d16e5cf88986c")); + Arrays.asList("1bb034bd54421fe4884e3142ed92d47e")); executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec); } @@ -82,7 +82,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:1232503-1332503", 1, - Arrays.asList("a3ca151145379e0d4bae64a91165ea0b")); + Arrays.asList("c12954252d4c8659b5ecf7517b277496")); executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec); } @@ -92,7 +92,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30) + " -L chr20:332341-382503", 1, - Arrays.asList("f685803333123a102ce1851d984cbd10")); + Arrays.asList("0b945e30504d04e9c6fa659ca5c25ed5")); executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec); } @@ -102,7 +102,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100) + " -L chr20:332341-382503", 1, - Arrays.asList("aaa7c25d118383639f273128d241e140")); + Arrays.asList("e9e8ef92d694ca71f29737fba26282f5")); executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec); } @@ -112,7 +112,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10) + " -L chr20:332341-482503", 1, - Arrays.asList("418e29400762972e77bae4f73e16befe")); + Arrays.asList("b9c9347c760a06db635952bf4920fb48")); executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec); } @@ -122,7 +122,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10) + " -L chr20:652810-681757", 1, - Arrays.asList("4c8f6190ecc86766baba3aba08542991")); + Arrays.asList("02c3a903842aa035ae379f16bc3d64ae")); executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec); } @@ -132,18 +132,8 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest { baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) + " -L chr20:332341-802503", 1, - Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34")); - executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec); - } - - @Test - public void test8() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10) - + " -L chr20:332341-802503" + " -respectPhaseInInput", - 1, - Arrays.asList("e3549b89d49092e73cc6eb21f233471c")); - executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec); + Arrays.asList("ac41d1aa9c9a67c07d894f485c29c574")); + executeTest("Use trio-phased VCF, adding read-backed phasing infomration in HP tag (as is now standard for RBP) [TEST SEVEN]", spec); } }