diff --git a/lua/findGenes.lua b/lua/findGenes.lua new file mode 100644 index 000000000..fd43cf610 --- /dev/null +++ b/lua/findGenes.lua @@ -0,0 +1,298 @@ +-- This script parses the table of genes from http://geneticassociationdb.nih.gov/ +-- and generates an annotated interval list for use with the GATK +-- +-- Author: carneiro +-- Date: 5/24/2011 + + +ref_table = { + "ID", + "AS", + "PH", + "DI", + "DC", + "DT", + "CH", + "CB", + "GE", + "ST", + "SP", + "PV", + "RE", + "PI", + "AA", + "AF", + "PC", + "GN", + "RS", + "PO", + "GO", + "SU", + "LN", + "UN", + "NP", + "MP", + "JO", + "TI", + "RN", + "OM", + "YR", + "CN", + "SI", + "EF", + "GIGA", + "GIAA", + "GIGB", + "GIAB", + "GIGC", + "GIAC", + "GIAS", + "GIEF" +} + +chr_limits = { + 249250621, + 243199373, + 198022430, + 191154276, + 180915260, + 171115067, + 159138663, + 146364022, + 141213431, + 135534747, + 135006516, + 133851895, + 115169878, + 107349540, + 102531392, + 90354753, + 81195210, + 78077248, + 59128983, + 63025520, + 48129895, + 51304566, + 155270560, + 59373566, + 16569 +} + +local header = [[@HD VN:1.0 SO:coordinate +@SQ SN:1 LN:249250621 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1b22b98cdeb4a9304cb5d48026a85128 SP:Homo Sapiens +@SQ SN:2 LN:243199373 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a0d9851da00400dec1098a9255ac712e SP:Homo Sapiens +@SQ SN:3 LN:198022430 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fdfd811849cc2fadebc929bb925902e5 SP:Homo Sapiens +@SQ SN:4 LN:191154276 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:23dccd106897542ad87d2765d28a19a1 SP:Homo Sapiens +@SQ SN:5 LN:180915260 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0740173db9ffd264d728f32784845cd7 SP:Homo Sapiens +@SQ SN:6 LN:171115067 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d3a93a248d92a729ee764823acbbc6b SP:Homo Sapiens +@SQ SN:7 LN:159138663 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:618366e953d6aaad97dbe4777c29375e SP:Homo Sapiens +@SQ SN:8 LN:146364022 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:96f514a9929e410c6651697bded59aec SP:Homo Sapiens +@SQ SN:9 LN:141213431 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3e273117f15e0a400f01055d9f393768 SP:Homo Sapiens +@SQ SN:10 LN:135534747 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:988c28e000e84c26d552359af1ea2e1d SP:Homo Sapiens +@SQ SN:11 LN:135006516 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:98c59049a2df285c76ffb1c6db8f8b96 SP:Homo Sapiens +@SQ SN:12 LN:133851895 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:51851ac0e1a115847ad36449b0015864 SP:Homo Sapiens +@SQ SN:13 LN:115169878 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:283f8d7892baa81b510a015719ca7b0b SP:Homo Sapiens +@SQ SN:14 LN:107349540 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:98f3cae32b2a2e9524bc19813927542e SP:Homo Sapiens +@SQ SN:15 LN:102531392 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e5645a794a8238215b2cd77acb95a078 SP:Homo Sapiens +@SQ SN:16 LN:90354753 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fc9b1a7b42b97a864f56b348b06095e6 SP:Homo Sapiens +@SQ SN:17 LN:81195210 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:351f64d4f4f9ddd45b35336ad97aa6de SP:Homo Sapiens +@SQ SN:18 LN:78077248 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c SP:Homo Sapiens +@SQ SN:19 LN:59128983 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1aacd71f30db8e561810913e0b72636d SP:Homo Sapiens +@SQ SN:20 LN:63025520 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Homo Sapiens +@SQ SN:21 LN:48129895 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:2979a6085bfe28e3ad6f552f361ed74d SP:Homo Sapiens +@SQ SN:22 LN:51304566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a718acaa6135fdca8357d5bfe94211dd SP:Homo Sapiens +@SQ SN:X LN:155270560 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7e0e2e580297b7764e31dbc80c2540dd SP:Homo Sapiens +@SQ SN:Y LN:59373566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1fa3474750af0948bdf97d5a0ee52e51 SP:Homo Sapiens +@SQ SN:MT LN:16569 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:c68f52674c9fb33aef52dcf399755519 SP:Homo Sapiens +@SQ SN:GL000207.1 LN:4262 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f3814841f1939d3ca19072d9e89f3fd7 SP:Homo Sapiens +@SQ SN:GL000226.1 LN:15008 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1c1b2cd1fccbc0a99b6a447fa24d1504 SP:Homo Sapiens +@SQ SN:GL000229.1 LN:19913 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d0f40ec87de311d8e715b52e4c7062e1 SP:Homo Sapiens +@SQ SN:GL000231.1 LN:27386 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:ba8882ce3a1efa2080e5d29b956568a4 SP:Homo Sapiens +@SQ SN:GL000210.1 LN:27682 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:851106a74238044126131ce2a8e5847c SP:Homo Sapiens +@SQ SN:GL000239.1 LN:33824 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:99795f15702caec4fa1c4e15f8a29c07 SP:Homo Sapiens +@SQ SN:GL000235.1 LN:34474 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:118a25ca210cfbcdfb6c2ebb249f9680 SP:Homo Sapiens +@SQ SN:GL000201.1 LN:36148 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:dfb7e7ec60ffdcb85cb359ea28454ee9 SP:Homo Sapiens +@SQ SN:GL000247.1 LN:36422 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7de00226bb7df1c57276ca6baabafd15 SP:Homo Sapiens +@SQ SN:GL000245.1 LN:36651 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:89bc61960f37d94abf0df2d481ada0ec SP:Homo Sapiens +@SQ SN:GL000197.1 LN:37175 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6f5efdd36643a9b8c8ccad6f2f1edc7b SP:Homo Sapiens +@SQ SN:GL000203.1 LN:37498 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:96358c325fe0e70bee73436e8bb14dbd SP:Homo Sapiens +@SQ SN:GL000246.1 LN:38154 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e4afcd31912af9d9c2546acf1cb23af2 SP:Homo Sapiens +@SQ SN:GL000249.1 LN:38502 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d78abec37c15fe29a275eb08d5af236 SP:Homo Sapiens +@SQ SN:GL000196.1 LN:38914 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d92206d1bb4c3b4019c43c0875c06dc0 SP:Homo Sapiens +@SQ SN:GL000248.1 LN:39786 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5a8e43bec9be36c7b49c84d585107776 SP:Homo Sapiens +@SQ SN:GL000244.1 LN:39929 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:0996b4475f353ca98bacb756ac479140 SP:Homo Sapiens +@SQ SN:GL000238.1 LN:39939 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:131b1efc3270cc838686b54e7c34b17b SP:Homo Sapiens +@SQ SN:GL000202.1 LN:40103 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:06cbf126247d89664a4faebad130fe9c SP:Homo Sapiens +@SQ SN:GL000234.1 LN:40531 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:93f998536b61a56fd0ff47322a911d4b SP:Homo Sapiens +@SQ SN:GL000232.1 LN:40652 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3e06b6741061ad93a8587531307057d8 SP:Homo Sapiens +@SQ SN:GL000206.1 LN:41001 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:43f69e423533e948bfae5ce1d45bd3f1 SP:Homo Sapiens +@SQ SN:GL000240.1 LN:41933 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:445a86173da9f237d7bcf41c6cb8cc62 SP:Homo Sapiens +@SQ SN:GL000236.1 LN:41934 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fdcd739913efa1fdc64b6c0cd7016779 SP:Homo Sapiens +@SQ SN:GL000241.1 LN:42152 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:ef4258cdc5a45c206cea8fc3e1d858cf SP:Homo Sapiens +@SQ SN:GL000243.1 LN:43341 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:cc34279a7e353136741c9fce79bc4396 SP:Homo Sapiens +@SQ SN:GL000242.1 LN:43523 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:2f8694fc47576bc81b5fe9e7de0ba49e SP:Homo Sapiens +@SQ SN:GL000230.1 LN:43691 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:b4eb71ee878d3706246b7c1dbef69299 SP:Homo Sapiens +@SQ SN:GL000237.1 LN:45867 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:e0c82e7751df73f4f6d0ed30cdc853c0 SP:Homo Sapiens +@SQ SN:GL000233.1 LN:45941 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7fed60298a8d62ff808b74b6ce820001 SP:Homo Sapiens +@SQ SN:GL000204.1 LN:81310 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:efc49c871536fa8d79cb0a06fa739722 SP:Homo Sapiens +@SQ SN:GL000198.1 LN:90085 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:868e7784040da90d900d2d1b667a1383 SP:Homo Sapiens +@SQ SN:GL000208.1 LN:92689 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:aa81be49bf3fe63a79bdc6a6f279abf6 SP:Homo Sapiens +@SQ SN:GL000191.1 LN:106433 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d75b436f50a8214ee9c2a51d30b2c2cc SP:Homo Sapiens +@SQ SN:GL000227.1 LN:128374 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:a4aead23f8053f2655e468bcc6ecdceb SP:Homo Sapiens +@SQ SN:GL000228.1 LN:129120 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:c5a17c97e2c1a0b6a9cc5a6b064b714f SP:Homo Sapiens +@SQ SN:GL000214.1 LN:137718 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:46c2032c37f2ed899eb41c0473319a69 SP:Homo Sapiens +@SQ SN:GL000221.1 LN:155397 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:3238fb74ea87ae857f9c7508d315babb SP:Homo Sapiens +@SQ SN:GL000209.1 LN:159169 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f40598e2a5a6b26e84a3775e0d1e2c81 SP:Homo Sapiens +@SQ SN:GL000218.1 LN:161147 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:1d708b54644c26c7e01c2dad5426d38c SP:Homo Sapiens +@SQ SN:GL000220.1 LN:161802 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:fc35de963c57bf7648429e6454f1c9db SP:Homo Sapiens +@SQ SN:GL000213.1 LN:164239 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:9d424fdcc98866650b58f004080a992a SP:Homo Sapiens +@SQ SN:GL000211.1 LN:166566 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:7daaa45c66b288847b9b32b964e623d3 SP:Homo Sapiens +@SQ SN:GL000199.1 LN:169874 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:569af3b73522fab4b40995ae4944e78e SP:Homo Sapiens +@SQ SN:GL000217.1 LN:172149 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6d243e18dea1945fb7f2517615b8f52e SP:Homo Sapiens +@SQ SN:GL000216.1 LN:172294 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:642a232d91c486ac339263820aef7fe0 SP:Homo Sapiens +@SQ SN:GL000215.1 LN:172545 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5eb3b418480ae67a997957c909375a73 SP:Homo Sapiens +@SQ SN:GL000205.1 LN:174588 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d22441398d99caf673e9afb9a1908ec5 SP:Homo Sapiens +@SQ SN:GL000219.1 LN:179198 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:f977edd13bac459cb2ed4a5457dba1b3 SP:Homo Sapiens +@SQ SN:GL000224.1 LN:179693 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:d5b2fc04f6b41b212a4198a07f450e20 SP:Homo Sapiens +@SQ SN:GL000223.1 LN:180455 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:399dfa03bf32022ab52a846f7ca35b30 SP:Homo Sapiens +@SQ SN:GL000195.1 LN:182896 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:5d9ec007868d517e73543b005ba48535 SP:Homo Sapiens +@SQ SN:GL000212.1 LN:186858 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:563531689f3dbd691331fd6c5730a88b SP:Homo Sapiens +@SQ SN:GL000222.1 LN:186861 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6fe9abac455169f50470f5a6b01d0f59 SP:Homo Sapiens +@SQ SN:GL000200.1 LN:187035 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:75e4c8d17cd4addf3917d1703cacaf25 SP:Homo Sapiens +@SQ SN:GL000193.1 LN:189789 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:dbb6e8ece0b5de29da56601613007c2a SP:Homo Sapiens +@SQ SN:GL000194.1 LN:191469 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6ac8f815bf8e845bb3031b73f812c012 SP:Homo Sapiens +@SQ SN:GL000225.1 LN:211173 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:63945c3e6962f28ffd469719a747e73c SP:Homo Sapiens +@SQ SN:GL000192.1 LN:547496 AS:GRCh37 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:325ba9e808f669dfeee210fdd7b470ac SP:Homo Sapiens +@SQ SN:NC_007605 LN:171823 AS:NC_007605.1 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta M5:6743bd63b3ff2b5b8985d8933c53290a SP:Epstein-Barr virus]] + +local function comp(a, b) + if a.chr == b.chr then return a.start < b.start end + local x = tonumber(a.chr) + local y = tonumber(b.chr) + if x and y then + return x < y end + return a.chr < b.chr +end + +local function cleanNumber (x) + local t = x:gsub(",","") + return tonumber(t) +end + +local function cleanDisease (x) + local d = x:gsub("[^%w%s]", "") + return d:gsub(" ", "_") +end + +local function isSameGene (g1, g2) + return g1.chr == g2.chr and + g1.start == g2.start and + g1.finish == g2.finish +end + +local function isValidGene(gene) + local x + if gene.chr == "X" then x = 23 + elseif gene.chr == "Y" then x = 24 + elseif gene.chr == "MT" then x = 25 + else x = tonumber(gene.chr) end + + return x >= 1 and x <= 25 and gene.start < gene.finish and chr_limits[x] > gene.finish +end + + +local function addGene(geneTable, gene) + if isValidGene(gene) then -- only adds valid genes + local geneAdded = false + for _, g in ipairs(geneTable) do + if isSameGene(gene, g) then + geneAdded = true + if g.disease ~= gene.disease then + g.disease = g.disease .. ","..gene.disease + end + end + end + if (not geneAdded) then + table.insert(geneTable, gene) + end + end +end + +local function cleanIntervals(f, t) + for l in io.lines(f) do + local counter = 1 + local id, c, b, e, disease + for match in l:gmatch("(.-);;") do + -- print("DEBUG: ", counter, match) + if counter == 1 then id = match + elseif counter == 3 then disease = cleanDisease(match) + elseif counter == 7 then c = match + elseif counter == 10 then b = cleanNumber(match) + elseif counter == 11 then e = cleanNumber(match) + end + counter = counter + 1 + end + if id and c and b and e and c~= "" then + addGene(t, {id=id, chr=c, start=b, finish=e, disease=disease}) + end + end + return table.sort(t, comp) +end + +local function printIntervals(t) + print(header) + for _,interval in ipairs(t) do + print(interval.chr,interval.start,interval.finish, "+", interval.id..":::"..interval.disease) + end +end + +t = {} +cleanIntervals(arg[1], t) +printIntervals(t) + +--[[ +-- REFERENCE TABLE + +1 ID +2 Association(Y/N) +3 Broad Phenotype +4 Disease Class +5 Disease Class Code +6 MeSH Disease Terms +7 Chromosom +8 Chr-Band +9 Gene +10 DNA Start +11 DNA End +12 P Value +13 Reference +14 Pubmed ID +15 Allele Author Description +16 Allele Functional Effects +17 Polymophism Class +18 Gene Name +19 RefSeq +20 Population +21 MeSH Geolocation +22 Submitter +23 Locus Number +24 Unigene +25 Narrow Phenotype +26 Mole. Phenotype +27 Journal +28 Title +29 rs Number +30 OMIM ID +31 Year +32 Conclusion +33 Study Info +34 Env. Factor +35 GI Gene A +36 GI Allele of Gene A +37 GI Gene B +38 GI Allele of Gene B +39 GI Gene C +40 GI Allele of Gene C +41 GI Association? +42 GI combine Env. Factor +--]] \ No newline at end of file diff --git a/lua/hasTheseTargets.lua b/lua/hasTheseTargets.lua new file mode 100644 index 000000000..912bb7cc4 --- /dev/null +++ b/lua/hasTheseTargets.lua @@ -0,0 +1,147 @@ +-- This script takes two interval lists (a and b) and creates another interval list (c) with the intervals in a that are +-- inside the targets in b. For example, if a is a list of exon targets and b a list of genes, then c would be all exons +-- covered by the genes in file b. +-- +-- Author: carneiro +-- Date: 5/26/2011 + +local targetList = arg[1] -- list of targets to check if they are part of the set +local targetSet = arg[2] -- set of targets + + +------------------------------------------------------------------------------------------------------------------------ +-- Parses a line from an interval list file (not a header line!) +-- +-- Return values: +-- 1: chromosome +-- 2: interval start +-- 3: interval end +-- 4: strand (+/-) +-- 5: info field +------------------------------------------------------------------------------------------------------------------------ +local function parseIntervalLine(l) + return l:match("(%w+)%s+(%d+)%s+(%d+)%s+([%+%-])%s+(.*)") +end + +------------------------------------------------------------------------------------------------------------------------ +-- Reads an interval list file into a table +-- +-- Return values: +-- 1: index pairs table with each interval (each interval is a table of the form {c, s, e, strand, info}) +-- 2: header string +------------------------------------------------------------------------------------------------------------------------ +local function readIntervals(filename) + t = {} + header = "" + for l in io.lines(filename) do + if l:sub(1,1) == "@" then header = header .. l .."\n" + else + local c, s, e, strand, info = parseIntervalLine(l) + table.insert(t, {c=c, s=tonumber(s), e=tonumber(e), strand=strand, info=info}) + end + end + return t, header +end + +------------------------------------------------------------------------------------------------------------------------ +-- Checks if two intervals have the same chromosome, start and end. +-- +-- Return values: +-- 1: true/false +------------------------------------------------------------------------------------------------------------------------ +local function isSameInterval(i, c, s, e) + return i.c == c and s == i.s and e == i.e +end + +------------------------------------------------------------------------------------------------------------------------ +-- Checks if the line from an interval list file is a header line +-- +-- Return values: +-- 1: true/false +------------------------------------------------------------------------------------------------------------------------ +local function isIntervalHeaderLine(l) + return l:sub(1,1) == "@" +end + + +------------------------------------------------------------------------------------------------------------------------ +-- Compares the start position of two intervals +-- +-- Return values: +-- 1: -1, 0 or 1 (respectively for a < b, a == b, a > b) +------------------------------------------------------------------------------------------------------------------------ +local function compIntervals(a, b) + + local function c(x,y) + if x < y then return -1 + elseif x > y then return 1 + else return 0 end + end + -- same chromosomes + if a.c == b.c then return c(a.s, b.s) + else + x = tonumber(a.c) + y = tonumber(b.c) + if x and y then return c(x,y) + else return c(a.c, b.c) end + end +end + +------------------------------------------------------------------------------------------------------------------------ +-- Creates a new interval +-- +-- Return values: +-- 1: interval table {c, s, e, st, info} +------------------------------------------------------------------------------------------------------------------------ +local function newInterval(c,s,e,st,info) + return {c=c, s=tonumber(s), e=tonumber(e), st=st, info=info } +end + +local function printInterval(i) + print(i.c, i.s, i.e, i.st, i.info) +end + +local function intervalContainsInterval(a, b) + return a.c == b.c and a.s <= b.s and a.e >= b.e +end + +local function isInterceptingInterval(a, b) + return a.s < b.s and a.e < b.e and a.e > b.s +end + +local function findInterval(i, intervals) + local start = 1 + local finish = table.getn(intervals) + local current = math.floor((start + finish) / 2) + +-- print("findInterval: ", start, finish, current, intervals[current].c, intervals[current].s, intervals[current].e) + + while start < finish and not intervalContainsInterval(i, intervals[current]) and not isInterceptingInterval(i, intervals[current]) do + if compIntervals(i, intervals[current]) < 0 then + finish = current - 1 + else + start = current + 1 + end + current = math.floor((start + finish) / 2) +-- print("findInterval: ", start, finish, current, intervals[current].c, intervals[current].s, intervals[current].e) + end + return intervalContainsInterval(i, intervals[current]), current +end + + + + +local a, header = readIntervals(targetList) +io.write(header) + +for l in io.lines(targetSet) do + if not isIntervalHeaderLine(l) then + local c, s, e, st, info = parseIntervalLine(l) + local intA = newInterval(c,s,e,st,info) +-- print("Debug: ", c,s,e,info) + local intervalExists, i = findInterval(intA, a) + if intervalExists then + print(a[i].c, a[i].s, a[i].e, st, info) + end + end +end diff --git a/lua/sortGenesByCoverage.lua b/lua/sortGenesByCoverage.lua new file mode 100644 index 000000000..4bbf2b364 --- /dev/null +++ b/lua/sortGenesByCoverage.lua @@ -0,0 +1,66 @@ +-- This script takes the interval summary output from GATK's Depth of Coverage and the +-- interval list for 'genes of interest' and generates an intervals file sorted by +-- coverage. +-- +-- Author: carneiro +-- Date: 5/26/2011 +------------------------------------------------------------------------------- +-- Global script variables +------------------------------------------------------------------------------- +local coverage = arg[1] +local genesOfInterest= arg[2] or "/Volumes/humgen/gsa-hpprojects/dev/carneiro/top_genes/data/disease.genes.annotated.interval_list" + +local genes = {} +local header = "" + +------------------------------------------------------------------------------- +-- Helper functions +------------------------------------------------------------------------------- + +local function isSameGene(i, c, s, e) + return genes[i].c == c and s == genes[i].s and e == genes[i].e +end + +local function isMergedGene(i, c, s, e) + return genes[i].c == c and s <= genes[i].s and e >= genes[i].e +end + +local function comp(a,b) + return a.avgCoverage < b.avgCoverage +end + +for l in io.lines(genesOfInterest) do + if l:sub(1,1) == "@" then header = header .. l .."\n" + else + local c, s, e, info = l:match("(%w+)%s+(%d+)%s+(%d+)%s+%+%s+(.*)") +-- print("Debug: adding gene: ",c,s,e,info) + table.insert(genes, {c=c, s=tonumber(s), e=tonumber(e), info=info}) + end +end + +local i = 1 +for l in io.lines(coverage) do + local geneOk = false + if l:match("(%w+)") ~= "Target" then -- skip the first line (header) + local c, s, e, totalCoverage, avgCoverage = l:match("(%w+):(%d+)%-(%d+)%s+(%d+)%s+([%d%.]+)") + s = tonumber(s) + e = tonumber(e) + while genes[i] ~= nil and (isSameGene(i, c, s, e) or isMergedGene(i, c, s, e)) do +-- print("Debug: ", c, s, e, "--",genes[i].c, genes[i].s, genes[i].e) + genes[i].totalCoverage = tonumber(totalCoverage) + genes[i].avgCoverage = tonumber(avgCoverage) + geneOk = true + i = i + 1 + end + if not geneOk then + error("Warning: Gene mismatch! Crazy!", c,s,e,"--",genes[i].c, genes[i].s, genes[i].e) + break + end + end +end + +table.sort(genes, comp) +io.write(header) +for _, g in ipairs(genes) do + print(g.c,g.s,g.e,"+",g.avgCoverage..":::"..g.totalCoverage..":::"..g.info) +end