Compare commits
969 Commits
7807c96882
...
a37cb500e2
| Author | SHA1 | Date |
|---|---|---|
|
|
a37cb500e2 | |
|
|
ea9e707238 | |
|
|
139f68fc4c | |
|
|
94248a8cea | |
|
|
2e603e4581 | |
|
|
c77ace7059 | |
|
|
ac612b6f5a | |
|
|
165e524e5c | |
|
|
ab01ab4490 | |
|
|
b56db22548 | |
|
|
2160a0c7de | |
|
|
2d4272bc82 | |
|
|
ceaaa6d9cb | |
|
|
8c92aef23d | |
|
|
58e3ed223f | |
|
|
41e007868c | |
|
|
cb7e5dffcf | |
|
|
40b3aeed56 | |
|
|
0747fcc09d | |
|
|
ef52b4ac19 | |
|
|
760098d1c2 | |
|
|
4bf3cdf948 | |
|
|
6b18630a62 | |
|
|
d8dd308a1f | |
|
|
3ddd7b87d4 | |
|
|
fe209ff1ca | |
|
|
f033d4e4e7 | |
|
|
765fac1070 | |
|
|
fbfffc9031 | |
|
|
34374c5613 | |
|
|
110bf9b8ed | |
|
|
f1d1fd7c42 | |
|
|
b9accf95ad | |
|
|
84b2abb675 | |
|
|
50122bed2a | |
|
|
ab9e0e0226 | |
|
|
e050ec9a10 | |
|
|
e5985723cc | |
|
|
13b5637fe6 | |
|
|
c4a010d19f | |
|
|
436aceb4ef | |
|
|
3072c80e9c | |
|
|
2a1ae7b6f3 | |
|
|
3a2b0c05dd | |
|
|
38e751a40a | |
|
|
20d0a13092 | |
|
|
f090f93460 | |
|
|
e624290ad4 | |
|
|
27dd1da702 | |
|
|
11d53b06fb | |
|
|
eb7dbc1429 | |
|
|
fde1fd7ce3 | |
|
|
eef04271e6 | |
|
|
1a2b0c001a | |
|
|
824e00e7ac | |
|
|
ab64326ae6 | |
|
|
9f26bfcc77 | |
|
|
b582816211 | |
|
|
340babdd67 | |
|
|
a9c688ac92 | |
|
|
47d9fb27d7 | |
|
|
5d15c8b09f | |
|
|
950ea8cf40 | |
|
|
83260ccb10 | |
|
|
1eee77ad93 | |
|
|
24a8d66481 | |
|
|
89a5c0c5c8 | |
|
|
0976272d99 | |
|
|
a54e76945e | |
|
|
31bf49d814 | |
|
|
498973642f | |
|
|
87ed015b24 | |
|
|
c56c2e4677 | |
|
|
3b96dcee7f | |
|
|
bcb475cd03 | |
|
|
690649872b | |
|
|
a61b1dc610 | |
|
|
5ede7eb981 | |
|
|
bc58bf265d | |
|
|
d28da05f36 | |
|
|
ab3a92bc73 | |
|
|
05c501332c | |
|
|
298284d754 | |
|
|
7a77cc4785 | |
|
|
6a5caf764f | |
|
|
1972a97813 | |
|
|
6442d009cd | |
|
|
beff2e27f8 | |
|
|
af87825525 | |
|
|
9dfb9708db | |
|
|
8cc02badd9 | |
|
|
00e59ddbae | |
|
|
5961611c35 | |
|
|
f123871451 | |
|
|
08764215c6 | |
|
|
850dd811fc | |
|
|
bdfb9efaaa | |
|
|
58bcc251f2 | |
|
|
ee730f3832 | |
|
|
5db0c362f5 | |
|
|
fe545f6ad5 | |
|
|
7ec3261877 | |
|
|
6b9be61666 | |
|
|
d668311f37 | |
|
|
3c038250f9 | |
|
|
d6636daead | |
|
|
e10fc577e1 | |
|
|
bb7533e32c | |
|
|
6460ef5572 | |
|
|
2c078eb667 | |
|
|
83662032a2 | |
|
|
9e5852c540 | |
|
|
67aa1ca731 | |
|
|
b94a78adbf | |
|
|
f1ce529cd9 | |
|
|
11437a14f4 | |
|
|
5ecdc66e46 | |
|
|
f9a7869079 | |
|
|
4820d8fc32 | |
|
|
9a828344ec | |
|
|
099c24f5fc | |
|
|
cc9eef2d33 | |
|
|
86170ffd2f | |
|
|
25097a9a88 | |
|
|
ede7b092a2 | |
|
|
1e6a10ce2d | |
|
|
6f0ea0b7e3 | |
|
|
fdf47000e7 | |
|
|
c819643412 | |
|
|
1247dc2346 | |
|
|
616a53491d | |
|
|
feeb41f42d | |
|
|
be2020c4af | |
|
|
1b2e9a6d89 | |
|
|
1ec736e06a | |
|
|
176a7ec67e | |
|
|
9be943cec0 | |
|
|
f9ce2a6c73 | |
|
|
e3ea2a8222 | |
|
|
c2efebefa7 | |
|
|
0911122749 | |
|
|
e8d531189f | |
|
|
6c668d3e1a | |
|
|
9812521922 | |
|
|
b1a96f58b6 | |
|
|
eb428d7d31 | |
|
|
3ae437ac13 | |
|
|
163f949a33 | |
|
|
1e29bccc65 | |
|
|
1f99921b73 | |
|
|
5a340797e7 | |
|
|
8211fbcb62 | |
|
|
e746a3eac7 | |
|
|
3ae8d4b80b | |
|
|
b9bbf2d3df | |
|
|
b8e189c1e5 | |
|
|
408fd5e072 | |
|
|
d7ce7e1395 | |
|
|
ed95769a33 | |
|
|
c05a721f28 | |
|
|
5aba188969 | |
|
|
32c4e3fe5a | |
|
|
4ffdc7b5ad | |
|
|
c561759222 | |
|
|
4e705c037e | |
|
|
925ddfb697 | |
|
|
139df93af9 | |
|
|
efa0dd561a | |
|
|
9649f3496f | |
|
|
393735d7f1 | |
|
|
ae4aa77b50 | |
|
|
d6e8e5929b | |
|
|
309d1c543d | |
|
|
9d8c906e4c | |
|
|
cb8992459f | |
|
|
3633436a3d | |
|
|
decc812f77 | |
|
|
6609bafa17 | |
|
|
d58beeee56 | |
|
|
8422950472 | |
|
|
d36083bdc5 | |
|
|
84024101fa | |
|
|
7e29a01b77 | |
|
|
1993cb634d | |
|
|
9ade171f7a | |
|
|
b5f6ed3020 | |
|
|
15c93e7a52 | |
|
|
b31992991a | |
|
|
bb7a97f68c | |
|
|
82ef0500e6 | |
|
|
56f1056b49 | |
|
|
61ab329639 | |
|
|
80e4ecfa79 | |
|
|
6f9bf9a4ae | |
|
|
eb664c2fe8 | |
|
|
cee8149b12 | |
|
|
ad9d12c04d | |
|
|
bf053c779b | |
|
|
0071c4ab22 | |
|
|
792be78a80 | |
|
|
050482ef77 | |
|
|
65c637b036 | |
|
|
4373a077d5 | |
|
|
a5c144db4d | |
|
|
fffe52a45b | |
|
|
904afefbc6 | |
|
|
c5fe18438a | |
|
|
7eabcf42c4 | |
|
|
a81603217e | |
|
|
b2b42cea7e | |
|
|
4e97cdfde8 | |
|
|
c2f2cb03ce | |
|
|
1bee29ef44 | |
|
|
10de1e0b74 | |
|
|
0eee4fa1b0 | |
|
|
2a41772d0a | |
|
|
e5d7f4fc30 | |
|
|
ccf03ffd22 | |
|
|
de268247db | |
|
|
12fff1ad57 | |
|
|
96fba0f2f4 | |
|
|
ace18171b0 | |
|
|
4cbb0cb829 | |
|
|
9dbf152e31 | |
|
|
40c4a6ffe1 | |
|
|
4bc01e90c8 | |
|
|
b7d0a2d537 | |
|
|
9bf898fa10 | |
|
|
6b4e6bf7dd | |
|
|
6db2fd98aa | |
|
|
9867634f72 | |
|
|
6f39715e6e | |
|
|
5c87869d91 | |
|
|
dbe74ca9b8 | |
|
|
0f8a164bd3 | |
|
|
2bfcc421d9 | |
|
|
7327f8fa10 | |
|
|
33938e1ed0 | |
|
|
ba3ab0ca98 | |
|
|
7a8019e6ee | |
|
|
7c362bef69 | |
|
|
3a1da27c1c | |
|
|
d61a1226a8 | |
|
|
b45442fbc1 | |
|
|
f005ea4bf3 | |
|
|
af606fc31a | |
|
|
3d129be642 | |
|
|
4177d6c2c7 | |
|
|
60b728487a | |
|
|
282130a64e | |
|
|
5e00d08346 | |
|
|
574098d034 | |
|
|
acf4922fe4 | |
|
|
4260699a09 | |
|
|
e3adc4275b | |
|
|
76a15ea91b | |
|
|
2bfa3c767b | |
|
|
497913d404 | |
|
|
a6b5a30dab | |
|
|
038af2a551 | |
|
|
25d45512d9 | |
|
|
6f41a27e27 | |
|
|
3370ae9e35 | |
|
|
6e13694721 | |
|
|
e65dc133f4 | |
|
|
f968b4b73d | |
|
|
059c8eb899 | |
|
|
c4ac1bee88 | |
|
|
fa50c1b77f | |
|
|
1b2eb4c9ab | |
|
|
d0a796b644 | |
|
|
c8ae5029b3 | |
|
|
a06646493b | |
|
|
a55aac8a43 | |
|
|
6deafdadca | |
|
|
8af17b3478 | |
|
|
c280274331 | |
|
|
44a185f2c1 | |
|
|
67c2d69218 | |
|
|
fab1f2b561 | |
|
|
76a365a95f | |
|
|
d8d8b230d1 | |
|
|
e318d8e7e5 | |
|
|
ad0da1418f | |
|
|
97a3102c89 | |
|
|
57436ab405 | |
|
|
6804f17d38 | |
|
|
bfd5e1840f | |
|
|
6a0952948d | |
|
|
c5e859b49f | |
|
|
c44357926f | |
|
|
71277f0fea | |
|
|
2a18fa114f | |
|
|
647b0e828a | |
|
|
c79427a22d | |
|
|
15becdd3df | |
|
|
df20911110 | |
|
|
2485a3ca02 | |
|
|
bf08f76a3c | |
|
|
4356fd485c | |
|
|
2d7ec6b051 | |
|
|
2c3619d569 | |
|
|
7026b3ec68 | |
|
|
cd44ac48cd | |
|
|
c4bdf1678d | |
|
|
0a526c698a | |
|
|
7b62fbb4ba | |
|
|
d802cfb7f5 | |
|
|
a03d01f944 | |
|
|
039ea20639 | |
|
|
0a2cf98293 | |
|
|
dae4ca3ced | |
|
|
8d61316cc8 | |
|
|
e8e5bbc071 | |
|
|
5fc0f44e27 | |
|
|
a33ca20499 | |
|
|
fd42d30910 | |
|
|
23bc50e240 | |
|
|
7426a750ec | |
|
|
9af36064e8 | |
|
|
7cbcbc4abf | |
|
|
b22bed2b5e | |
|
|
954ee5eda9 | |
|
|
030825ae85 | |
|
|
ba7ed47d40 | |
|
|
70e9ed0ea9 | |
|
|
a41afe4c97 | |
|
|
dd1c6619d7 | |
|
|
b9026ecafd | |
|
|
a32d44d8d6 | |
|
|
c982443210 | |
|
|
d677a4325b | |
|
|
f5a0e576e6 | |
|
|
825ae92e58 | |
|
|
6f37c14f26 | |
|
|
945d39fb79 | |
|
|
a458442b24 | |
|
|
92bc6849a3 | |
|
|
90518f11e3 | |
|
|
152d25ff87 | |
|
|
230cd16712 | |
|
|
d0af3648d1 | |
|
|
1bdd791539 | |
|
|
7244d66833 | |
|
|
df09f38ad3 | |
|
|
43ce69104d | |
|
|
d013ea94b0 | |
|
|
4b6eeb34c8 | |
|
|
624687b072 | |
|
|
5d26ab0ee3 | |
|
|
41b137d202 | |
|
|
b07587f806 | |
|
|
bd85af08ab | |
|
|
aee53f1334 | |
|
|
015ab3f6c3 | |
|
|
8116bcc786 | |
|
|
dfd8c9eeb6 | |
|
|
8d2b93156b | |
|
|
6739b713dd | |
|
|
f4aedddee6 | |
|
|
ca61fe3ad5 | |
|
|
1934f0cf24 | |
|
|
e4ad616d43 | |
|
|
35ac99b4f7 | |
|
|
c7679ee781 | |
|
|
9c5fd803e9 | |
|
|
1e611b235c | |
|
|
d9f654908f | |
|
|
ec55ed8d83 | |
|
|
b5cba257c1 | |
|
|
bf7d1d46ca | |
|
|
1bba5ef20e | |
|
|
1de61b556e | |
|
|
8a230db596 | |
|
|
b73f4c977a | |
|
|
0a6f2660b4 | |
|
|
c1df343740 | |
|
|
9bdcf31dec | |
|
|
a1a57d97be | |
|
|
d9dfab2ba2 | |
|
|
b5415ba23d | |
|
|
5e56bec0f4 | |
|
|
482a47a57e | |
|
|
705aa53894 | |
|
|
7fd6a11569 | |
|
|
cffff4338f | |
|
|
3efc33160c | |
|
|
031d3d83ce | |
|
|
be74dbc00c | |
|
|
e4752b321b | |
|
|
f00cc94e1d | |
|
|
a5ad0cff7f | |
|
|
012d54fc49 | |
|
|
8d2986ece2 | |
|
|
8f3e7d573b | |
|
|
d67e923026 | |
|
|
1627f9dfae | |
|
|
061c63f36a | |
|
|
0168f39eeb | |
|
|
08517ac09b | |
|
|
a35a6c2580 | |
|
|
39a6cd5bb0 | |
|
|
cfe6996173 | |
|
|
43b498a37e | |
|
|
c9b33502f3 | |
|
|
e56572e88f | |
|
|
1ced2f386c | |
|
|
4b951592fb | |
|
|
6ac8dd5840 | |
|
|
81970e7687 | |
|
|
c50ee8c841 | |
|
|
8d37726325 | |
|
|
024195db62 | |
|
|
ce3c198245 | |
|
|
f21d6498bc | |
|
|
e8f28cb529 | |
|
|
6db761e269 | |
|
|
8763e0ced7 | |
|
|
b7076848ab | |
|
|
fa20c71920 | |
|
|
7954e77a1b | |
|
|
4b2441069f | |
|
|
5aedc978d1 | |
|
|
c6c943f9d7 | |
|
|
d59d78838c | |
|
|
88f89be60e | |
|
|
11698fc4e5 | |
|
|
b603fed39c | |
|
|
44754cd615 | |
|
|
dadd5d6281 | |
|
|
76bb49e01b | |
|
|
6052d3015b | |
|
|
df65893fb5 | |
|
|
b92bbb47e5 | |
|
|
8c12ec4a4b | |
|
|
954cfd766d | |
|
|
b93fca2b2e | |
|
|
48847af2fc | |
|
|
00a07f61bf | |
|
|
45f24b4ae8 | |
|
|
bdb7b000cd | |
|
|
4e22270eba | |
|
|
a38b065642 | |
|
|
6d4a6debdc | |
|
|
1209f161c9 | |
|
|
836d464239 | |
|
|
d5877ad0a9 | |
|
|
f2b7d67ed9 | |
|
|
c9645579bd | |
|
|
685cd235a6 | |
|
|
42e2a7ad46 | |
|
|
bbcabfe342 | |
|
|
658f27eae4 | |
|
|
6fda93502f | |
|
|
db4b171fa6 | |
|
|
07182d9061 | |
|
|
7d25fe2de3 | |
|
|
e80bccc923 | |
|
|
f02cd42679 | |
|
|
8638cfadc8 | |
|
|
e2d0c996e9 | |
|
|
0eeacbbe39 | |
|
|
23e0e99ec0 | |
|
|
ccbbe48c4f | |
|
|
22c1d3cecc | |
|
|
cf6661a8fe | |
|
|
bcf6d1b355 | |
|
|
8220008564 | |
|
|
db58392e9b | |
|
|
d766591c1e | |
|
|
99f6f9a0d1 | |
|
|
c0a308a8b6 | |
|
|
f12dfae772 | |
|
|
b45aeb87e1 | |
|
|
172ba83241 | |
|
|
114901b005 | |
|
|
41f720dfa7 | |
|
|
066ec4aa95 | |
|
|
b6bd33b26c | |
|
|
b3225581be | |
|
|
acfe7613db | |
|
|
9a5705289c | |
|
|
3efb7c0e91 | |
|
|
127c00cc96 | |
|
|
b27bdf1ae0 | |
|
|
b7076d9023 | |
|
|
417c6d66c7 | |
|
|
9ce50a4e5e | |
|
|
578bb55c38 | |
|
|
0c783399e8 | |
|
|
8f9aeef4ec | |
|
|
e6931bec03 | |
|
|
7d63e76245 | |
|
|
8929bd1c25 | |
|
|
8ede4ffbfa | |
|
|
2e9463ebf1 | |
|
|
1c19bc630f | |
|
|
e879817373 | |
|
|
ce026a07fc | |
|
|
17fb85a227 | |
|
|
52391a9855 | |
|
|
bdd14d2946 | |
|
|
4adc34eccb | |
|
|
14aa43cca0 | |
|
|
7c50bad567 | |
|
|
5fdab3ae13 | |
|
|
f524c7d3d8 | |
|
|
ea3dc2f003 | |
|
|
d17ae1e808 | |
|
|
c26ba4e376 | |
|
|
10cb6b0507 | |
|
|
3afcdc7746 | |
|
|
74a1a53499 | |
|
|
f70d80a5a2 | |
|
|
8b6ec74907 | |
|
|
4219e58623 | |
|
|
29aa855432 | |
|
|
ff4762f3c7 | |
|
|
6e3fa0515a | |
|
|
ff6faf811a | |
|
|
deb19593aa | |
|
|
19d33faa30 | |
|
|
c564653b40 | |
|
|
7144a0cefc | |
|
|
ebb7b02e9b | |
|
|
b51a66e4c1 | |
|
|
1e2cff20ba | |
|
|
1346f03ff1 | |
|
|
451d60f3be | |
|
|
623da055e1 | |
|
|
ed78df9184 | |
|
|
3b84c03c1e | |
|
|
426e54740c | |
|
|
b88718d8f4 | |
|
|
128ffc089b | |
|
|
20dc9dd41a | |
|
|
7ec8b5c9e7 | |
|
|
ef18cb91cb | |
|
|
01c9cd152b | |
|
|
4e992769d5 | |
|
|
73619754f8 | |
|
|
6fed8fa0b4 | |
|
|
02d9bf123f | |
|
|
607e11d43d | |
|
|
599e840779 | |
|
|
bde5005f39 | |
|
|
3d2450ed97 | |
|
|
9441bb7f2a | |
|
|
0e759bc1f5 | |
|
|
9735d7a31a | |
|
|
9a6abe51b6 | |
|
|
338ebfaae0 | |
|
|
5c43a1fdc9 | |
|
|
96e445d9e4 | |
|
|
0aa7e0a402 | |
|
|
e88529687f | |
|
|
1a2bd2cf91 | |
|
|
19cb7cd7ed | |
|
|
8896cb942e | |
|
|
b3d0a13b32 | |
|
|
6b59bc5500 | |
|
|
c14aaad1ce | |
|
|
2f6897c72b | |
|
|
78ed00021f | |
|
|
4cb5110d03 | |
|
|
f6ae0d4d0f | |
|
|
3f8caef33c | |
|
|
db7a98636f | |
|
|
f0c94d80d1 | |
|
|
be11e27e12 | |
|
|
2087dc162f | |
|
|
3dd10bd7db | |
|
|
90ecd344ba | |
|
|
499cf4c00d | |
|
|
47520134e7 | |
|
|
3d8a8c1e37 | |
|
|
53bb846407 | |
|
|
d64eaa851d | |
|
|
d7ca0885eb | |
|
|
1e118e0823 | |
|
|
c89756e2b0 | |
|
|
8437cd4edd | |
|
|
1e3cadbfc2 | |
|
|
c862a1a396 | |
|
|
9346acde1b | |
|
|
7dec00c217 | |
|
|
dd51177837 | |
|
|
e5355fe3a0 | |
|
|
cca27c1ef5 | |
|
|
925aa6f60d | |
|
|
bdf34f6ce7 | |
|
|
c29b176cb6 | |
|
|
aa7cdf4bb3 | |
|
|
dab5b17c1a | |
|
|
6c665189ad | |
|
|
0f88103d2a | |
|
|
26f4c704ed | |
|
|
0b0455ca51 | |
|
|
ebb45dc42e | |
|
|
c7edaa8e84 | |
|
|
47952b6f3f | |
|
|
8f0d439913 | |
|
|
9228e48efd | |
|
|
5581cb9152 | |
|
|
2d01a297fb | |
|
|
740d2c1314 | |
|
|
9ea7f83974 | |
|
|
1d132a546d | |
|
|
5370bb23a3 | |
|
|
66c9783daf | |
|
|
af7b4d8980 | |
|
|
274c0ac96c | |
|
|
017be45407 | |
|
|
aabd990e8f | |
|
|
b5b50ac8da | |
|
|
b0a76884e8 | |
|
|
503ca9ed2e | |
|
|
1cadfa1552 | |
|
|
3e3236dfc4 | |
|
|
72817b664e | |
|
|
557d50c7e1 | |
|
|
042e1f4442 | |
|
|
773b86331b | |
|
|
5fbd454682 | |
|
|
6476343a83 | |
|
|
a76b75f41e | |
|
|
25366c7220 | |
|
|
98f8966750 | |
|
|
bb37e14d02 | |
|
|
e6c262594f | |
|
|
086c9d0e7d | |
|
|
07921659cf | |
|
|
8a078cc16d | |
|
|
efd9769b07 | |
|
|
e0991d6a45 | |
|
|
d6096c3f99 | |
|
|
59bc9341f6 | |
|
|
733410b50d | |
|
|
40f1214736 | |
|
|
7e00dbcac5 | |
|
|
1a451df800 | |
|
|
d35f33b513 | |
|
|
35fb7f9fdf | |
|
|
3e4a178e08 | |
|
|
6beab5f765 | |
|
|
3d33ab063e | |
|
|
c5434ac865 | |
|
|
39fcde9c19 | |
|
|
f3cff1c609 | |
|
|
a33b9c0633 | |
|
|
6a4d8c79d8 | |
|
|
df7c3f0000 | |
|
|
64d92d26df | |
|
|
4bb0bdddca | |
|
|
b7791105bc | |
|
|
aef179a580 | |
|
|
292e92b602 | |
|
|
e620f0ff4e | |
|
|
b621d3ae38 | |
|
|
65e099df34 | |
|
|
0b533385ef | |
|
|
ee80fb8bd0 | |
|
|
acd1ab607b | |
|
|
54ab3bbec7 | |
|
|
98787f0ae0 | |
|
|
32f2d60a2e | |
|
|
619ac4f93d | |
|
|
c6b226d719 | |
|
|
bfb2583d7f | |
|
|
aa92c720b5 | |
|
|
80e1137a6c | |
|
|
fd67064207 | |
|
|
264d5e42e5 | |
|
|
c848b44481 | |
|
|
174fe0f1d5 | |
|
|
e70c7c2a71 | |
|
|
61dd3bf13a | |
|
|
77b5b586ad | |
|
|
30cc8a95d1 | |
|
|
d19e834d84 | |
|
|
20aa848b3c | |
|
|
9957e04590 | |
|
|
e9e5ee6a3d | |
|
|
5092211d75 | |
|
|
5ead86acd3 | |
|
|
514563bd0a | |
|
|
4dc982a3c7 | |
|
|
570e082b38 | |
|
|
29e41b592c | |
|
|
0b4a40dc25 | |
|
|
85775c3384 | |
|
|
6bdccf2a8a | |
|
|
ee59a13109 | |
|
|
cda85be059 | |
|
|
6e7903e9f3 | |
|
|
b4c38bcc1c | |
|
|
33236de32e | |
|
|
ee4540c394 | |
|
|
cfa7165036 | |
|
|
67543f19a1 | |
|
|
e613195e17 | |
|
|
3c330d5049 | |
|
|
d460f2ec9e | |
|
|
a19ab654df | |
|
|
dd85c528d6 | |
|
|
904c3205c0 | |
|
|
6230f86799 | |
|
|
545fb87feb | |
|
|
f122fad562 | |
|
|
17c123d65a | |
|
|
ba15b787cb | |
|
|
6c1a113753 | |
|
|
38fc5c8822 | |
|
|
6a16edc15e | |
|
|
c0093264de | |
|
|
c5ce72f593 | |
|
|
ed08d08f36 | |
|
|
dfc63acc11 | |
|
|
d5820177c6 | |
|
|
81fe6f8e38 | |
|
|
58e4cc207f | |
|
|
d4cf6d97a6 | |
|
|
a578688fa8 | |
|
|
cfbc4c89e3 | |
|
|
54da54ffd4 | |
|
|
f8829318cf | |
|
|
84a328764a | |
|
|
a9cae8c9af | |
|
|
41624fb347 | |
|
|
ea8f4f4d34 | |
|
|
5626fe29b7 | |
|
|
a7d574d125 | |
|
|
688872fb1b | |
|
|
66585b7982 | |
|
|
f0a6285aba | |
|
|
ea9fc7df48 | |
|
|
8ee464478a | |
|
|
fe2236f6fe | |
|
|
5f8c6efbc3 | |
|
|
df1ff2b36e | |
|
|
688b524cdf | |
|
|
87d619a21f | |
|
|
557daabf38 | |
|
|
28a7d501f2 | |
|
|
604e3d8da1 | |
|
|
325ba8213b | |
|
|
cd0969332f | |
|
|
22b79b3475 | |
|
|
2fc469d0c9 | |
|
|
cfdc938fc3 | |
|
|
95d18449b3 | |
|
|
6ad5a3c086 | |
|
|
e5ab59db53 | |
|
|
13288e2dcd | |
|
|
dcb190069a | |
|
|
99907c98fb | |
|
|
b6006cbe9d | |
|
|
4431e359e2 | |
|
|
987d4b4205 | |
|
|
59eaf650ac | |
|
|
f4c0672800 | |
|
|
c310fb7424 | |
|
|
cb55617f50 | |
|
|
95a79afe71 | |
|
|
829664d6b5 | |
|
|
b2c7148dc9 | |
|
|
39607065e0 | |
|
|
2848d3045a | |
|
|
220fc39e9d | |
|
|
fdb0a7405f | |
|
|
057b292dde | |
|
|
1bf1a674a8 | |
|
|
245505deed | |
|
|
d8e4d57956 | |
|
|
d890c7997c | |
|
|
45b0d3423a | |
|
|
cd6bd524d4 | |
|
|
83a49f3210 | |
|
|
6ba11ab68c | |
|
|
ff3fea115c | |
|
|
27fdf6397d | |
|
|
49f2bcc015 | |
|
|
1fd51fc3f7 | |
|
|
bfeb37c4de | |
|
|
5dc398cdef | |
|
|
a09db69037 | |
|
|
901d28d5f5 | |
|
|
5a0b32bfd2 | |
|
|
a9292d674d | |
|
|
e65b2096f7 | |
|
|
797a8c147e | |
|
|
a61288c768 | |
|
|
14e6a7bdb9 | |
|
|
1e16f3e701 | |
|
|
86caae811e | |
|
|
1bc9712cd8 | |
|
|
7e1466c885 | |
|
|
d91e320972 | |
|
|
7067af833d | |
|
|
d6a73c9171 | |
|
|
9d0cdb2d3c | |
|
|
c589b42fb5 | |
|
|
29c8546679 | |
|
|
788e9d1e3d | |
|
|
f27bd18f20 | |
|
|
5bfa45a69b | |
|
|
666638a953 | |
|
|
ba18db1a9f | |
|
|
f83dea36d8 | |
|
|
2093398231 | |
|
|
e8a1962efe | |
|
|
92b084e553 | |
|
|
d25a87cc50 | |
|
|
00e5302219 | |
|
|
7ab4b3321f | |
|
|
f8f3b7577a | |
|
|
620ad6e5b9 | |
|
|
abc675f278 | |
|
|
86f2e134ba | |
|
|
5d372cef65 | |
|
|
8977737460 | |
|
|
6c19c9640c | |
|
|
91debf412b | |
|
|
5a4a0c4173 | |
|
|
6de74888fd | |
|
|
543c719a54 | |
|
|
6641788d38 | |
|
|
4f4e998d7f | |
|
|
55f1b36534 | |
|
|
b081ac9b8b | |
|
|
752ce69b78 | |
|
|
292f9061ab | |
|
|
3abfd0743a | |
|
|
fa51f029b9 | |
|
|
f44edd4fc9 | |
|
|
0ae318be0d | |
|
|
f89649ee26 | |
|
|
84d34e1d0b | |
|
|
09ee115dcc | |
|
|
29ed2d8287 | |
|
|
d97ff6bf72 | |
|
|
72f1a95448 | |
|
|
cd818687ac | |
|
|
4f61e2b7f5 | |
|
|
9c486fa41e | |
|
|
3b5a9e5595 | |
|
|
ca93a71e6e | |
|
|
080726cb47 | |
|
|
1cef219667 | |
|
|
66154ff5d2 | |
|
|
a1abfe9977 | |
|
|
cff473393c | |
|
|
790df95e1a | |
|
|
36f2fd6238 | |
|
|
d3169804f6 | |
|
|
173b93dfd4 | |
|
|
d33abf127f | |
|
|
c875085b2b | |
|
|
bdc953cad9 | |
|
|
a471f1918b | |
|
|
91a4a0c8ea | |
|
|
64e353ce49 | |
|
|
bf65b6463a | |
|
|
150bfbdef4 | |
|
|
b5170e0efa | |
|
|
717959e819 | |
|
|
eeedda105d | |
|
|
196b50dde3 | |
|
|
9f84384ed2 | |
|
|
b17b6577ac | |
|
|
107a9870ef | |
|
|
dec584d50b | |
|
|
84aa3fa696 | |
|
|
182cb2e89c | |
|
|
dc4008936c | |
|
|
8f89f55484 | |
|
|
c8edcafb44 | |
|
|
162f220deb | |
|
|
9f2c77880d | |
|
|
770a5f2ae0 | |
|
|
f1517b845c | |
|
|
7544aca718 | |
|
|
2f3cdcd55b | |
|
|
56a18659b6 | |
|
|
8060693411 | |
|
|
fa8cfe5567 | |
|
|
b42910ada6 | |
|
|
e06685db45 | |
|
|
c8c79ef024 | |
|
|
17eaac5a21 | |
|
|
a29b179045 | |
|
|
06687a33b9 | |
|
|
a26096dd75 | |
|
|
4083fe9413 | |
|
|
673ae4aaf8 | |
|
|
02946df28a | |
|
|
7babb54e4c | |
|
|
7467671c30 | |
|
|
e890b8ac2e | |
|
|
55059443bd | |
|
|
7664795ffb | |
|
|
7168f5c10a | |
|
|
4813257d4f | |
|
|
f56edd07dd | |
|
|
aabb807734 | |
|
|
ca809a44d9 | |
|
|
7626595e3a | |
|
|
22c2252e15 | |
|
|
7b4266a6e5 | |
|
|
b59fd2bf47 | |
|
|
8f3c780552 | |
|
|
1f970b4557 | |
|
|
eed46e802a | |
|
|
b204437c7e | |
|
|
8512b55ce3 | |
|
|
4c43c5914d | |
|
|
29c3acfb31 | |
|
|
b3397a1f14 | |
|
|
26b77eabef | |
|
|
1cb409aaf2 | |
|
|
fe9da3c704 | |
|
|
66629512d9 | |
|
|
ec307a10e6 | |
|
|
098f44cd51 | |
|
|
156852b1dd | |
|
|
2d2db5d50f | |
|
|
46123639cf | |
|
|
b96f180a15 | |
|
|
70da24e177 | |
|
|
b7e8c4c5aa | |
|
|
d6155ecf90 | |
|
|
c948c647a0 | |
|
|
b6d807b0b7 | |
|
|
1f02acbe09 | |
|
|
573ad0c98f | |
|
|
95b1ab7e96 | |
|
|
3114edcb7c | |
|
|
d2f357af3a | |
|
|
d70754e234 | |
|
|
80e02281d5 | |
|
|
2255c4cd4b | |
|
|
beedc988f0 | |
|
|
3536e2113c | |
|
|
4f7734eb97 | |
|
|
db59a605d1 | |
|
|
8f115a8e00 | |
|
|
36cd4f9882 | |
|
|
d11674367d | |
|
|
72563c38f3 | |
|
|
a74523a68d | |
|
|
243e735431 | |
|
|
4d064c69ce | |
|
|
8d6b859bf8 | |
|
|
87664941b0 | |
|
|
7fd8948689 | |
|
|
dac5395126 | |
|
|
0380cf02bf | |
|
|
1d7d8be9e8 | |
|
|
51d354cd28 | |
|
|
10721ca602 | |
|
|
f335b33624 | |
|
|
5b362b8718 | |
|
|
f85533c3c7 | |
|
|
ac93bbc6e4 | |
|
|
94122ad6ee | |
|
|
fbedf6370b | |
|
|
5e30884730 | |
|
|
007c3eb75d | |
|
|
58628ff263 |
|
|
@ -0,0 +1,46 @@
|
|||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
compiler: [gcc, clang]
|
||||
|
||||
steps:
|
||||
- name: Checkout bwa
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Compile with ${{ matrix.compiler }}
|
||||
run: make CC=${{ matrix.compiler }}
|
||||
|
||||
build-aarch64:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
compiler: [gcc, clang]
|
||||
|
||||
steps:
|
||||
- name: Checkout bwa
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Compile with ${{ matrix.compiler }}
|
||||
uses: uraimo/run-on-arch-action@v2
|
||||
with:
|
||||
arch: aarch64
|
||||
distro: ubuntu20.04
|
||||
githubToken: ${{ github.token }}
|
||||
dockerRunArgs: |
|
||||
--volume "${PWD}:/bwa"
|
||||
install: |
|
||||
apt-get update -q -y
|
||||
apt-get install -q -y make ${{ matrix.compiler }} zlib1g-dev
|
||||
run: |
|
||||
cd /bwa
|
||||
make CC=${{ matrix.compiler }}
|
||||
|
|
@ -1,3 +1,10 @@
|
|||
*.[oa]
|
||||
bwa
|
||||
test
|
||||
test64
|
||||
.*.swp
|
||||
Makefile.bak
|
||||
bwamem-lite
|
||||
# ---> C
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
|
@ -51,4 +58,3 @@ modules.order
|
|||
Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,41 @@
|
|||
{
|
||||
// 使用 IntelliSense 了解相关属性。
|
||||
// 悬停以查看现有属性的描述。
|
||||
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "bwa-mem",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/bwa",
|
||||
"args": [
|
||||
"mem",
|
||||
"-t",
|
||||
"1",
|
||||
"-M",
|
||||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"/home/zzh/data/reference/human_g1k_v37_decoy.fasta",
|
||||
"/home/zzh/data/fastq/n_s1.fq",
|
||||
"/home/zzh/data/fastq/n_s2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
{
|
||||
"name": "index",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/bwa",
|
||||
"args": [
|
||||
"index",
|
||||
"/mnt/d/data/reference/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"files.associations": {
|
||||
"bwt.h": "c",
|
||||
"bwa.h": "c",
|
||||
"malloc_wrap.h": "c",
|
||||
"bntseq.h": "c",
|
||||
"utils.h": "c",
|
||||
"rle.h": "c",
|
||||
"rope.h": "c",
|
||||
"random": "c",
|
||||
"kseq.h": "c"
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
// See https://go.microsoft.com/fwlink/?LinkId=733558
|
||||
// for the documentation about the tasks.json format
|
||||
"version": "2.0.0",
|
||||
"tasks": [
|
||||
{
|
||||
"label": "Build",
|
||||
"type": "shell",
|
||||
"command": "make clean; make -j 16",
|
||||
"problemMatcher": [],
|
||||
"group": {
|
||||
"kind": "build",
|
||||
"isDefault": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,674 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
CC= gcc
|
||||
#CC= clang --analyze
|
||||
# CFLAGS= -g -Wall -Wno-unused-function -O2
|
||||
CFLAGS= -g -Wall -Wno-unused-function #-O2
|
||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
|
||||
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
|
||||
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
bwape.o kopen.o pemerge.o maxk.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread
|
||||
SUBDIRS= .
|
||||
|
||||
ifeq ($(shell uname -s),Linux)
|
||||
LIBS += -lrt
|
||||
endif
|
||||
|
||||
.SUFFIXES:.c .o .cc
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $(CPPFLAGS) $< -o $@
|
||||
|
||||
# all, clean and depend name actions rather than files; declaring them phony
# keeps a stray file with one of those names from disabling the target.
.PHONY: all clean depend
all:$(PROG)
|
||||
|
||||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
bwamem-lite:libbwa.a example.o
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
libbwa.a:$(LOBJS)
|
||||
$(AR) -csru $@ $(LOBJS)
|
||||
|
||||
clean:
|
||||
rm -f gmon.out *.o a.out $(PROG) *~ *.a
|
||||
|
||||
depend:
|
||||
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c )
|
||||
|
||||
# DO NOT DELETE THIS LINE -- make depend depends on it.
|
||||
|
||||
QSufSort.o: QSufSort.h
|
||||
bamlite.o: bamlite.h malloc_wrap.h
|
||||
bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
|
||||
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kvec.h
|
||||
bwa.o: kseq.h
|
||||
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
|
||||
bwamem.o: ksort.h utils.h kbtree.h
|
||||
bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
|
||||
bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
|
||||
bwamem_pair.o: utils.h ksw.h
|
||||
bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
|
||||
bwape.o: ksw.h khash.h
|
||||
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
|
||||
bwase.o: bwa.h ksw.h
|
||||
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
|
||||
bwashm.o: bwa.h bntseq.h bwt.h
|
||||
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
|
||||
bwt_gen.o: QSufSort.h malloc_wrap.h
|
||||
bwt_lite.o: bwt_lite.h malloc_wrap.h
|
||||
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
|
||||
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
|
||||
bwtindex.o: bntseq.h bwa.h bwt.h utils.h rle.h rope.h malloc_wrap.h
|
||||
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
|
||||
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
|
||||
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
|
||||
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
|
||||
bwtsw2_core.o: khash.h ksort.h
|
||||
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
||||
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
|
||||
bwtsw2_pair.o: malloc_wrap.h ksw.h
|
||||
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
|
||||
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
|
||||
is.o: malloc_wrap.h
|
||||
kopen.o: malloc_wrap.h
|
||||
kstring.o: kstring.h malloc_wrap.h
|
||||
ksw.o: ksw.h neon_sse.h scalar_sse.h malloc_wrap.h
|
||||
main.o: kstring.h malloc_wrap.h utils.h
|
||||
malloc_wrap.o: malloc_wrap.h
|
||||
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
|
||||
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
|
||||
rle.o: rle.h
|
||||
rope.o: rle.h rope.h
|
||||
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
|
||||
|
|
@ -0,0 +1,402 @@
|
|||
/* QSufSort.c
|
||||
|
||||
Original source from qsufsort.c
|
||||
|
||||
Copyright 1999, N. Jesper Larsson, all rights reserved.
|
||||
|
||||
This file contains an implementation of the algorithm presented in "Faster
|
||||
Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
|
||||
Sadakane (sada@is.s.u-tokyo.ac.jp).
|
||||
|
||||
This software may be used freely for any purpose. However, when distributed,
|
||||
the original source must be clearly stated, and, when the source code is
|
||||
distributed, the copyright notice must be retained and any alterations in
|
||||
the code must be clearly marked. No warranty is given regarding the quality
|
||||
of this software.
|
||||
|
||||
Modified by Wong Chi-Kwong, 2004
|
||||
|
||||
Changes summary: - Used long variable and function names
|
||||
- Removed global variables
|
||||
- Replace pointer references with array references
|
||||
- Used insertion sort in place of selection sort and increased insertion sort threshold
|
||||
- Reconstructing suffix array from inverse becomes an option
|
||||
- Add handling where end-of-text symbol is not necessarily < all characters
|
||||
- Removed codes for supporting alphabet size > number of characters
|
||||
|
||||
No warranty is given regarding the quality of the modifications.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include "QSufSort.h"
|
||||
|
||||
/* Smaller of two values. Each argument may be evaluated more than once, so
   pass only side-effect-free expressions. */
#define min(value1, value2)						( ((value1) < (value2)) ? (value1) : (value2) )
/* Median of three values. Each argument may be evaluated more than once, so
   pass only side-effect-free expressions. */
#define med3(a, b, c)							( (a) < (b) ? ((b) < (c) ? (b) : (a) < (c) ? (c) : (a)) : ((b) > (c) ? (b) : (a) > (c) ? (c) : (a)) )
/* Exchanges a and b using t as scratch storage. Wrapped in do { } while (0)
   so that "swap(x, y, t);" is a single statement and stays correct as the
   body of an unbraced if/else or loop. */
#define swap(a, b, t)							do { (t) = (a); (a) = (b); (b) = (t); } while (0)
|
||||
|
||||
// Static functions
|
||||
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar);
|
||||
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar);
|
||||
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar);
|
||||
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
|
||||
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
|
||||
|
||||
/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
|
||||
n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
|
||||
contents of x[n] is disregarded, the n-th symbol being regarded as
|
||||
end-of-string smaller than all other symbols.*/
|
||||
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const int skipTransform)
|
||||
{
|
||||
qsint_t i, j;
|
||||
qsint_t s, negatedSortedGroupLength;
|
||||
qsint_t numSymbolAggregated;
|
||||
qsint_t numSortedPos = 1;
|
||||
qsint_t newAlphabetSize;
|
||||
|
||||
if (!skipTransform) {
|
||||
/* bucketing possible*/
|
||||
newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
|
||||
numChar, &numSymbolAggregated);
|
||||
QSufSortBucketSort(V, I, numChar, newAlphabetSize);
|
||||
I[0] = -1;
|
||||
V[numChar] = 0;
|
||||
numSortedPos = numSymbolAggregated;
|
||||
}
|
||||
|
||||
while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
|
||||
i = 0;
|
||||
negatedSortedGroupLength = 0;
|
||||
do {
|
||||
s = I[i];
|
||||
if (s < 0) {
|
||||
i -= s; /* skip over sorted group.*/
|
||||
negatedSortedGroupLength += s;
|
||||
} else {
|
||||
if (negatedSortedGroupLength) {
|
||||
I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */
|
||||
negatedSortedGroupLength = 0;
|
||||
}
|
||||
j = V[s] + 1;
|
||||
QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
|
||||
i = j;
|
||||
}
|
||||
} while (i <= numChar);
|
||||
if (negatedSortedGroupLength) {
|
||||
/* array ends with a sorted group.*/
|
||||
I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/
|
||||
}
|
||||
numSortedPos *= 2; /* double sorted-depth.*/
|
||||
}
|
||||
}
|
||||
|
||||
/* Rebuilds I from the inverse array V: for every index i in 0...numChar the
   slot I[V[i]] receives i + 1. */
void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
{
	qsint_t textPos = 0;

	while (textPos <= numChar) {
		const qsint_t rank = V[textPos];

		textPos++;
		I[rank] = textPos;
	}
}
|
||||
|
||||
/* Sorting routine called for each unsorted group. Sorts the array of integers
|
||||
(suffix numbers) of length n starting at p. The algorithm is a ternary-split
|
||||
quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
|
||||
Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
|
||||
function is based on Program 7.*/
|
||||
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar) {
|
||||
|
||||
qsint_t a, b, c, d;
|
||||
qsint_t l, m;
|
||||
qsint_t f, v, s, t;
|
||||
qsint_t tmp;
|
||||
qsint_t numItem;
|
||||
|
||||
numItem = highestPos - lowestPos + 1;
|
||||
|
||||
if (numItem <= INSERT_SORT_NUM_ITEM) {
|
||||
QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
|
||||
return;
|
||||
}
|
||||
|
||||
v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
|
||||
|
||||
a = b = lowestPos;
|
||||
c = d = highestPos;
|
||||
|
||||
while (1) {
|
||||
while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
|
||||
if (f == v) {
|
||||
swap(I[a], I[b], tmp);
|
||||
a++;
|
||||
}
|
||||
b++;
|
||||
}
|
||||
while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
|
||||
if (f == v) {
|
||||
swap(I[c], I[d], tmp);
|
||||
d--;
|
||||
}
|
||||
c--;
|
||||
}
|
||||
if (b > c)
|
||||
break;
|
||||
swap(I[b], I[c], tmp);
|
||||
b++;
|
||||
c--;
|
||||
}
|
||||
|
||||
s = a - lowestPos;
|
||||
t = b - a;
|
||||
s = min(s, t);
|
||||
for (l = lowestPos, m = b - s; m < b; l++, m++) {
|
||||
swap(I[l], I[m], tmp);
|
||||
}
|
||||
|
||||
s = d - c;
|
||||
t = highestPos - d;
|
||||
s = min(s, t);
|
||||
for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
|
||||
swap(I[l], I[m], tmp);
|
||||
}
|
||||
|
||||
s = b - a;
|
||||
t = d - c;
|
||||
if (s > 0)
|
||||
QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
|
||||
|
||||
// Update group number for equal portion
|
||||
a = lowestPos + s;
|
||||
b = highestPos - t;
|
||||
if (a == b) {
|
||||
// Sorted group
|
||||
V[I[a]] = a;
|
||||
I[a] = -1;
|
||||
} else {
|
||||
// Unsorted group
|
||||
for (c=a; c<=b; c++)
|
||||
V[I[c]] = b;
|
||||
}
|
||||
|
||||
if (t > 0)
|
||||
QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
|
||||
|
||||
}
|
||||
|
||||
/* Algorithm by Bentley & McIlroy.*/
|
||||
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar) {
|
||||
|
||||
qsint_t m;
|
||||
qsint_t keyl, keym, keyn;
|
||||
qsint_t key1, key2, key3;
|
||||
qsint_t s;
|
||||
qsint_t numItem;
|
||||
|
||||
numItem = highestPos - lowestPos + 1;
|
||||
|
||||
m = lowestPos + numItem / 2;
|
||||
|
||||
s = numItem / 8;
|
||||
key1 = KEY(V, I, lowestPos, numSortedChar);
|
||||
key2 = KEY(V, I, lowestPos+s, numSortedChar);
|
||||
key3 = KEY(V, I, lowestPos+2*s, numSortedChar);
|
||||
keyl = med3(key1, key2, key3);
|
||||
key1 = KEY(V, I, m-s, numSortedChar);
|
||||
key2 = KEY(V, I, m, numSortedChar);
|
||||
key3 = KEY(V, I, m+s, numSortedChar);
|
||||
keym = med3(key1, key2, key3);
|
||||
key1 = KEY(V, I, highestPos-2*s, numSortedChar);
|
||||
key2 = KEY(V, I, highestPos-s, numSortedChar);
|
||||
key3 = KEY(V, I, highestPos, numSortedChar);
|
||||
keyn = med3(key1, key2, key3);
|
||||
|
||||
return med3(keyl, keym, keyn);
|
||||
|
||||
|
||||
}
|
||||
|
||||
/* Quadratic sorting method to use for small subarrays. */
|
||||
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
||||
const qsint_t highestPos, const qsint_t numSortedChar)
|
||||
{
|
||||
qsint_t i, j;
|
||||
qsint_t tmpKey, tmpPos;
|
||||
qsint_t numItem;
|
||||
qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
|
||||
qsint_t negativeSortedLength;
|
||||
qsint_t groupNum;
|
||||
|
||||
numItem = highestPos - lowestPos + 1;
|
||||
|
||||
for (i=0; i<numItem; i++) {
|
||||
pos[i] = I[lowestPos + i];
|
||||
key[i] = V[pos[i] + numSortedChar];
|
||||
}
|
||||
|
||||
for (i=1; i<numItem; i++) {
|
||||
tmpKey = key[i];
|
||||
tmpPos = pos[i];
|
||||
for (j=i; j>0 && key[j-1] > tmpKey; j--) {
|
||||
key[j] = key[j-1];
|
||||
pos[j] = pos[j-1];
|
||||
}
|
||||
key[j] = tmpKey;
|
||||
pos[j] = tmpPos;
|
||||
}
|
||||
|
||||
negativeSortedLength = -1;
|
||||
|
||||
i = numItem - 1;
|
||||
groupNum = highestPos;
|
||||
while (i > 0) {
|
||||
I[i+lowestPos] = pos[i];
|
||||
V[I[i+lowestPos]] = groupNum;
|
||||
if (key[i-1] == key[i]) {
|
||||
negativeSortedLength = 0;
|
||||
} else {
|
||||
if (negativeSortedLength < 0)
|
||||
I[i+lowestPos] = negativeSortedLength;
|
||||
groupNum = i + lowestPos - 1;
|
||||
negativeSortedLength--;
|
||||
}
|
||||
i--;
|
||||
}
|
||||
|
||||
I[lowestPos] = pos[0];
|
||||
V[I[lowestPos]] = groupNum;
|
||||
if (negativeSortedLength < 0)
|
||||
I[lowestPos] = negativeSortedLength;
|
||||
}
|
||||
|
||||
/* Bucketsort for first iteration.
|
||||
|
||||
Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
|
||||
at least once. x[n] is 0. (This is the corresponding output of transform.) k
|
||||
must be at most n+1. p is array of size n+1 whose contents are disregarded.
|
||||
|
||||
Output: x is V and p is I after the initial sorting stage of the refined
|
||||
suffix sorting algorithm.*/
|
||||
|
||||
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
|
||||
{
|
||||
qsint_t i, c;
|
||||
qsint_t d;
|
||||
qsint_t groupNum;
|
||||
qsint_t currentIndex;
|
||||
|
||||
// mark linked list empty
|
||||
for (i=0; i<alphabetSize; i++)
|
||||
I[i] = -1;
|
||||
|
||||
// insert to linked list
|
||||
for (i=0; i<=numChar; i++) {
|
||||
c = V[i];
|
||||
V[i] = (qsint_t)(I[c]);
|
||||
I[c] = i;
|
||||
}
|
||||
|
||||
currentIndex = numChar;
|
||||
for (i=alphabetSize; i>0; i--) {
|
||||
c = I[i-1];
|
||||
d = (qsint_t)(V[c]);
|
||||
groupNum = currentIndex;
|
||||
V[c] = groupNum;
|
||||
if (d >= 0) {
|
||||
I[currentIndex] = c;
|
||||
while (d >= 0) {
|
||||
c = d;
|
||||
d = V[c];
|
||||
V[c] = groupNum;
|
||||
currentIndex--;
|
||||
I[currentIndex] = c;
|
||||
}
|
||||
} else {
|
||||
// sorted group
|
||||
I[currentIndex] = -1;
|
||||
}
|
||||
currentIndex--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Transforms the alphabet of x by attempting to aggregate several symbols into
|
||||
one, while preserving the suffix order of x. The alphabet may also be
|
||||
compacted, so that x on output comprises all integers of the new alphabet
|
||||
with no skipped numbers.
|
||||
|
||||
Input: x is an array of size n+1 whose first n elements are positive
|
||||
integers in the range l...k-1. p is array of size n+1, used for temporary
|
||||
storage. q controls aggregation and compaction by defining the maximum intue
|
||||
for any symbol during transformation: q must be at least k-l; if q<=n,
|
||||
compaction is guaranteed; if k-l>n, compaction is never done; if q is
|
||||
INT_MAX, the maximum number of symbols are aggregated into one.
|
||||
|
||||
Output: Returns an integer j in the range 1...q representing the size of the
|
||||
new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
|
||||
set to the number of old symbols grouped into one. Only x[n] is 0.*/
|
||||
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
||||
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
|
||||
{
|
||||
qsint_t c, i, j;
|
||||
qsint_t a; // numSymbolAggregated
|
||||
qsint_t mask;
|
||||
qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
|
||||
qsint_t newAlphabetSize;
|
||||
qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;
|
||||
|
||||
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
||||
|
||||
for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
|
||||
maxSymbol = QSINT_MAX >> maxNumBit;
|
||||
|
||||
c = maxNumInputSymbol;
|
||||
for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
|
||||
minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
|
||||
maxSymbolInChunk = c;
|
||||
c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
|
||||
}
|
||||
|
||||
mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
|
||||
V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
|
||||
|
||||
/* bucketing possible, compact alphabet.*/
|
||||
for (i=0; i<=maxSymbolInChunk; i++)
|
||||
I[i] = 0; /* zero transformation table.*/
|
||||
c = minSymbolInChunk;
|
||||
for (i=a; i<=numChar; i++) {
|
||||
I[c] = 1; /* mark used chunk symbol.*/
|
||||
c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
|
||||
}
|
||||
for (i=1; i<a; i++) { /* handle last r-1 positions.*/
|
||||
I[c] = 1; /* mark used chunk symbol.*/
|
||||
c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/
|
||||
}
|
||||
newAlphabetSize = 1;
|
||||
for (i=0; i<=maxSymbolInChunk; i++) {
|
||||
if (I[i]) {
|
||||
I[i] = newAlphabetSize;
|
||||
newAlphabetSize++;
|
||||
}
|
||||
}
|
||||
c = minSymbolInChunk;
|
||||
for (i=0, j=a; j<=numChar; i++, j++) {
|
||||
V[i] = I[c]; /* transform to new alphabet.*/
|
||||
c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
|
||||
}
|
||||
for (; i<numChar; i++) { /* handle last a-1 positions.*/
|
||||
V[i] = I[c]; /* transform to new alphabet.*/
|
||||
c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
|
||||
}
|
||||
|
||||
V[numChar] = 0; /* end-of-string symbol is zero.*/
|
||||
|
||||
*numSymbolAggregated = a;
|
||||
return newAlphabetSize;
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
/* QSufSort.h
|
||||
|
||||
Header file for QSufSort.c
|
||||
|
||||
This file contains an implementation of the algorithm presented in "Faster
|
||||
Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
|
||||
Sadakane (sada@is.s.u-tokyo.ac.jp).
|
||||
|
||||
This software may be used freely for any purpose. However, when distributed,
|
||||
the original source must be clearly stated, and, when the source code is
|
||||
distributed, the copyright notice must be retained and any alterations in
|
||||
the code must be clearly marked. No warranty is given regarding the quality
|
||||
of this software.
|
||||
|
||||
Modified by Wong Chi-Kwong, 2004
|
||||
|
||||
Changes summary: - Used long variable and function names
|
||||
- Removed global variables
|
||||
- Replace pointer references with array references
|
||||
- Used insertion sort in place of selection sort and increased insertion sort threshold
|
||||
- Reconstructing suffix array from inverse becomes an option
|
||||
- Add handling where end-of-text symbol is not necessarily < all characters
|
||||
- Removed codes for supporting alphabet size > number of characters
|
||||
|
||||
No warranty is given regarding the quality of the modifications.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef __QSUFSORT_H__
#define __QSUFSORT_H__

#include <stdint.h>

/* Sort key of the entry at position p of I when comparing h symbols further
   on: the value of V at index I[p] + h. Every argument is parenthesized so
   that expressions (e.g. a conditional passed as h) are grouped as written. */
#define KEY(V, I, p, h)					( (V)[ (I)[(p)] + (h) ] )
/* Slices of at most this many items are sorted by insertion sort. */
#define INSERT_SORT_NUM_ITEM			16

/* Integer type used for text positions, symbols and group numbers. */
typedef int64_t qsint_t;
#define QSINT_MAX INT64_MAX

/* Suffix-sorts the text in V (numChar+1 entries) using I (numChar+1 entries)
   as the companion array. When skipTransform is zero the alphabet is first
   transformed and bucket-sorted. */
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
						const qsint_t smallestInputSymbol, const int skipTransform);
/* Fills I from the inverse array V: I[V[i]] = i + 1 for i in 0...numChar. */
void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar);

#endif
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
## For the Impatient
|
||||
|
||||
```sh
|
||||
# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
|
||||
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
|
||||
| bzip2 -dc | tar xf -
|
||||
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
|
||||
bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
|
||||
bwa.kit/bwa index hs38DH.fa # create BWA index
|
||||
# mapping
|
||||
bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh # skip "|sh" to show command lines
|
||||
```
|
||||
|
||||
This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
|
||||
genotypes on each gene and `out.hla.all` for other possible HLA genotypes.
|
||||
Please check out [bwa/bwakit/README.md][kithelp] for details.
|
||||
|
||||
## Background
|
||||
|
||||
GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
|
||||
(chromosome known but location unknown), unplaced contigs (chromosome unknown)
|
||||
and ALT contigs (long clustered variations). The combination of the first three
|
||||
components is called the *primary assembly*. It is recommended to use the
|
||||
complete primary assembly for all analyses. Using ALT contigs in read mapping is
|
||||
tricky.
|
||||
|
||||
GRCh38 ALT contigs total 109Mb in length, spanning 60Mbp of the primary
|
||||
assembly. However, sequences that are highly diverged from the primary assembly
|
||||
only contribute a few million bp. Most subsequences of ALT contigs are nearly
|
||||
identical to the primary assembly. If we align sequence reads to GRCh38+ALT
|
||||
blindly, we will get many additional reads with zero mapping quality and miss
|
||||
variants on them. It is crucial to make mappers aware of ALTs.
|
||||
|
||||
BWA-MEM is ALT-aware. It essentially computes mapping quality across the
|
||||
non-redundant content of the primary assembly plus the ALT contigs and is free
|
||||
of the problem above.
|
||||
|
||||
## Methods
|
||||
|
||||
### Sequence alignment
|
||||
|
||||
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
|
||||
postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT
|
||||
contigs are present. The following picture shows an example about how BWA-MEM
|
||||
infers mapping quality and reports alignment after step 2:
|
||||
|
||||

|
||||
|
||||
#### Step 1: BWA-MEM mapping
|
||||
|
||||
At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
|
||||
the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
|
||||
depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
|
||||
alignments and assigns mapQ following these two rules:
|
||||
|
||||
1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of
|
||||
an ALT hit is computed across all hits.
|
||||
|
||||
2. If there are no non-ALT hits, the best ALT hit is outputted as the primary
|
||||
alignment. If there are both ALT and non-ALT hits, non-ALT hits will be
|
||||
primary and ALT hits will be supplementary (SAM flag 0x800).
|
||||
|
||||
In theory, non-ALT alignments from step 1 should be identical to alignments
|
||||
against the reference genome with ALT contigs. In practice, the two types of
|
||||
alignments may differ in rare cases due to seeding heuristics. When an ALT hit
|
||||
is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
|
||||
non-ALT hits.
|
||||
|
||||
If we don't care about ALT hits, we may skip postprocessing (step 2).
|
||||
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
|
||||
information about ALT hits.
|
||||
|
||||
#### Step 2: Postprocessing
|
||||
|
||||
Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
|
||||
potential hits reported in the XA tag, lifts ALT hits to the chromosomal
|
||||
positions using the ALT-to-ref alignment, groups them based on overlaps between
|
||||
their lifted positions, and then re-estimates mapQ across the best scoring hit
|
||||
in each group. Being aware of the ALT-to-ref alignment, this script can greatly
|
||||
improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also
|
||||
writes each hit overlapping the reported hit into a separate SAM line. This
|
||||
enables variant calling on each ALT contig independent of others.
|
||||
|
||||
### On the completeness of GRCh38+ALT
|
||||
|
||||
While GRCh38 is much more complete than GRCh37, it is still missing some true
|
||||
human sequences. To make sure every piece of sequence in the reference assembly
|
||||
is correct, the [Genome Reference Consortium][grc] (GRC) requires each ALT contig
|
||||
to have enough support from multiple sources before considering adding it to the
|
||||
reference assembly. This careful and sophisticated procedure has left out some
|
||||
sequences, one of which is [this example][novel], a 10kb contig assembled from
|
||||
CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
|
||||
[BLAST][blast] to see where it maps.
|
||||
|
||||
For a more complete reference genome, we compiled a new set of decoy sequences
|
||||
from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
|
||||
The sequences are included in `hs38DH-extra.fa` from the [BWA binary
|
||||
package][res].
|
||||
|
||||
In addition to decoy, we also put multiple alleles of HLA genes in
|
||||
`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
|
||||
version 3.18.0 and are used to collect reads sequenced from these genes.
|
||||
|
||||
### HLA typing
|
||||
|
||||
HLA genes are known to be associated with many autoimmune diseases, infectious
|
||||
diseases and drug responses. They are among the most important genes but are
|
||||
rarely studied by WGS projects due to the high sequence divergence between
|
||||
HLA genes and the reference genome in these regions.
|
||||
|
||||
By including the HLA gene regions in the reference assembly as ALT contigs, we
|
||||
are able to effectively identify reads coming from these genes. We also provide
|
||||
a pipeline, which is included in the [BWA binary package][res], to type the
|
||||
several classic HLA genes. The pipeline is conceptually simple. It de novo
|
||||
assembles sequence reads mapped to each gene, aligns exon sequences of each
|
||||
allele to the assembled contigs and then finds the pairs of alleles that best
|
||||
explain the contigs. In practice, however, the completeness of IMGT/HLA and
|
||||
copy-number changes related to these genes are not so straightforward to
|
||||
resolve. HLA typing may not always be successful. Users may also consider using
|
||||
other programs for typing such as [Warren et al (2012)][hla4], [Liu et al
|
||||
(2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though
|
||||
most of them are distributed under restrictive licenses.
|
||||
|
||||
## Preliminary Evaluation
|
||||
|
||||
To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
|
||||
unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
|
||||
(hs38DH), and called small variants from the alignment. CHM1 is haploid.
|
||||
Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
|
||||
true positive (TP) heterozygous calls from NA12878 are approximately equal
|
||||
to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
|
||||
should yield higher TP and lower FP. The following table shows the numbers for
|
||||
these assemblies:
|
||||
|
||||
|Assembly|hs37 |hs38 |hs38DH|CHM1_1.1| huref|
|
||||
|:------:|------:|------:|------:|------:|------:|
|
||||
|FP | 255706| 168068| 142516|307172 | 575634|
|
||||
|TP |2142260|2163113|2150844|2167235|2137053|
|
||||
|
||||
With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces
|
||||
FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
|
||||
from hs38 only and found the majority of them are associated with excessive read
|
||||
depth, clustered variants or weak alignment. We believe most hs38-only calls are
|
||||
problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
|
||||
with nearly identical library construction, the difference is ~140k, an order
|
||||
of magnitude higher than the difference between hs38 and hs38DH. ALT contigs,
|
||||
decoy and HLA genes in hs38DH improve variant calling and enable the analyses of
|
||||
ALT contigs and HLA typing at little cost.
|
||||
|
||||
## Problems and Future Development
|
||||
|
||||
There are some uncertainties about ALT mappings - we are not sure whether they
|
||||
help biological discovery and don't know the best way to analyze them. Without
|
||||
clear demand from downstream analyses, it is very difficult to design the
|
||||
optimal mapping strategy. The current BWA-MEM method is just a start. If it
|
||||
turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
|
||||
for performance; if not, we may make changes. It is also possible that we might
|
||||
make a breakthrough in the representation of multiple genomes, in which case, we
|
||||
can even get rid of ALT contigs for good.
|
||||
|
||||
|
||||
|
||||
[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
|
||||
[sb]: https://github.com/GregoryFaust/samblaster
|
||||
[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
|
||||
[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
|
||||
[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
|
||||
[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
|
||||
[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
|
||||
[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
|
||||
[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml
|
||||
[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
|
||||
[hlalink]: http://www.hladiseaseassociations.com
|
||||
[hlatools]: https://www.biostars.org/p/93245/
|
||||
[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
|
||||
[hla3]: http://www.biomedcentral.com/1471-2164/15/325
|
||||
[hla4]: http://genomemedicine.com/content/4/12/95
|
||||
[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit
|
||||
188
README.md
188
README.md
|
|
@ -1,3 +1,189 @@
|
|||
[](https://github.com/lh3/bwa/actions)
|
||||
[](https://sourceforge.net/projects/bio-bwa/files/?source=navbar)
|
||||
[](https://github.com/lh3/bwa/releases)
|
||||
[](https://anaconda.org/bioconda/bwa)
|
||||
|
||||
**Note: [minimap2][minimap2] has replaced BWA-MEM for __PacBio and Nanopore__ read
|
||||
alignment.** It retains all major BWA-MEM features, but is ~50 times as fast,
|
||||
more versatile, more accurate and produces better base-level alignment.
|
||||
A beta version of [BWA-MEM2][bwa-mem2] has been released for short-read mapping.
|
||||
BWA-MEM2 is about twice as fast as BWA-MEM and outputs near identical alignments.
|
||||
|
||||
[minimap2]: https://github.com/lh3/minimap2
|
||||
[bwa-mem2]: https://github.com/bwa-mem2/bwa-mem2
|
||||
|
||||
## Getting started
|
||||
|
||||
git clone https://github.com/lh3/bwa.git
|
||||
cd bwa; make
|
||||
./bwa index ref.fa
|
||||
./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
|
||||
./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz
|
||||
|
||||
## Introduction
|
||||
|
||||
BWA is a software package for mapping DNA sequences against a large reference
|
||||
genome, such as the human genome. It consists of three algorithms:
|
||||
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
|
||||
sequence reads up to 100bp, while the other two are for longer sequences ranging from
|
||||
70bp to a few megabases. BWA-MEM and BWA-SW share similar features such as the
|
||||
support of long reads and chimeric alignment, but BWA-MEM, which is the latest,
|
||||
is generally recommended as it is faster and more accurate. BWA-MEM also has
|
||||
better performance than BWA-backtrack for 70-100bp Illumina reads.
|
||||
|
||||
For all the algorithms, BWA first needs to construct the FM-index for the
|
||||
reference genome (the **index** command). Alignment algorithms are invoked with
|
||||
different sub-commands: **aln/samse/sampe** for BWA-backtrack,
|
||||
**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm.
|
||||
|
||||
## Availability
|
||||
|
||||
BWA is released under [GPLv3][1]. The latest source code is [freely
|
||||
available at github][2]. Released packages can [be downloaded][3] at
|
||||
SourceForge. After you acquire the source code, simply use `make` to compile
|
||||
and copy the single executable `bwa` to the destination you want. The only
|
||||
dependency required to build BWA is [zlib][14].
|
||||
|
||||
Since 0.7.11, precompiled binary for x86\_64-linux is available in [bwakit][17].
|
||||
In addition to BWA, this self-consistent package also comes with bwa-associated
|
||||
and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs,
|
||||
adapter trimming, duplicate marking, HLA typing and associated data files.
|
||||
|
||||
## Seeking help
|
||||
|
||||
The detailed usage is described in the man page available together with the
|
||||
source code. You can use `man ./bwa.1` to view the man page in a terminal. The
|
||||
[HTML version][4] of the man page can be found at the [BWA website][5]. If you
|
||||
have questions about BWA, you may [sign up for the mailing list][6] and then send
|
||||
the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions
|
||||
in forums such as [BioStar][8] and [SEQanswers][9].
|
||||
|
||||
## Citing BWA
|
||||
|
||||
* Li H. and Durbin R. (2009) Fast and accurate short read alignment with
|
||||
Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID:
|
||||
[19451168][10]]. (if you use the BWA-backtrack algorithm)
|
||||
|
||||
* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
|
||||
Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID:
|
||||
[20080505][11]]. (if you use the BWA-SW algorithm)
|
||||
|
||||
* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs
|
||||
with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM
|
||||
algorithm or the **fastmap** command, or want to cite the whole BWA package)
|
||||
|
||||
Please note that the last reference is a preprint hosted at [arXiv.org][13]. I
|
||||
do not plan to submit it to a peer-reviewed journal in the near future.
|
||||
|
||||
## Frequently asked questions (FAQs)
|
||||
|
||||
1. [What types of data does BWA work with?](#type)
|
||||
2. [Why does a read appear multiple times in the output SAM?](#multihit)
|
||||
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
|
||||
4. [Why can one read in a pair have a high mapping quality but the other has zero?](#pe0)
|
||||
5. [How can a BWA-backtrack alignment stand out of the end of a chromosome?](#endref)
|
||||
6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
|
||||
7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)
|
||||
|
||||
#### <a name="type"></a>1. What types of data does BWA work with?
|
||||
|
||||
BWA works with a variety of types of DNA sequence data, though the optimal
|
||||
algorithm and setting may vary. The following list gives the recommended
|
||||
settings:
|
||||
|
||||
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
|
||||
contigs up to a few megabases mapped to a closely related reference genome:
|
||||
|
||||
bwa mem ref.fa reads.fq > aln.sam
|
||||
|
||||
* Illumina single-end reads shorter than ~70bp:
|
||||
|
||||
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
|
||||
|
||||
* Illumina/454/IonTorrent paired-end reads longer than ~70bp:
|
||||
|
||||
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||
|
||||
* Illumina paired-end reads shorter than ~70bp:
|
||||
|
||||
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
|
||||
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
|
||||
|
||||
* PacBio subreads or Oxford Nanopore reads to a reference genome:
|
||||
|
||||
bwa mem -x pacbio ref.fa reads.fq > aln.sam
|
||||
bwa mem -x ont2d ref.fa reads.fq > aln.sam
|
||||
|
||||
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
|
||||
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
|
||||
errors given longer query sequences as the chance of missing all seeds is small.
|
||||
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
|
||||
reads with a sequencing error rate over 20%.
|
||||
|
||||
#### <a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
|
||||
|
||||
BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene
|
||||
fusion or a long deletion, a read bridging the break point may have two hits,
|
||||
occupying two lines in the SAM output. With the default setting of BWA-MEM, one
|
||||
and only one line is primary and is soft clipped; other lines are tagged with
|
||||
0x800 SAM flag (supplementary alignment) and are hard clipped.
|
||||
|
||||
#### <a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?
|
||||
|
||||
Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over
|
||||
4GB. However, an individual chromosome should not be longer than 2GB.
|
||||
|
||||
#### <a name="pe0"></a>4. Why can one read in a pair have a high mapping quality but the other has zero?
|
||||
|
||||
This is correct. Mapping quality is assigned to an individual read, not to a read
|
||||
pair. It is possible that one read can be mapped unambiguously, but its mate
|
||||
falls in a tandem repeat and thus its accurate position cannot be determined.
|
||||
|
||||
#### <a name="endref"></a>5. How can a BWA-backtrack alignment stand out of the end of a chromosome?
|
||||
|
||||
Internally BWA concatenates all reference sequences into one long sequence. A
|
||||
read may be mapped to the junction of two adjacent reference sequences. In this
|
||||
case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
|
||||
position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
|
||||
as well. BWA-MEM does not have this problem.
|
||||
|
||||
#### <a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
|
||||
|
||||
Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
|
||||
BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
|
||||
see [README-alt.md][18] for details. Briefly, it is recommended to use
|
||||
[bwakit][17], the binary release of BWA, for generating the reference genome
|
||||
and for mapping.
|
||||
|
||||
#### <a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
|
||||
|
||||
If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
|
||||
without post-processing. The alignments produced this way are very close to
|
||||
alignments against GRCh38 without ALT contigs. Nonetheless, applying
|
||||
post-processing helps to reduce false mappings caused by reads from the
|
||||
diverged part of ALT contigs and also enables HLA typing. It is recommended to
|
||||
run the post-processing script.
|
||||
|
||||
|
||||
|
||||
[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
|
||||
[2]: https://github.com/lh3/bwa
|
||||
[3]: http://sourceforge.net/projects/bio-bwa/files/
|
||||
[4]: http://bio-bwa.sourceforge.net/bwa.shtml
|
||||
[5]: http://bio-bwa.sourceforge.net/
|
||||
[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help
|
||||
[7]: mailto:bio-bwa-help@sourceforge.net
|
||||
[8]: http://biostars.org
|
||||
[9]: http://seqanswers.com/
|
||||
[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168
|
||||
[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505
|
||||
[12]: http://arxiv.org/abs/1303.3997
|
||||
[13]: http://arxiv.org/
|
||||
[14]: http://zlib.net/
|
||||
[15]: https://github.com/lh3/bwa/tree/mem
|
||||
[16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/
|
||||
[17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/
|
||||
[18]: https://github.com/lh3/bwa/blob/master/README-alt.md
|
||||
# fast-bwa
|
||||
|
||||
基于bwa,做一些优化,注释
|
||||
基于bwa,做一些优化,注释
|
||||
|
|
|
|||
|
|
@ -0,0 +1,210 @@
|
|||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include "bamlite.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/*********************
|
||||
* from bam_endian.c *
|
||||
*********************/
|
||||
|
||||
/* Detect the host byte order at run time: returns 1 on a big-endian host, 0 otherwise. */
static inline int bam_is_big_endian()
{
	long probe = 1;
	const char *first_byte = (const char *)&probe;
	/* A little-endian host stores the least-significant byte (value 1) first. */
	return first_byte[0] == 0;
}
|
||||
/* Return v with its two bytes exchanged. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	const unsigned int low_byte = v & 0xFFu;
	const unsigned int high_byte = (v >> 8) & 0xFFu;
	return (uint16_t)((low_byte << 8) | high_byte);
}
|
||||
/*
 * Reverse, in place, the two bytes that x points to and return x.
 *
 * The bytes are exchanged through an unsigned char pointer rather than by
 * dereferencing x as a uint16_t, so x may have any alignment and any
 * underlying object type (the aux-field walker passes raw byte offsets).
 */
static inline void *bam_swap_endian_2p(void *x)
{
	unsigned char *bytes = (unsigned char *)x;
	const unsigned char first = bytes[0];
	bytes[0] = bytes[1];
	bytes[1] = first;
	return x;
}
|
||||
/* Return v with its four bytes in reverse order. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	const uint32_t b0 = v & 0xFFU;
	const uint32_t b1 = (v >> 8) & 0xFFU;
	const uint32_t b2 = (v >> 16) & 0xFFU;
	const uint32_t b3 = v >> 24;
	return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
}
|
||||
/*
 * Reverse, in place, the four bytes that x points to and return x.
 *
 * Works byte-wise so that x may be unaligned and may point at any 4-byte
 * object (int32_t, uint32_t, float or raw buffer bytes) without relying on
 * a uint32_t type-punned access.
 */
static inline void *bam_swap_endian_4p(void *x)
{
	unsigned char *bytes = (unsigned char *)x;
	unsigned char tmp;
	tmp = bytes[0]; bytes[0] = bytes[3]; bytes[3] = tmp;
	tmp = bytes[1]; bytes[1] = bytes[2]; bytes[2] = tmp;
	return x;
}
|
||||
/* Return v with its eight bytes in reverse order. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	uint64_t reversed = 0;
	int k;
	/* Peel bytes off the low end of v and push them onto the low end of the result. */
	for (k = 0; k < 8; ++k) {
		reversed = (reversed << 8) | (v & 0xFFU);
		v >>= 8;
	}
	return reversed;
}
|
||||
/*
 * Reverse, in place, the eight bytes that x points to and return x.
 *
 * Works byte-wise so that x may be unaligned and may point at any 8-byte
 * object (e.g. a double inside a packed aux field) without a uint64_t
 * type-punned access.
 */
static inline void *bam_swap_endian_8p(void *x)
{
	unsigned char *bytes = (unsigned char *)x;
	int lo, hi;
	for (lo = 0, hi = 7; lo < hi; ++lo, --hi) {
		const unsigned char tmp = bytes[lo];
		bytes[lo] = bytes[hi];
		bytes[hi] = tmp;
	}
	return x;
}
|
||||
|
||||
/**************
|
||||
* from bam.c *
|
||||
**************/
|
||||
|
||||
int bam_is_be;
|
||||
|
||||
bam_header_t *bam_header_init()
|
||||
{
|
||||
bam_is_be = bam_is_big_endian();
|
||||
return (bam_header_t*)calloc(1, sizeof(bam_header_t));
|
||||
}
|
||||
|
||||
/*
 * Release a header created by bam_header_init()/bam_header_read().
 * Passing NULL is allowed and does nothing.
 */
void bam_header_destroy(bam_header_t *header)
{
	int32_t i;
	if (header == 0) return;
	if (header->target_name) {
		/* entries may be NULL when reading stopped half-way; free(NULL) is a no-op */
		for (i = 0; i < header->n_targets; ++i)
			free(header->target_name[i]);
		free(header->target_name);
	}
	/* freed independently of target_name so it cannot leak when only one
	 * of the two arrays was allocated */
	free(header->target_len);
	free(header->text);
	free(header);
}
|
||||
|
||||
/*
 * Read the binary BAM header from fp: the "BAM\1" magic, the plain-text
 * header, and the name and length of every reference sequence.
 *
 * Returns a newly allocated header (release with bam_header_destroy()), or
 * NULL when the magic is wrong, the stream is truncated, a length field is
 * negative, or memory cannot be allocated.
 */
bam_header_t *bam_header_read(bamFile fp)
{
	bam_header_t *header;
	char buf[4];
	int magic_len;
	int32_t i, l_text, name_len;
	// read "BAM1"
	magic_len = bam_read(fp, buf, 4);
	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
		return NULL;
	}
	header = bam_header_init();
	if (header == NULL) return NULL;
	// read plain text and the number of reference sequences
	// The on-disk length is a 32-bit integer; read it into a 32-bit local
	// instead of directly into the wider size_t field, which only works by
	// accident on little-endian hosts.
	if (bam_read(fp, &l_text, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&l_text);
	if (l_text < 0) goto fail;
	header->l_text = (size_t)l_text;
	header->text = (char*)calloc((size_t)l_text + 1, 1);
	if (header->text == NULL) goto fail;
	if (bam_read(fp, header->text, l_text) != l_text) goto fail;
	if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
	if (header->n_targets < 0) goto fail;
	// read reference sequence names and lengths
	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
	if (header->n_targets > 0 && (header->target_name == NULL || header->target_len == NULL)) {
		/* release target_len here so the failure path cannot leak it when
		 * target_name is the allocation that failed */
		free(header->target_len);
		header->target_len = NULL;
		goto fail;
	}
	for (i = 0; i != header->n_targets; ++i) {
		if (bam_read(fp, &name_len, 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&name_len);
		if (name_len < 1) goto fail; // a valid name holds at least its terminating NUL
		// one spare zero byte keeps the name NUL-terminated even if the file omits the terminator
		header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
		if (header->target_name[i] == NULL) goto fail;
		if (bam_read(fp, header->target_name[i], name_len) != name_len) {
			goto fail;
		}
		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
	}
	return header;
 fail:
	bam_header_destroy(header);
	return NULL;
}
|
||||
|
||||
static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
|
||||
{
|
||||
uint8_t *s;
|
||||
uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
|
||||
s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
|
||||
for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
|
||||
while (s < data + data_len) {
|
||||
uint8_t type;
|
||||
s += 2; // skip key
|
||||
type = toupper(*s); ++s; // skip type
|
||||
if (type == 'C' || type == 'A') ++s;
|
||||
else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
|
||||
else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
|
||||
else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
|
||||
else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Read one alignment record from fp into b, growing b->data as needed.
 *
 * Returns the number of bytes consumed (4 + block length) on success, or a
 * negative code on failure:
 *   -1  clean end of file (nothing left to read)
 *   -2  the block-length field is truncated
 *   -3  the fixed-size core is truncated
 *   -4  the record is malformed (block shorter than the core, or lengths
 *       that exceed the block), its data is truncated, or the data buffer
 *       could not be grown
 */
int bam_read1(bamFile fp, bam1_t *b)
{
	bam1_core_t *c = &b->core;
	int32_t block_len, ret, i;
	uint32_t x[8];

	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
		if (ret == 0) return -1; // normal end-of-file
		else return -2; // truncated
	}
	if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3;
	if (bam_is_be) {
		bam_swap_endian_4p(&block_len);
		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
	}
	// a block must at least hold the core that was just read; otherwise
	// data_len below would go negative and be used as a huge read size
	if (block_len < (int32_t)sizeof(bam1_core_t)) return -4;
	c->tid = x[0]; c->pos = x[1];
	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
	c->l_qseq = x[4];
	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
	b->data_len = block_len - sizeof(bam1_core_t);
	if (b->m_data < b->data_len) {
		int new_cap = b->data_len;
		uint8_t *grown;
		kroundup32(new_cap);
		// keep the old buffer reachable if realloc fails
		grown = (uint8_t*)realloc(b->data, new_cap);
		if (grown == NULL) return -4;
		b->data = grown;
		b->m_data = new_cap;
	}
	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
	// the declared field lengths must fit inside the block
	if (c->l_qseq < 0 || b->l_aux < 0) return -4;
	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
	return 4 + block_len;
}
|
||||
|
||||
|
||||
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
|
||||
// Versions of gzopen, gzread and gzclose that print error messages on failure
|
||||
|
||||
/*
 * gzopen() wrapper that reports failures on stderr.
 *
 * fn    file to open; "-" selects stdin when mode contains 'r', stdout otherwise
 * mode  zlib mode string, passed through unchanged
 *
 * Returns the zlib handle, or NULL after printing a message.
 */
gzFile bamlite_gzopen(const char *fn, const char *mode) {
	gzFile fp;
	if (strcmp(fn, "-") == 0) {
		const int for_reading = strstr(mode, "r") != NULL;
		fp = gzdopen(fileno(for_reading ? stdin : stdout), mode);
		if (!fp) {
			/* newline added so this message ends its line like the one below */
			fprintf(stderr, "Couldn't open %s : %s\n",
					for_reading ? "stdin" : "stdout",
					strerror(errno));
		}
		return fp;
	}
	if ((fp = gzopen(fn, mode)) == 0) {
		/* if errno is still 0 the failure is reported as "Out of memory" */
		fprintf(stderr, "Couldn't open %s : %s\n", fn,
				errno ? strerror(errno) : "Out of memory");
	}
	return fp;
}
|
||||
|
||||
/*
 * gzread() wrapper: forwards the call and, when gzread reports an error
 * (negative return), prints the reason on stderr.  The gzread result is
 * returned unchanged.
 */
int bamlite_gzread(gzFile file, void *ptr, unsigned int len) {
	const int n_read = gzread(file, ptr, len);

	if (n_read >= 0) return n_read;
	{
		int zerr = 0;
		const char *zmsg = gzerror(file, &zerr);
		/* Z_ERRNO means the real cause is in errno, not in zlib's own text */
		const char *reason = (zerr == Z_ERRNO) ? strerror(errno) : zmsg;
		fprintf(stderr, "gzread error: %s\n", reason);
	}
	return n_read;
}
|
||||
|
||||
/*
 * gzclose() wrapper: closes the stream and prints the reason on stderr when
 * zlib reports anything other than Z_OK.  The gzclose result is returned
 * unchanged.
 */
int bamlite_gzclose(gzFile file) {
	const int status = gzclose(file);

	if (status == Z_OK) return status;
	fprintf(stderr, "gzclose error: %s\n",
			status == Z_ERRNO ? strerror(errno) : zError(status));
	return status;
}
|
||||
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
#ifndef BAMLITE_H_
|
||||
#define BAMLITE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
#define USE_VERBOSE_ZLIB_WRAPPERS
|
||||
|
||||
typedef gzFile bamFile;
|
||||
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
|
||||
/* These print error messages on failure */
|
||||
# define bam_open(fn, mode) bamlite_gzopen(fn, mode)
|
||||
# define bam_dopen(fd, mode) gzdopen(fd, mode)
|
||||
# define bam_close(fp) bamlite_gzclose(fp)
|
||||
# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size)
|
||||
#else
|
||||
# define bam_open(fn, mode) gzopen(fn, mode)
|
||||
# define bam_dopen(fd, mode) gzdopen(fd, mode)
|
||||
# define bam_close(fp) gzclose(fp)
|
||||
# define bam_read(fp, buf, size) gzread(fp, buf, size)
|
||||
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
|
||||
|
||||
/* In-memory form of a BAM file header, as filled in by bam_header_read(). */
typedef struct {
|
||||
int32_t n_targets; /* number of reference sequences */
|
||||
char **target_name; /* reference sequence names, one entry per target */
|
||||
uint32_t *target_len; /* reference sequence lengths, one entry per target */
|
||||
size_t l_text, n_text; /* l_text: length of the plain-text header; n_text is not used by the reader code shown with this header - TODO confirm its purpose */
|
||||
char *text; /* plain-text header, allocated with l_text + 1 zero-filled bytes */
|
||||
} bam_header_t;
|
||||
|
||||
#define BAM_FPAIRED 1
|
||||
#define BAM_FPROPER_PAIR 2
|
||||
#define BAM_FUNMAP 4
|
||||
#define BAM_FMUNMAP 8
|
||||
#define BAM_FREVERSE 16
|
||||
#define BAM_FMREVERSE 32
|
||||
#define BAM_FREAD1 64
|
||||
#define BAM_FREAD2 128
|
||||
#define BAM_FSECONDARY 256
|
||||
#define BAM_FQCFAIL 512
|
||||
#define BAM_FDUP 1024
|
||||
|
||||
#define BAM_CIGAR_SHIFT 4
|
||||
#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
|
||||
|
||||
#define BAM_CMATCH 0
|
||||
#define BAM_CINS 1
|
||||
#define BAM_CDEL 2
|
||||
#define BAM_CREF_SKIP 3
|
||||
#define BAM_CSOFT_CLIP 4
|
||||
#define BAM_CHARD_CLIP 5
|
||||
#define BAM_CPAD 6
|
||||
|
||||
/*
 * Fixed-size part of one alignment record.  bam_read1() fills it from eight
 * consecutive 32-bit words read from the file.
 */
typedef struct {
|
||||
int32_t tid; /* word 0; presumably the reference sequence index - confirm */
|
||||
int32_t pos; /* word 1; presumably the alignment position - confirm */
|
||||
uint32_t bin:16, qual:8, l_qname:8; /* word 2: upper 16 bits, bits 8-15, low 8 bits; l_qname is the byte length of the name stored at the start of the record data */
|
||||
uint32_t flag:16, n_cigar:16; /* word 3: upper 16 bits (BAM_F* flags), low 16 bits (number of 32-bit CIGAR words) */
|
||||
int32_t l_qseq; /* word 4: number of bases; sequence is packed two bases per byte */
|
||||
int32_t mtid; /* word 5; presumably the mate's reference index - confirm */
|
||||
int32_t mpos; /* word 6; presumably the mate's position - confirm */
|
||||
int32_t isize; /* word 7; presumably the insert size - confirm */
|
||||
} bam1_core_t;
|
||||
|
||||
/*
 * One alignment record: the fixed core plus a single heap buffer holding, in
 * order, the query name, CIGAR words, packed sequence, qualities and
 * auxiliary fields (see the bam1_* accessor macros).
 */
typedef struct {
|
||||
bam1_core_t core; /* fixed-size fields */
|
||||
int l_aux, data_len, m_data; /* l_aux: bytes of auxiliary data; data_len: bytes in use in data; m_data: allocated capacity of data */
|
||||
uint8_t *data; /* variable-length data, grown by bam_read1() and freed by bam_destroy1() */
|
||||
} bam1_t;
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
|
||||
#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
|
||||
#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
|
||||
#define bam1_qname(b) ((char*)((b)->data))
|
||||
#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
|
||||
#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
|
||||
#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
|
||||
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
|
||||
|
||||
#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
|
||||
#define bam_destroy1(b) do { \
|
||||
if (b) { free((b)->data); free(b); } \
|
||||
} while (0)
|
||||
|
||||
extern int bam_is_be;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bam_header_t *bam_header_init(void);
|
||||
void bam_header_destroy(bam_header_t *header);
|
||||
bam_header_t *bam_header_read(bamFile fp);
|
||||
int bam_read1(bamFile fp, bam1_t *b);
|
||||
|
||||
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
|
||||
gzFile bamlite_gzopen(const char *fn, const char *mode);
|
||||
int bamlite_gzread(gzFile file, void *ptr, unsigned int len);
|
||||
int bamlite_gzclose(gzFile file);
|
||||
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,451 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_MAP_INIT_STR(str, int)
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/*
 * Lookup table from a byte value to a nucleotide code:
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3, '-' -> 5,
 * every other byte -> 4.  Index it with an unsigned byte value (0..255).
 */
unsigned char nst_nt4_table[256] = {
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
|
||||
4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
|
||||
};
|
||||
|
||||
void bns_dump(const bntseq_t *bns, const char *prefix)
|
||||
{
|
||||
char str[1024];
|
||||
FILE *fp;
|
||||
int i;
|
||||
{ // dump .ann
|
||||
strcpy(str, prefix); strcat(str, ".ann");
|
||||
fp = xopen(str, "w");
|
||||
err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
|
||||
for (i = 0; i != bns->n_seqs; ++i) {
|
||||
bntann1_t *p = bns->anns + i;
|
||||
err_fprintf(fp, "%d %s", p->gi, p->name);
|
||||
if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno);
|
||||
else err_fprintf(fp, "\n");
|
||||
err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
|
||||
}
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
{ // dump .amb
|
||||
strcpy(str, prefix); strcat(str, ".amb");
|
||||
fp = xopen(str, "w");
|
||||
err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
|
||||
for (i = 0; i != bns->n_holes; ++i) {
|
||||
bntamb1_t *p = bns->ambs + i;
|
||||
err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
|
||||
}
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
|
||||
{
|
||||
char str[8192];
|
||||
FILE *fp;
|
||||
const char *fname;
|
||||
bntseq_t *bns;
|
||||
long long xx;
|
||||
int i;
|
||||
int scanres;
|
||||
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
|
||||
{ // read .ann
|
||||
fp = xopen(fname = ann_filename, "r");
|
||||
scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
|
||||
if (scanres != 3) goto badread;
|
||||
bns->l_pac = xx;
|
||||
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
bntann1_t *p = bns->anns + i;
|
||||
char *q = str;
|
||||
int c;
|
||||
// read gi and sequence name
|
||||
scanres = fscanf(fp, "%u%s", &p->gi, str);
|
||||
if (scanres != 2) goto badread;
|
||||
p->name = strdup(str);
|
||||
// read fasta comments
|
||||
while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
|
||||
while (c != '\n' && c != EOF) c = fgetc(fp);
|
||||
if (c == EOF) {
|
||||
scanres = EOF;
|
||||
goto badread;
|
||||
}
|
||||
*q = 0;
|
||||
if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
|
||||
else p->anno = strdup("");
|
||||
// read the rest
|
||||
scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
|
||||
if (scanres != 3) goto badread;
|
||||
p->offset = xx;
|
||||
}
|
||||
err_fclose(fp);
|
||||
}
|
||||
{ // read .amb
|
||||
int64_t l_pac;
|
||||
int32_t n_seqs;
|
||||
fp = xopen(fname = amb_filename, "r");
|
||||
scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
|
||||
if (scanres != 3) goto badread;
|
||||
l_pac = xx;
|
||||
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
|
||||
bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0;
|
||||
for (i = 0; i < bns->n_holes; ++i) {
|
||||
bntamb1_t *p = bns->ambs + i;
|
||||
scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str);
|
||||
if (scanres != 3) goto badread;
|
||||
p->offset = xx;
|
||||
p->amb = str[0];
|
||||
}
|
||||
err_fclose(fp);
|
||||
}
|
||||
{ // open .pac
|
||||
bns->fp_pac = xopen(pac_filename, "rb");
|
||||
}
|
||||
return bns;
|
||||
|
||||
badread:
|
||||
if (EOF == scanres) {
|
||||
err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file");
|
||||
}
|
||||
err_fatal(__func__, "Parse error reading %s\n", fname);
|
||||
}
|
||||
|
||||
bntseq_t *bns_restore(const char *prefix)
|
||||
{
|
||||
char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
|
||||
FILE *fp;
|
||||
bntseq_t *bns;
|
||||
strcat(strcpy(ann_filename, prefix), ".ann");
|
||||
strcat(strcpy(amb_filename, prefix), ".amb");
|
||||
strcat(strcpy(pac_filename, prefix), ".pac");
|
||||
bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
|
||||
if (bns == 0) return 0;
|
||||
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
|
||||
char str[1024];
|
||||
khash_t(str) *h;
|
||||
int c, i, absent;
|
||||
khint_t k;
|
||||
h = kh_init(str);
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
k = kh_put(str, h, bns->anns[i].name, &absent);
|
||||
kh_val(h, k) = i;
|
||||
}
|
||||
i = 0;
|
||||
while ((c = fgetc(fp)) != EOF) {
|
||||
if (c == '\t' || c == '\n' || c == '\r') {
|
||||
str[i] = 0;
|
||||
if (str[0] != '@') {
|
||||
k = kh_get(str, h, str);
|
||||
if (k != kh_end(h))
|
||||
bns->anns[kh_val(h, k)].is_alt = 1;
|
||||
}
|
||||
while (c != '\n' && c != EOF) c = fgetc(fp);
|
||||
i = 0;
|
||||
} else {
|
||||
if (i >= 1022) {
|
||||
fprintf(stderr, "[E::%s] sequence name longer than 1023 characters. Abort!\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
str[i++] = c;
|
||||
}
|
||||
}
|
||||
kh_destroy(str, h);
|
||||
fclose(fp);
|
||||
}
|
||||
return bns;
|
||||
}
|
||||
|
||||
/*
 * Release a bntseq_t: close the .pac stream if one is open, then free the
 * ambiguity array, every sequence's name and annotation, the annotation
 * array and the structure itself.  A NULL argument is ignored.
 */
void bns_destroy(bntseq_t *bns)
{
	int k;

	if (bns == 0) return;
	if (bns->fp_pac) err_fclose(bns->fp_pac);
	free(bns->ambs);
	for (k = 0; k < bns->n_seqs; ++k) {
		bntann1_t *ann = bns->anns + k;
		free(ann->name);
		free(ann->anno);
	}
	free(bns->anns);
	free(bns);
}
|
||||
|
||||
#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
|
||||
#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
|
||||
|
||||
/*
 * Append one sequence to the index being built.
 *
 * seq      the sequence just read (name, optional comment, bases)
 * bns      annotations; n_seqs, n_holes and l_pac are advanced
 * pac      packed 2-bit sequence buffer; may be reallocated
 * m_pac    capacity of pac in bases; doubled when full
 * m_seqs   capacity of bns->anns; doubled when full
 * m_holes  capacity of bns->ambs; doubled when full
 * q        in/out: the ambiguous-base run currently being extended
 *
 * Returns the (possibly moved) pac buffer; the caller must use the returned
 * pointer from then on.  Allocation results are not checked here.
 */
static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
{
	bntann1_t *p; // annotation record of the sequence (chromosome/contig) being added
	int i, lasts;
	if (bns->n_seqs == *m_seqs) { // annotation array is full: double its capacity (n_seqs counts sequences so far)
		*m_seqs <<= 1;
		bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
	}
	p = bns->anns + bns->n_seqs; // record for the sequence being read in
	p->name = strdup((char*)seq->name.s); // sequence name
	p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); // comment text, or the "(null)" placeholder
	p->gi = 0; p->len = seq->seq.l; // sequence length
	p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; // start of this sequence within the concatenation of all sequences
	p->n_ambs = 0; // number of ambiguous-base runs in this sequence
	p->is_alt = 0; // realloc'd records are not zeroed, so set the flag explicitly
	for (i = lasts = 0; i < seq->seq.l; ++i) { // walk the bases one by one
		// index through unsigned char: a plain (possibly signed) char >= 0x80
		// would be a negative index into the table
		int c = nst_nt4_table[(unsigned char)seq->seq.s[i]]; // base code
		if (c >= 4) { // N
			if (lasts == seq->seq.s[i]) { // contiguous N
				++(*q)->len; // extend the current run by one
			} else { // a new run of ambiguous bases starts here
				if (bns->n_holes == *m_holes) { // run array is full: double its capacity
					(*m_holes) <<= 1;
					bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
				}
				*q = bns->ambs + bns->n_holes;
				(*q)->len = 1;
				(*q)->offset = p->offset + i; // position of the run in the concatenated sequence
				(*q)->amb = seq->seq.s[i];
				++p->n_ambs; // runs in this sequence
				++bns->n_holes; // runs overall
			}
		}
		lasts = seq->seq.s[i]; // remember this character for the next iteration's comparison
		{ // fill buffer
			if (c >= 4) c = lrand48()&3; // an ambiguous base is stored as a random one of the four codes
			if (bns->l_pac == *m_pac) { // double the pac size
				*m_pac <<= 1;
				pac = realloc(pac, *m_pac/4);
				memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
			}
			_set_pac(pac, bns->l_pac, c);
			++bns->l_pac;
		}
	}
	++bns->n_seqs;
	return pac;
}
|
||||
|
||||
/*
 * Convert a FASTA stream into the packed index files "<prefix>.pac",
 * "<prefix>.ann" and "<prefix>.amb".
 *
 * fp_fa     input stream with the sequences
 * prefix    path prefix of the output files; aborts via err_fatal() when
 *           the resulting file name does not fit the local buffer
 * for_only  non-zero: store the forward strand only; zero: also append the
 *           reverse complement
 *
 * Returns the number of bases written to the .pac file.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char name[1024];
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;
	int n;

	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	// bounded formatting instead of strcpy/strcat: prefix comes from the caller
	n = snprintf(name, sizeof(name), "%s.pac", prefix);
	if (n < 0 || (size_t)n >= sizeof(name))
		err_fatal(__func__, "File name prefix '%s' is too long\n", prefix);
	fp = xopen(name, "wb");
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	if (!for_only) { // add the reverse complemented sequence
		int64_t ll_pac = (bns->l_pac * 2 + 3) / 4 * 4; // doubled length, rounded up to whole bytes
		if (ll_pac > m_pac) pac = realloc(pac, ll_pac/4);
		// clear every byte past the forward strand: _set_pac only ORs bits in
		memset(pac + (bns->l_pac+3)/4, 0, (ll_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following code makes the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		// the last byte records how many bases the final data byte holds
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[])
|
||||
{
|
||||
int c, for_only = 0;
|
||||
gzFile fp;
|
||||
while ((c = getopt(argc, argv, "f")) >= 0) {
|
||||
switch (c) {
|
||||
case 'f': for_only = 1; break;
|
||||
}
|
||||
}
|
||||
if (argc == optind) {
|
||||
fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
|
||||
return 1;
|
||||
}
|
||||
fp = xzopen(argv[optind], "r");
|
||||
bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
|
||||
err_gzclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
|
||||
{
|
||||
int left, mid, right;
|
||||
if (pos_f >= bns->l_pac) return -1;
|
||||
left = 0; mid = 0; right = bns->n_seqs;
|
||||
while (left < right) { // binary search
|
||||
mid = (left + right) >> 1;
|
||||
if (pos_f >= bns->anns[mid].offset) {
|
||||
if (mid == bns->n_seqs - 1) break;
|
||||
if (pos_f < bns->anns[mid+1].offset) break; // bracketed
|
||||
left = mid + 1;
|
||||
} else right = mid;
|
||||
}
|
||||
return mid;
|
||||
}
|
||||
|
||||
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
|
||||
{
|
||||
int is_rev, rid_b, rid_e;
|
||||
if (rb < bns->l_pac && re > bns->l_pac) return -2;
|
||||
assert(rb <= re);
|
||||
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
|
||||
rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
|
||||
return rid_b == rid_e? rid_b : -1;
|
||||
}
|
||||
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
||||
{
|
||||
int left, mid, right, nn;
|
||||
if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
|
||||
left = 0; right = bns->n_holes; nn = 0;
|
||||
while (left < right) {
|
||||
mid = (left + right) >> 1;
|
||||
if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
|
||||
else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
|
||||
else { // overlap
|
||||
if (pos_f >= bns->ambs[mid].offset) {
|
||||
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
|
||||
bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
|
||||
} else {
|
||||
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
|
||||
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return nn;
|
||||
}
|
||||
|
||||
/**
 * Extract the bases of [beg, end) from a packed sequence into a new buffer.
 *
 * Coordinates at or above l_pac address the reverse strand and are produced
 * by complementing (3 - code) the mirrored forward positions. The interval is
 * reordered if end < beg and clamped to [0, 2 * l_pac].
 *
 * @param l_pac length of the forward strand
 * @param pac   packed sequence, read through _get_pac()
 * @param beg   interval start
 * @param end   interval end (exclusive)
 * @param len   receives the number of bases written; 0 when the interval
 *              bridges the forward-reverse boundary
 * @return malloc'ed buffer with one code per base (caller frees), or 0 when
 *         the interval bridges the boundary
 */
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
	const int64_t both = l_pac << 1;
	uint8_t *out, *w;
	int64_t pos;

	if (end < beg) { // if end is smaller, swap
		int64_t t = beg;
		beg = end;
		end = t;
	}
	if (end > both) end = both;
	if (beg < 0) beg = 0;
	if (beg < l_pac && end > l_pac) { // bridging the forward-reverse boundary: return nothing
		*len = 0;
		return 0;
	}
	*len = end - beg;
	out = malloc(end - beg);
	w = out;
	if (beg >= l_pac) { // reverse strand
		const int64_t stop = both - 1 - end;
		for (pos = both - 1 - beg; pos > stop; --pos)
			*w++ = 3 - _get_pac(pac, pos);
	} else { // forward strand
		for (pos = beg; pos < end; ++pos)
			*w++ = _get_pac(pac, pos);
	}
	return out;
}
|
||||
|
||||
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid)
|
||||
{
|
||||
int64_t far_beg, far_end, len;
|
||||
int is_rev;
|
||||
uint8_t *seq;
|
||||
|
||||
if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap
|
||||
assert(*beg <= mid && mid < *end);
|
||||
*rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev));
|
||||
far_beg = bns->anns[*rid].offset;
|
||||
far_end = far_beg + bns->anns[*rid].len;
|
||||
if (is_rev) { // flip to the reverse strand
|
||||
int64_t tmp = far_beg;
|
||||
far_beg = (bns->l_pac<<1) - far_end;
|
||||
far_end = (bns->l_pac<<1) - tmp;
|
||||
}
|
||||
*beg = *beg > far_beg? *beg : far_beg;
|
||||
*end = *end < far_end? *end : far_end;
|
||||
seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len);
|
||||
if (seq == 0 || *end - *beg != len) {
|
||||
fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n",
|
||||
__func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end);
|
||||
}
|
||||
assert(seq && *end - *beg == len); // assertion failure should never happen
|
||||
return seq;
|
||||
}
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BWT_BNTSEQ_H
|
||||
#define BWT_BNTSEQ_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#ifndef BWA_UBYTE
|
||||
#define BWA_UBYTE
|
||||
typedef uint8_t ubyte_t;
|
||||
#endif
|
||||
|
||||
/* Annotation record for one reference sequence. */
typedef struct {
	int64_t offset;  /* start of this sequence in the concatenated coordinate */
	int32_t len;     /* sequence length in bases */
	int32_t n_ambs;  /* number of ambiguous-base runs in this sequence */
	uint32_t gi;
	int32_t is_alt;  /* NOTE(review): presumably flags an ALT contig; confirm against users */
	char *name;      /* sequence name */
	char *anno;      /* annotation/comment text */
} bntann1_t;
|
||||
|
||||
/* One run of ambiguous (non-ACGT) bases. */
typedef struct {
	int64_t offset;  /* start of the run in the concatenated coordinate */
	int32_t len;     /* number of bases in the run */
	char amb;        /* the ambiguous character as it appeared in the input */
} bntamb1_t;
|
||||
|
||||
typedef struct {
|
||||
int64_t l_pac;
|
||||
int32_t n_seqs;
|
||||
uint32_t seed;
|
||||
bntann1_t *anns; // n_seqs elements 染色体
|
||||
int32_t n_holes;
|
||||
bntamb1_t *ambs; // n_holes elements 非AGCT字符
|
||||
FILE *fp_pac;
|
||||
} bntseq_t;
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void bns_dump(const bntseq_t *bns, const char *prefix);
|
||||
bntseq_t *bns_restore(const char *prefix);
|
||||
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
|
||||
void bns_destroy(bntseq_t *bns);
|
||||
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
|
||||
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
|
||||
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
|
||||
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
|
||||
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev)
|
||||
{
|
||||
return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - 1 - pos : pos;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,871 @@
|
|||
.TH bwa 1 "23 October 2017" "bwa-0.7.17-r1188" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
bwa - Burrows-Wheeler Alignment Tool
|
||||
.SH SYNOPSIS
|
||||
.PP
|
||||
bwa index ref.fa
|
||||
.PP
|
||||
bwa mem ref.fa reads.fq > aln-se.sam
|
||||
.PP
|
||||
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||
.PP
|
||||
bwa aln ref.fa short_read.fq > aln_sa.sai
|
||||
.PP
|
||||
bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
|
||||
.PP
|
||||
bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
|
||||
.PP
|
||||
bwa bwasw ref.fa long_read.fq > aln.sam
|
||||
|
||||
.SH DESCRIPTION
|
||||
.PP
|
||||
BWA is a software package for mapping low-divergent sequences against a large
|
||||
reference genome, such as the human genome. It consists of three algorithms:
|
||||
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
|
||||
sequence reads up to 100bp, while the other two are for longer sequences ranging from
|
||||
70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
|
||||
support and split alignment, but BWA-MEM, which is the latest, is generally
|
||||
recommended for high-quality queries as it is faster and more accurate.
|
||||
BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
|
||||
reads.
|
||||
|
||||
For all the algorithms, BWA first needs to construct the FM-index for
|
||||
the reference genome (the
|
||||
.B index
|
||||
command). Alignment algorithms are invoked with different sub-commands:
|
||||
.BR aln / samse / sampe
|
||||
for BWA-backtrack,
|
||||
.B bwasw
|
||||
for BWA-SW and
|
||||
.B mem
|
||||
for the BWA-MEM algorithm.
|
||||
|
||||
.SH COMMANDS AND OPTIONS
|
||||
.TP
|
||||
.B index
|
||||
.B bwa index
|
||||
.RB [ -p
|
||||
.IR prefix ]
|
||||
.RB [ -a
|
||||
.IR algoType ]
|
||||
.I db.fa
|
||||
|
||||
Index database sequences in the FASTA format.
|
||||
|
||||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 10
|
||||
.BI -p \ STR
|
||||
Prefix of the output database [same as db filename]
|
||||
.TP
|
||||
.BI -a \ STR
|
||||
Algorithm for constructing BWT index. BWA implements three algorithms for BWT
|
||||
construction:
|
||||
.BR is ,
|
||||
.B bwtsw
|
||||
and
|
||||
.BR rb2 .
|
||||
The first algorithm is a little faster for small database but requires large
|
||||
RAM and does not work for databases with total length longer than 2GB. The
|
||||
second algorithm is adapted from the BWT-SW source code. It in theory works
|
||||
with database with trillions of bases. When this option is not specified, the
|
||||
appropriate algorithm will be chosen automatically.
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B mem
|
||||
.B bwa mem
|
||||
.RB [ -aCHjMpP ]
|
||||
.RB [ -t
|
||||
.IR nThreads ]
|
||||
.RB [ -k
|
||||
.IR minSeedLen ]
|
||||
.RB [ -w
|
||||
.IR bandWidth ]
|
||||
.RB [ -d
|
||||
.IR zDropoff ]
|
||||
.RB [ -r
|
||||
.IR seedSplitRatio ]
|
||||
.RB [ -c
|
||||
.IR maxOcc ]
|
||||
.RB [ -D
|
||||
.IR chainShadow ]
|
||||
.RB [ -m
|
||||
.IR maxMateSW ]
|
||||
.RB [ -W
|
||||
.IR minSeedMatch ]
|
||||
.RB [ -A
|
||||
.IR matchScore ]
|
||||
.RB [ -B
|
||||
.IR mmPenalty ]
|
||||
.RB [ -O
|
||||
.IR gapOpenPen ]
|
||||
.RB [ -E
|
||||
.IR gapExtPen ]
|
||||
.RB [ -L
|
||||
.IR clipPen ]
|
||||
.RB [ -U
|
||||
.IR unpairPen ]
|
||||
.RB [ -x
|
||||
.IR readType ]
|
||||
.RB [ -R
|
||||
.IR RGline ]
|
||||
.RB [ -H
|
||||
.IR HDlines ]
|
||||
.RB [ -v
|
||||
.IR verboseLevel ]
|
||||
.I db.prefix
|
||||
.I reads.fq
|
||||
.RI [ mates.fq ]
|
||||
|
||||
Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
|
||||
algorithm works by seeding alignments with maximal exact matches (MEMs) and
|
||||
then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
|
||||
|
||||
If
|
||||
.I mates.fq
|
||||
file is absent and option
|
||||
.B -p
|
||||
is not set, this command regards input reads as single-end. If
|
||||
.I mates.fq
|
||||
is present, this command assumes the
|
||||
.IR i -th
|
||||
read in
|
||||
.I reads.fq
|
||||
and the
|
||||
.IR i -th
|
||||
read in
|
||||
.I mates.fq
|
||||
constitute a read pair. If
|
||||
.B -p
|
||||
is used, the command assumes the
|
||||
.RI 2 i -th
|
||||
and the
|
||||
.RI (2 i +1)-th
|
||||
read in
|
||||
.I reads.fq
|
||||
constitute a read pair (such input file is said to be interleaved). In this case,
|
||||
.I mates.fq
|
||||
is ignored. In the paired-end mode, the
|
||||
.B mem
|
||||
command will infer the read orientation and the insert size distribution from a
|
||||
batch of reads.
|
||||
|
||||
The BWA-MEM algorithm performs local alignment. It may produce multiple primary
|
||||
alignments for different part of a query sequence. This is a crucial feature
|
||||
for long sequences. However, some tools may not work with split alignments.
|
||||
One may consider to use option
|
||||
.B -M
|
||||
to flag shorter split hits as secondary.
|
||||
|
||||
.RS
|
||||
.TP 10
|
||||
.B ALGORITHM OPTIONS:
|
||||
.TP
|
||||
.BI -t \ INT
|
||||
Number of threads [1]
|
||||
.TP
|
||||
.BI -k \ INT
|
||||
Minimum seed length. Matches shorter than
|
||||
.I INT
|
||||
will be missed. The alignment speed is usually insensitive to this value unless
|
||||
it significantly deviates from 20. [19]
|
||||
.TP
|
||||
.BI -w \ INT
|
||||
Band width. Essentially, gaps longer than
|
||||
.I INT
|
||||
will not be found. Note that the maximum gap length is also affected by the
|
||||
scoring matrix and the hit length, not solely determined by this option. [100]
|
||||
.TP
|
||||
.BI -d \ INT
|
||||
Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between
|
||||
the best and the current extension score is above
|
||||
.RI | i - j |* A + INT ,
|
||||
where
|
||||
.I i
|
||||
and
|
||||
.I j
|
||||
are the current positions of the query and reference, respectively, and
|
||||
.I A
|
||||
is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it
|
||||
doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not
|
||||
only avoids unnecessary extension, but also reduces poor alignments inside a
|
||||
long good alignment. [100]
|
||||
.TP
|
||||
.BI -r \ FLOAT
|
||||
Trigger re-seeding for a MEM longer than
|
||||
.IR minSeedLen * FLOAT .
|
||||
This is a key heuristic parameter for tuning the performance. Larger value
|
||||
yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
|
||||
.TP
|
||||
.BI -c \ INT
|
||||
Discard a MEM if it has more than
|
||||
.I INT
|
||||
occurrences in the genome. This is an insensitive parameter. [500]
|
||||
.TP
|
||||
.BI -D \ FLOAT
|
||||
Drop chains shorter than
|
||||
.I FLOAT
|
||||
fraction of the longest overlapping chain [0.5]
|
||||
.TP
|
||||
.BI -m \ INT
|
||||
Perform at most
|
||||
.I INT
|
||||
rounds of mate-SW [50]
|
||||
.TP
|
||||
.BI -W \ INT
|
||||
Drop a chain if the number of bases in seeds is smaller than
|
||||
.IR INT .
|
||||
This option is primarily used for longer contigs/reads. When positive, it also
|
||||
affects seed filtering. [0]
|
||||
.TP
|
||||
.B -P
|
||||
In the paired-end mode, perform SW to rescue missing hits only but do not try to find
|
||||
hits that fit a proper pair.
|
||||
|
||||
.TP
|
||||
.B SCORING OPTIONS:
|
||||
.TP
|
||||
.BI -A \ INT
|
||||
Matching score. [1]
|
||||
.TP
|
||||
.BI -B \ INT
|
||||
Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
|
||||
.TP
|
||||
.BI -O \ INT[,INT]
|
||||
Gap open penalty. If two numbers are specified, the first is the penalty of
|
||||
opening a deletion and the second for opening an insertion. [6]
|
||||
.TP
|
||||
.BI -E \ INT[,INT]
|
||||
Gap extension penalty. If two numbers are specified, the first is the penalty
|
||||
of extending a deletion and second for extending an insertion. A gap of length
|
||||
k costs O + k*E (i.e.
|
||||
.B -O
|
||||
is for opening a zero-length gap). [1]
|
||||
.TP
|
||||
.BI -L \ INT[,INT]
|
||||
Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
|
||||
score reaching the end of query. If this score is larger than the best SW score
|
||||
minus the clipping penalty, clipping will not be applied. Note that in this
|
||||
case, the SAM AS tag reports the best SW score; clipping penalty is not
|
||||
deducted. If two numbers are provided, the first is for 5'-end clipping and
|
||||
second for 3'-end clipping. [5]
|
||||
.TP
|
||||
.BI -U \ INT
|
||||
Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
|
||||
.RI scoreRead1+scoreRead2- INT
|
||||
and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
|
||||
two scores to determine whether we should force pairing. A larger value leads to
|
||||
more aggressive read pair. [17]
|
||||
.TP
|
||||
.BI -x \ STR
|
||||
Read type. Changes multiple parameters unless overridden [null]
|
||||
.RS
|
||||
.TP 10
|
||||
.BR pacbio :
|
||||
.B -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0
|
||||
(PacBio reads to ref)
|
||||
.TP
|
||||
.BR ont2d :
|
||||
.B -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0
|
||||
(Oxford Nanopore 2D-reads to ref)
|
||||
.TP
|
||||
.BR intractg :
|
||||
.B -B9 -O16 -L5
|
||||
(intra-species contigs to ref)
|
||||
.RE
|
||||
.TP
|
||||
.B INPUT/OUTPUT OPTIONS:
|
||||
.TP
|
||||
.B -p
|
||||
Smart pairing. If two adjacent reads have the same name, they are considered
|
||||
to form a read pair. This way, paired-end and single-end reads can be mixed
|
||||
in a single FASTA/Q stream.
|
||||
.TP
|
||||
.BI -R \ STR
|
||||
Complete read group header line. '\\t' can be used in
|
||||
.I STR
|
||||
and will be converted to a TAB in the output SAM. The read group ID will be
|
||||
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
|
||||
[null]
|
||||
.TP
|
||||
.BI -H \ ARG
|
||||
If ARG starts with @, it is interpreted as a string and gets inserted into the
|
||||
output SAM header; otherwise, ARG is interpreted as a file with all lines
|
||||
starting with @ in the file inserted into the SAM header. [null]
|
||||
.TP
|
||||
.BI -o \ FILE
|
||||
Write the output SAM file to
|
||||
.IR FILE .
|
||||
For compatibility with other BWA commands, this option may also be given as
|
||||
.B -f
|
||||
.IR FILE .
|
||||
[standard output]
|
||||
.TP
|
||||
.B -q
|
||||
Don't reduce the mapping quality of split alignment of lower alignment score.
|
||||
.TP
|
||||
.B -5
|
||||
For split alignment, mark the segment with the smallest coordinate as the
|
||||
primary. It automatically applies option
|
||||
.B -q
|
||||
as well. This option may help some Hi-C pipelines. By default, BWA-MEM marks
|
||||
highest scoring segment as primary.
|
||||
.TP
|
||||
.BI -K \ INT
|
||||
Process
|
||||
.I INT
|
||||
input bases in each batch regardless of the number of threads in use
|
||||
.RI [10000000* nThreads ].
|
||||
By default, the batch size is proportional to the number of threads in use.
|
||||
Because the inferred insert size distribution slightly depends on the batch
|
||||
size, using different number of threads may produce different output.
|
||||
Specifying this option helps reproducibility.
|
||||
.TP
|
||||
.BI -T \ INT
|
||||
Don't output alignment with score lower than
|
||||
.IR INT .
|
||||
This option affects output and occasionally SAM flag 2. [30]
|
||||
.TP
|
||||
.BI -j
|
||||
Treat ALT contigs as part of the primary assembly (i.e. ignore the
|
||||
.I db.prefix.alt
|
||||
file).
|
||||
.TP
|
||||
.BI -h \ INT[,INT2]
|
||||
If a query has not more than
|
||||
.I INT
|
||||
hits with score higher than 80% of the best hit, output them all in the XA tag.
|
||||
If
|
||||
.I INT2
|
||||
is specified, BWA-MEM outputs up to
|
||||
.I INT2
|
||||
hits if the list contains a hit to an ALT contig. [5,200]
|
||||
.TP
|
||||
.B -a
|
||||
Output all found alignments for single-end or unpaired paired-end reads. These
|
||||
alignments will be flagged as secondary alignments.
|
||||
.TP
|
||||
.B -C
|
||||
Append FASTA/Q comment to SAM output. This option can be used to
|
||||
transfer read meta information (e.g. barcode) to the SAM output. Note that the
|
||||
FASTA/Q comment (the string after a space in the header line) must conform the SAM
|
||||
spec (e.g. BC:Z:CGTAC). Malformed comments lead to incorrect SAM output.
|
||||
.TP
|
||||
.B -Y
|
||||
Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM
|
||||
uses soft clipping for the primary alignment and hard clipping for
|
||||
supplementary alignments.
|
||||
.TP
|
||||
.B -M
|
||||
Mark shorter split hits as secondary (for Picard compatibility).
|
||||
.TP
|
||||
.BI -v \ INT
|
||||
Control the verbosity level of the output. This option has not been fully
|
||||
supported throughout BWA. Ideally, a value 0 for disabling all the output to
|
||||
stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
|
||||
all normal messages; 4 or higher for debugging. When this option takes value
|
||||
4, the output is not SAM. [3]
|
||||
.TP
|
||||
.BI -I \ FLOAT[,FLOAT[,INT[,INT]]]
|
||||
Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma
|
||||
from the mean if absent) and min (4 sigma if absent) of the insert size
|
||||
distribution. Only applicable to the FR orientation. By default, BWA-MEM infers
|
||||
these numbers and the pair orientations given enough reads. [inferred]
|
||||
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B aln
|
||||
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
|
||||
nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
|
||||
[-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> >
|
||||
<out.sai>
|
||||
|
||||
Find the SA coordinates of the input reads. Maximum
|
||||
.I maxSeedDiff
|
||||
differences are allowed in the first
|
||||
.I seedLen
|
||||
subsequence and maximum
|
||||
.I maxDiff
|
||||
differences are allowed in the whole sequence.
|
||||
|
||||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 10
|
||||
.BI -n \ NUM
|
||||
Maximum edit distance if the value is INT, or the fraction of missing
|
||||
alignments given 2% uniform base error rate if FLOAT. In the latter
|
||||
case, the maximum edit distance is automatically chosen for different
|
||||
read lengths. [0.04]
|
||||
.TP
|
||||
.BI -o \ INT
|
||||
Maximum number of gap opens [1]
|
||||
.TP
|
||||
.BI -e \ INT
|
||||
Maximum number of gap extensions, -1 for k-difference mode (disallowing
|
||||
long gaps) [-1]
|
||||
.TP
|
||||
.BI -d \ INT
|
||||
Disallow a long deletion within INT bp towards the 3'-end [16]
|
||||
.TP
|
||||
.BI -i \ INT
|
||||
Disallow an indel within INT bp towards the ends [5]
|
||||
.TP
|
||||
.BI -l \ INT
|
||||
Take the first INT subsequence as seed. If INT is larger than the query
|
||||
sequence, seeding will be disabled. For long reads, this option is
|
||||
typically ranged from 25 to 35 for `-k 2'. [inf]
|
||||
.TP
|
||||
.BI -k \ INT
|
||||
Maximum edit distance in the seed [2]
|
||||
.TP
|
||||
.BI -t \ INT
|
||||
Number of threads (multi-threading mode) [1]
|
||||
.TP
|
||||
.BI -M \ INT
|
||||
Mismatch penalty. BWA will not search for suboptimal hits with a score
|
||||
lower than (bestScore-misMsc). [3]
|
||||
.TP
|
||||
.BI -O \ INT
|
||||
Gap open penalty [11]
|
||||
.TP
|
||||
.BI -E \ INT
|
||||
Gap extension penalty [4]
|
||||
.TP
|
||||
.BI -R \ INT
|
||||
Proceed with suboptimal alignments if there are no more than INT equally
|
||||
best hits. This option only affects paired-end mapping. Increasing this
|
||||
threshold helps to improve the pairing accuracy at the cost of speed,
|
||||
especially for short reads (~32bp).
|
||||
.TP
|
||||
.B -c
|
||||
Reverse query but not complement it, which is required for alignment in
|
||||
the color space. (Disabled since 0.6.x)
|
||||
.TP
|
||||
.B -N
|
||||
Disable iterative search. All hits with no more than
|
||||
.I maxDiff
|
||||
differences will be found. This mode is much slower than the default.
|
||||
.TP
|
||||
.BI -q \ INT
|
||||
Parameter for read trimming. BWA trims a read down to
|
||||
argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original
|
||||
read length. [0]
|
||||
.TP
|
||||
.B -I
|
||||
The input is in the Illumina 1.3+ read format (quality equals ASCII-64).
|
||||
.TP
|
||||
.BI -B \ INT
|
||||
Length of barcode starting from the 5'-end. When
|
||||
.I INT
|
||||
is positive, the barcode of each read will be trimmed before mapping and will
|
||||
be written at the
|
||||
.B BC
|
||||
SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]
|
||||
.TP
|
||||
.B -b
|
||||
Specify the input read sequence file is the BAM format. For paired-end
|
||||
data, two ends in a pair must be grouped together and options
|
||||
.B -1
|
||||
or
|
||||
.B -2
|
||||
are usually applied to specify which end should be mapped. Typical
|
||||
command lines for mapping pair-end data in the BAM format are:
|
||||
|
||||
bwa aln ref.fa -b1 reads.bam > 1.sai
|
||||
bwa aln ref.fa -b2 reads.bam > 2.sai
|
||||
bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam
|
||||
.TP
|
||||
.B -0
|
||||
When
|
||||
.B -b
|
||||
is specified, only use single-end reads in mapping.
|
||||
.TP
|
||||
.B -1
|
||||
When
|
||||
.B -b
|
||||
is specified, only use the first read in a read pair in mapping (skip
|
||||
single-end reads and the second reads).
|
||||
.TP
|
||||
.B -2
|
||||
When
|
||||
.B -b
|
||||
is specified, only use the second read in a read pair in mapping.
|
||||
.B
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B samse
|
||||
bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
|
||||
|
||||
Generate alignments in the SAM format given single-end reads. Repetitive
|
||||
hits will be randomly chosen.
|
||||
|
||||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 10
|
||||
.BI -n \ INT
|
||||
Maximum number of alignments to output in the XA tag for reads paired
|
||||
properly. If a read has more than INT hits, the XA tag will not be
|
||||
written. [3]
|
||||
.TP
|
||||
.BI -r \ STR
|
||||
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B sampe
|
||||
bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis]
|
||||
[-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
|
||||
|
||||
Generate alignments in the SAM format given paired-end reads. Repetitive
|
||||
read pairs will be placed randomly.
|
||||
|
||||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 8
|
||||
.BI -a \ INT
|
||||
Maximum insert size for a read pair to be considered being mapped
|
||||
properly. Since 0.4.5, this option is only used when there are not
|
||||
enough good alignment to infer the distribution of insert sizes. [500]
|
||||
.TP
|
||||
.BI -o \ INT
|
||||
Maximum occurrences of a read for pairing. A read with more occurrences
|
||||
will be treated as a single-end read. Reducing this parameter helps
|
||||
faster pairing. [100000]
|
||||
.TP
|
||||
.B -P
|
||||
Load the entire FM-index into memory to reduce disk operations
|
||||
(base-space reads only). With this option, at least 1.25N bytes of
|
||||
memory are required, where N is the length of the genome.
|
||||
.TP
|
||||
.BI -n \ INT
|
||||
Maximum number of alignments to output in the XA tag for reads paired
|
||||
properly. If a read has more than INT hits, the XA tag will not be
|
||||
written. [3]
|
||||
.TP
|
||||
.BI -N \ INT
|
||||
Maximum number of alignments to output in the XA tag for discordant
|
||||
read pairs (excluding singletons). If a read has more than INT hits, the
|
||||
XA tag will not be written. [10]
|
||||
.TP
|
||||
.BI -r \ STR
|
||||
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
|
||||
.RE
|
||||
|
||||
.TP
|
||||
.B bwasw
|
||||
bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
|
||||
nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N
|
||||
nHspRev] [-c thresCoef] <in.db.fasta> <in.fq> [mate.fq]
|
||||
|
||||
Align query sequences in the
|
||||
.I in.fq
|
||||
file. When
|
||||
.I mate.fq
|
||||
is present, perform paired-end alignment. The paired-end mode only works
|
||||
for reads from Illumina short-insert libraries. In the paired-end mode, BWA-SW
|
||||
may still output split alignments but they are all marked as not properly
|
||||
paired; the mate positions will not be written if the mate has multiple
|
||||
local hits.
|
||||
|
||||
.B OPTIONS:
|
||||
.RS
|
||||
.TP 10
|
||||
.BI -a \ INT
|
||||
Score of a match [1]
|
||||
.TP
|
||||
.BI -b \ INT
|
||||
Mismatch penalty [3]
|
||||
.TP
|
||||
.BI -q \ INT
|
||||
Gap open penalty [5]
|
||||
.TP
|
||||
.BI -r \ INT
|
||||
Gap extension penalty. The penalty for a contiguous gap of size k is
|
||||
q+k*r. [2]
|
||||
.TP
|
||||
.BI -t \ INT
|
||||
Number of threads in the multi-threading mode [1]
|
||||
.TP
|
||||
.BI -w \ INT
|
||||
Band width in the banded alignment [33]
|
||||
.TP
|
||||
.BI -T \ INT
|
||||
Minimum score threshold divided by a [37]
|
||||
.TP
|
||||
.BI -c \ FLOAT
|
||||
Coefficient for threshold adjustment according to query length. Given an
|
||||
l-long query, the threshold for a hit to be retained is
|
||||
a*max{T,c*log(l)}. [5.5]
|
||||
.TP
|
||||
.BI -z \ INT
|
||||
Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]
|
||||
.TP
|
||||
.BI -s \ INT
|
||||
Maximum SA interval size for initiating a seed. Higher -s increases
|
||||
accuracy at the cost of speed. [3]
|
||||
.TP
|
||||
.BI -N \ INT
|
||||
Minimum number of seeds supporting the resultant alignment to skip
|
||||
reverse alignment. [5]
|
||||
.RE
|
||||
|
||||
.SH SAM ALIGNMENT FORMAT
|
||||
.PP
|
||||
The output of the
|
||||
.B `aln'
|
||||
command is binary and designed for BWA use only. BWA outputs the final
|
||||
alignment in the SAM (Sequence Alignment/Map) format. Each line consists
|
||||
of:
|
||||
|
||||
.TS
|
||||
center box;
|
||||
cb | cb | cb
|
||||
n | l | l .
|
||||
Col Field Description
|
||||
_
|
||||
1 QNAME Query (pair) NAME
|
||||
2 FLAG bitwise FLAG
|
||||
3 RNAME Reference sequence NAME
|
||||
4 POS 1-based leftmost POSition/coordinate of clipped sequence
|
||||
5 MAPQ MAPping Quality (Phred-scaled)
|
||||
6	CIGAR	extended CIGAR string
|
||||
7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
|
||||
8	MPOS	1-based Mate POSition
|
||||
9 ISIZE Inferred insert SIZE
|
||||
10 SEQ query SEQuence on the same strand as the reference
|
||||
11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
|
||||
12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
|
||||
.TE
|
||||
|
||||
.PP
|
||||
Each bit in the FLAG field is defined as:
|
||||
|
||||
.TS
|
||||
center box;
|
||||
cb | cb | cb
|
||||
c | l | l .
|
||||
Chr Flag Description
|
||||
_
|
||||
p 0x0001 the read is paired in sequencing
|
||||
P 0x0002 the read is mapped in a proper pair
|
||||
u 0x0004 the query sequence itself is unmapped
|
||||
U 0x0008 the mate is unmapped
|
||||
r 0x0010 strand of the query (1 for reverse)
|
||||
R 0x0020 strand of the mate
|
||||
1 0x0040 the read is the first read in a pair
|
||||
2 0x0080 the read is the second read in a pair
|
||||
s 0x0100 the alignment is not primary
|
||||
f 0x0200 QC failure
|
||||
d 0x0400 optical or PCR duplicate
|
||||
S 0x0800 supplementary alignment
|
||||
.TE
|
||||
|
||||
.PP
|
||||
Please check <http://samtools.sourceforge.net> for the format
|
||||
specification and the tools for post-processing the alignment.
|
||||
|
||||
BWA generates the following optional fields. Tags starting with `X' are
|
||||
specific to BWA.
|
||||
|
||||
.TS
|
||||
center box;
|
||||
cb | cb
|
||||
cB | l .
|
||||
Tag Meaning
|
||||
_
|
||||
NM Edit distance
|
||||
MD Mismatching positions/bases
|
||||
AS Alignment score
|
||||
BC Barcode sequence
|
||||
SA Supplementary alignments
|
||||
_
|
||||
X0 Number of best hits
|
||||
X1 Number of suboptimal hits found by BWA
|
||||
XN Number of ambiguous bases in the reference
|
||||
XM Number of mismatches in the alignment
|
||||
XO Number of gap opens
|
||||
XG Number of gap extensions
|
||||
XT Type: Unique/Repeat/N/Mate-sw
|
||||
XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/
|
||||
_
|
||||
XS Suboptimal alignment score
|
||||
XF Support from forward/reverse alignment
|
||||
XE Number of supporting seeds
|
||||
.TE
|
||||
|
||||
.PP
|
||||
Note that XO and XG are generated by BWT search while the CIGAR string
|
||||
by Smith-Waterman alignment. These two tags may be inconsistent with the
|
||||
CIGAR string. This is not a bug.
|
||||
|
||||
.SH NOTES ON SHORT-READ ALIGNMENT
|
||||
.SS Alignment Accuracy
|
||||
.PP
|
||||
When seeding is disabled, BWA guarantees to find an alignment
|
||||
containing maximum
|
||||
.I maxDiff
|
||||
differences including
|
||||
.I maxGapO
|
||||
gap opens which do not occur within
|
||||
.I nIndelEnd
|
||||
bp towards either end of the query. Longer gaps may be found if
|
||||
.I maxGapE
|
||||
is positive, but it is not guaranteed to find all hits. When seeding is
|
||||
enabled, BWA further requires that the first
|
||||
.I seedLen
|
||||
subsequence contains no more than
|
||||
.I maxSeedDiff
|
||||
differences.
|
||||
.PP
|
||||
When gapped alignment is disabled, BWA is expected to generate the same
|
||||
alignment as Eland version 1, the Illumina alignment program. However, as BWA
|
||||
changes `N' in the database sequence to random nucleotides, hits to these
|
||||
random sequences will also be counted. As a consequence, BWA may mark a
|
||||
unique hit as a repeat, if the random sequences happen to be identical
|
||||
to the sequences which should be unique in the database.
|
||||
.PP
|
||||
By default, if the best hit is not highly repetitive (controlled by -R), BWA
|
||||
also finds all hits containing one more mismatch; otherwise, BWA finds all
|
||||
equally best hits only. Base quality is NOT considered in evaluating
|
||||
hits. In the paired-end mode, BWA pairs all hits it found. It further
|
||||
performs Smith-Waterman alignment for unmapped reads to rescue reads with a
|
||||
high error rate, and for high-quality anomalous pairs to fix potential alignment
|
||||
errors.
|
||||
|
||||
.SS Estimating Insert Size Distribution
|
||||
.PP
|
||||
BWA estimates the insert size distribution per 256*1024 read pairs. It
|
||||
first collects pairs of reads with both ends mapped with a single-end
|
||||
quality 20 or higher and then calculates median (Q2), lower and higher
|
||||
quartile (Q1 and Q3). It estimates the mean and the variance of the
|
||||
insert size distribution from pairs whose insert sizes are within
|
||||
interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
|
||||
considered to be properly paired (SAM flag 0x2) is calculated by solving
|
||||
equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the
|
||||
standard deviation of the insert size distribution, L is the length of the
|
||||
genome, p0 is prior of anomalous pair and Phi() is the standard
|
||||
cumulative distribution function. For mapping Illumina short-insert
|
||||
reads to the human genome, x is about 6-7 sigma away from the
|
||||
mean. Quartiles, mean, variance and x will be printed to the standard
|
||||
error output.
|
||||
|
||||
.SS Memory Requirement
|
||||
.PP
|
||||
With bwtsw algorithm, 5GB memory is required for indexing the complete
|
||||
human genome sequences. For short reads, the
|
||||
.B aln
|
||||
command uses ~3.2GB memory and the
|
||||
.B sampe
|
||||
command uses ~5.4GB.
|
||||
|
||||
.SS Speed
|
||||
.PP
|
||||
Indexing the human genome sequences takes 3 hours with bwtsw
|
||||
algorithm. Indexing smaller genomes with IS algorithms is
|
||||
faster, but requires more memory.
|
||||
.PP
|
||||
The speed of alignment is largely determined by the error rate of the query
|
||||
sequences (r). Firstly, BWA runs much faster for near perfect hits than
|
||||
for hits with many differences, and it stops searching for a hit with
|
||||
l+2 differences if a l-difference hit is found. This means BWA will be
|
||||
very slow if r is high because in this case BWA has to visit hits with
|
||||
many differences and looking for these hits is expensive. Secondly, the
|
||||
alignment algorithm behind makes the speed sensitive to [k log(N)/m],
|
||||
where k is the maximum allowed differences, N the size of database and m
|
||||
the length of a query. In practice, we choose k w.r.t. r and therefore r
|
||||
is the leading factor. I would not recommend to use BWA on data with
|
||||
r>0.02.
|
||||
.PP
|
||||
Pairing is slower for shorter reads. This is mainly because shorter
|
||||
reads have more spurious hits and converting SA coordinates to
|
||||
chromosomal coordinates is very costly.
|
||||
|
||||
.SH CHANGES IN BWA-0.6
|
||||
.PP
|
||||
Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
|
||||
This feature makes it possible to integrate the forward and reverse complemented
|
||||
genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff,
|
||||
BWA uses more memory because it has to keep all positions and ranks in 64-bit
|
||||
integers, twice larger than 32-bit integers used in the previous versions.
|
||||
|
||||
The latest BWA-SW also works for paired-end reads longer than 100bp. In
|
||||
comparison to BWA-short, BWA-SW tends to be more accurate for highly unique
|
||||
reads and more robust to relatively long INDELs and structural variants.
|
||||
Nonetheless, BWA-short usually has higher power to distinguish the optimal hit
|
||||
from many suboptimal hits. The choice of the mapping algorithm may depend on
|
||||
the application.
|
||||
|
||||
.SH SEE ALSO
|
||||
BWA website <http://bio-bwa.sourceforge.net>, Samtools website
|
||||
<http://samtools.sourceforge.net>
|
||||
|
||||
.SH AUTHOR
|
||||
Heng Li at the Sanger Institute wrote the key source codes and
|
||||
integrated the following codes for BWT construction: bwtsw
|
||||
<http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong at
|
||||
the University of Hong Kong and IS
|
||||
<http://yuta.256.googlepages.com/sais> originally proposed by Nong Ge
|
||||
<http://www.cs.sysu.edu.cn/nong/> at the Sun Yat-Sen University and
|
||||
implemented by Yuta Mori.
|
||||
|
||||
.SH LICENSE AND CITATION
|
||||
.PP
|
||||
The full BWA package is distributed under GPLv3 as it uses source codes
|
||||
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
|
||||
libraries are distributed under the MIT license.
|
||||
.PP
|
||||
If you use the BWA-backtrack algorithm, please cite the following
|
||||
paper:
|
||||
.PP
|
||||
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
|
||||
Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
|
||||
.PP
|
||||
If you use the BWA-SW algorithm, please cite:
|
||||
.PP
|
||||
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
|
||||
Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
|
||||
.PP
|
||||
If you use BWA-MEM or the fastmap component of BWA, please cite:
|
||||
.PP
|
||||
Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with
|
||||
BWA-MEM. arXiv:1303.3997v1 [q-bio.GN].
|
||||
.PP
|
||||
It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed
|
||||
journal.
|
||||
|
||||
.SH HISTORY
|
||||
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
|
||||
and mimics its binary file formats; BWA-SW resembles BWT-SW in several
|
||||
ways. The initial idea about BWT-based alignment also came from the
|
||||
group who developed BWT-SW. At the same time, BWA is different enough
|
||||
from BWT-SW. The short-read alignment algorithm bears no similarity to
|
||||
Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it
|
||||
introduces heuristics that can hardly be applied to the original
|
||||
algorithm. In all, BWA does not guarantee to find all local hits as what
|
||||
BWT-SW is designed to do, but it is much faster than BWT-SW on both
|
||||
short and long query sequences.
|
||||
|
||||
I started to write the first piece of codes on 24 May 2008 and got the
|
||||
initial stable version on 02 June 2008. During this period, I was
|
||||
acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper,
|
||||
was collaborating with Beijing Genomics Institute on SOAP2, the successor
|
||||
to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in
|
||||
November 2008. According to the SourceForge download page, the third
|
||||
BWT-based short read aligner, bowtie, was first released in August
|
||||
2008. At the time of writing this manual, at least three more BWT-based
|
||||
short-read aligners are being implemented.
|
||||
|
||||
The BWA-SW algorithm is a new component of BWA. It was conceived in
|
||||
November 2008 and implemented ten months later.
|
||||
|
||||
The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
|
||||
matches (SMEMs), which was first published with the fermi assembler paper
|
||||
in 2012. I first implemented the basic SMEM algorithm in the
|
||||
.B fastmap
|
||||
command for an experiment and then extended the basic algorithm and added the
|
||||
extension part in February 2013 to make BWA-MEM a fully featured mapper.
|
||||
|
||||
|
|
@ -0,0 +1,501 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include <assert.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "kvec.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int bwa_verbose = 3; // verbosity threshold; this file gates errors at >=1, warnings at >=2, messages at >=3 and the alignment debug dump at >=4
|
||||
int bwa_dbg = 0; // debug flags (presumably a bitwise OR of BWA_DBG_* values -- not read in this file)
|
||||
char bwa_rg_id[256]; // read-group ID extracted by bwa_set_rg(); all zeros when no valid @RG line was set
|
||||
char *bwa_pg; // extra header text printed last by bwa_print_sam_hdr() when non-NULL
|
||||
|
||||
/************************
|
||||
* Batch FASTA/Q reader *
|
||||
************************/
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
/**
 * Strip a trailing "/<digit>" read-number suffix (such as "/1" or "/2") from a
 * sequence name, in place.
 *
 * @param s  name buffer; when the suffix is present and the name is longer
 *           than two characters, the length is reduced by two and the string
 *           is re-terminated. Otherwise it is left untouched.
 */
static inline void trim_readno(kstring_t *s)
{
	if (s->l <= 2) return; // a bare "/1" (or shorter) is never trimmed
	// <ctype.h> classifiers are only defined for values representable as
	// unsigned char (or EOF); a negative plain char would be undefined.
	if (s->s[s->l - 2] == '/' && isdigit((unsigned char)s->s[s->l - 1])) {
		s->l -= 2;
		s->s[s->l] = 0;
	}
}
|
||||
|
||||
static inline char *dupkstring(const kstring_t *str, int dupempty)
|
||||
{
|
||||
char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL;
|
||||
if (!s) return NULL;
|
||||
|
||||
memcpy(s, str->s, str->l);
|
||||
s[str->l] = '\0';
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
 * Copy one parsed kseq record into a bseq1_t.
 *
 * name and seq are always duplicated (an empty one becomes ""), whereas
 * comment and qual are left NULL when empty. Only name, comment, seq, qual and
 * l_seq are written; the remaining fields of *s are not touched here.
 */
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
{
	// TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
	s->l_seq   = ks->seq.l;
	s->name    = dupkstring(&ks->name, 1);
	s->seq     = dupkstring(&ks->seq, 1);
	s->comment = dupkstring(&ks->comment, 0);
	s->qual    = dupkstring(&ks->qual, 0);
}
|
||||
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
||||
{
|
||||
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
|
||||
int size = 0, m, n;
|
||||
bseq1_t *seqs;
|
||||
m = n = 0; seqs = 0;
|
||||
while (kseq_read(ks) >= 0) {
|
||||
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
|
||||
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
||||
break;
|
||||
}
|
||||
if (n >= m) {
|
||||
m = m? m<<1 : 256;
|
||||
seqs = realloc(seqs, m * sizeof(bseq1_t));
|
||||
}
|
||||
trim_readno(&ks->name);
|
||||
kseq2bseq1(ks, &seqs[n]);
|
||||
seqs[n].id = n;
|
||||
size += seqs[n++].l_seq;
|
||||
if (ks2) {
|
||||
trim_readno(&ks2->name);
|
||||
kseq2bseq1(ks2, &seqs[n]);
|
||||
seqs[n].id = n;
|
||||
size += seqs[n++].l_seq;
|
||||
}
|
||||
if (size >= chunk_size && (n&1) == 0) break;
|
||||
}
|
||||
if (size == 0) { // test if the 2nd file is finished
|
||||
if (ks2 && kseq_read(ks2) >= 0)
|
||||
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
||||
}
|
||||
*n_ = n;
|
||||
return seqs;
|
||||
}
|
||||
|
||||
/**
 * Split a batch of reads into unpaired reads and adjacent pairs.
 *
 * Two consecutive records with identical names are treated as a pair; every
 * other record is treated as a single read. Records are copied by value into
 * two newly grown arrays.
 *
 * @param n     number of records in seqs (may be 0)
 * @param seqs  input records
 * @param m     m[0] receives the number of single reads, m[1] the number of
 *              records belonging to pairs
 * @param sep   sep[0] receives the single reads, sep[1] the paired reads
 *              (either may be NULL when empty)
 */
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
{
	int i, has_last;
	kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};

	// Empty input: without this guard the trailing push below would read
	// seqs[0], which does not exist (seqs may even be NULL).
	if (n <= 0) {
		sep[0] = sep[1] = 0;
		m[0] = m[1] = 0;
		return;
	}

	// has_last: seqs[i-1] has not been assigned to either group yet
	for (i = 1, has_last = 1; i < n; ++i) {
		if (has_last) {
			if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
				kv_push(bseq1_t, a[1], seqs[i-1]);
				kv_push(bseq1_t, a[1], seqs[i]);
				has_last = 0; // seqs[i] is already consumed as the mate
			} else kv_push(bseq1_t, a[0], seqs[i-1]);
		} else has_last = 1;
	}
	if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]); // the final record is unpaired

	sep[0] = a[0].a, m[0] = a[0].n;
	sep[1] = a[1].a, m[1] = a[1].n;
}
|
||||
|
||||
/*****************
|
||||
* CIGAR related *
|
||||
*****************/
|
||||
|
||||
/**
 * Fill a 5x5 scoring matrix in row-major order.
 *
 * Rows and columns 0..3 are scored +a on the diagonal and -b elsewhere; any
 * cell in row 4 or column 4 (the ambiguous base) is scored -1.
 *
 * @param a    match score
 * @param b    mismatch penalty (stored negated)
 * @param mat  output array of 25 entries
 */
void bwa_fill_scmat(int a, int b, int8_t mat[25])
{
	int row, col;
	for (row = 0; row < 5; ++row) {
		for (col = 0; col < 5; ++col) {
			int8_t *cell = &mat[row * 5 + col];
			if (row == 4 || col == 4) *cell = -1; // ambiguous base
			else if (row == col) *cell = a;
			else *cell = -b;
		}
	}
}
|
||||
|
||||
// Generate CIGAR when the alignment end points are known
|
||||
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
|
||||
{
|
||||
uint32_t *cigar = 0;
|
||||
uint8_t tmp, *rseq;
|
||||
int i;
|
||||
int64_t rlen;
|
||||
kstring_t str;
|
||||
const char *int2base;
|
||||
|
||||
if (n_cigar) *n_cigar = 0;
|
||||
if (NM) *NM = -1;
|
||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
|
||||
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
|
||||
for (i = 0; i < l_query>>1; ++i)
|
||||
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
||||
for (i = 0; i < rlen>>1; ++i)
|
||||
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
||||
}
|
||||
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
|
||||
// UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
|
||||
if (n_cigar) {
|
||||
cigar = malloc(4);
|
||||
cigar[0] = l_query<<4 | 0;
|
||||
*n_cigar = 1;
|
||||
}
|
||||
for (i = 0, *score = 0; i < l_query; ++i)
|
||||
*score += mat[rseq[i]*5 + query[i]];
|
||||
} else {
|
||||
int w, max_gap, max_ins, max_del, min_w;
|
||||
// set the band-width
|
||||
max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
|
||||
max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
|
||||
max_gap = max_ins > max_del? max_ins : max_del;
|
||||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = (max_gap + abs((int)rlen - l_query) + 1) >> 1;
|
||||
w = w < w_? w : w_;
|
||||
min_w = abs((int)rlen - l_query) + 3;
|
||||
w = w > min_w? w : min_w;
|
||||
// NW alignment
|
||||
if (bwa_verbose >= 4) {
|
||||
printf("* Global bandwidth: %d\n", w);
|
||||
printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
||||
printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
||||
}
|
||||
*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
|
||||
}
|
||||
if (NM && n_cigar) {// compute NM and MD
|
||||
int k, x, y, u, n_mm = 0, n_gap = 0;
|
||||
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
|
||||
int2base = rb < l_pac? "ACGTN" : "TGCAN";
|
||||
for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
|
||||
int op, len;
|
||||
cigar = (uint32_t*)str.s;
|
||||
op = cigar[k]&0xf, len = cigar[k]>>4;
|
||||
if (op == 0) { // match
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (query[x + i] != rseq[y + i]) {
|
||||
kputw(u, &str);
|
||||
kputc(int2base[rseq[y+i]], &str);
|
||||
++n_mm; u = 0;
|
||||
} else ++u;
|
||||
}
|
||||
x += len; y += len;
|
||||
} else if (op == 2) { // deletion
|
||||
if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR
|
||||
kputw(u, &str); kputc('^', &str);
|
||||
for (i = 0; i < len; ++i)
|
||||
kputc(int2base[rseq[y+i]], &str);
|
||||
u = 0; n_gap += len;
|
||||
}
|
||||
y += len;
|
||||
} else if (op == 1) x += len, n_gap += len; // insertion
|
||||
}
|
||||
kputw(u, &str); kputc(0, &str);
|
||||
*NM = n_mm + n_gap;
|
||||
cigar = (uint32_t*)str.s;
|
||||
}
|
||||
if (rb >= l_pac) // reverse back query
|
||||
for (i = 0; i < l_query>>1; ++i)
|
||||
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
||||
|
||||
ret_gen_cigar:
|
||||
free(rseq);
|
||||
return cigar;
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper around bwa_gen_cigar2() that uses the same gap open
 * penalty (q) and gap extension penalty (r) for both deletions and insertions.
 * All other arguments and the return value are passed through unchanged.
 */
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
	const int o_del = q, e_del = r;
	const int o_ins = q, e_ins = r;
	return bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w_, l_pac, pac,
	                      l_query, query, rb, re, score, n_cigar, NM);
}
|
||||
|
||||
/*********************
|
||||
* Full index reader *
|
||||
*********************/
|
||||
|
||||
/**
 * Work out the index file prefix from a user-supplied hint.
 *
 * Two candidates are probed in order by trying to open the file for reading:
 * "<hint>.64.bwt" (prefix "<hint>.64") and then "<hint>.bwt" (prefix
 * "<hint>").
 *
 * @param hint  path given by the user, without the ".bwt" extension
 * @return      malloc'ed prefix string which the caller must free(), or NULL
 *              when neither file can be opened or memory is exhausted
 */
char *bwa_idx_infer_prefix(const char *hint)
{
	static const char *const infix[2] = { ".64", "" };
	size_t l_hint = strlen(hint);
	size_t i;
	// room for the hint, the longest infix (".64"), ".bwt" and the terminator
	char *prefix = malloc(l_hint + 3 + 4 + 1);

	if (prefix == NULL) return 0; // malloc wrappers are optional, so check explicitly
	memcpy(prefix, hint, l_hint);
	for (i = 0; i < 2; ++i) {
		size_t l_prefix = l_hint + strlen(infix[i]);
		FILE *fp;
		strcpy(prefix + l_hint, infix[i]);
		strcpy(prefix + l_prefix, ".bwt");
		if ((fp = fopen(prefix, "rb")) != 0) {
			fclose(fp);
			prefix[l_prefix] = 0; // drop ".bwt", keep the infix
			return prefix;
		}
	}
	free(prefix);
	return 0;
}
|
||||
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint)
|
||||
{
|
||||
char *tmp, *prefix;
|
||||
bwt_t *bwt;
|
||||
prefix = bwa_idx_infer_prefix(hint);
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
tmp = calloc(strlen(prefix) + 5, 1);
|
||||
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
||||
bwt = bwt_restore_bwt(tmp);
|
||||
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
||||
bwt_restore_sa(tmp, bwt);
|
||||
free(tmp); free(prefix);
|
||||
return bwt;
|
||||
}
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
||||
{
|
||||
bwaidx_t *idx;
|
||||
char *prefix;
|
||||
prefix = bwa_idx_infer_prefix(hint);
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||
if (which & BWA_IDX_BNS) {
|
||||
int i, c;
|
||||
idx->bns = bns_restore(prefix);
|
||||
for (i = c = 0; i < idx->bns->n_seqs; ++i)
|
||||
if (idx->bns->anns[i].is_alt) ++c;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
|
||||
if (which & BWA_IDX_PAC) {
|
||||
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
||||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||
err_fclose(idx->bns->fp_pac);
|
||||
idx->bns->fp_pac = 0;
|
||||
}
|
||||
}
|
||||
free(prefix);
|
||||
return idx;
|
||||
}
|
||||
|
||||
/**
 * Load an index. In this file this simply forwards to
 * bwa_idx_load_from_disk() with the same arguments.
 *
 * @param hint   index path as given by the user
 * @param which  bitwise OR of BWA_IDX_* flags selecting the components
 * @return       whatever bwa_idx_load_from_disk() returns
 */
bwaidx_t *bwa_idx_load(const char *hint, int which)
{
	bwaidx_t *idx = bwa_idx_load_from_disk(hint, which);
	return idx;
}
|
||||
|
||||
/**
 * Release an index created by the loaders or by bwa_mem2idx().
 *
 * When idx->mem is set the components point into that single block, so only
 * the separately allocated headers (bwt, bns->anns, bns) are freed, and the
 * block itself is freed unless is_shm is set. Otherwise each component is
 * destroyed individually.
 *
 * @param idx  index to destroy; NULL is accepted and ignored
 */
void bwa_idx_destroy(bwaidx_t *idx)
{
	if (!idx) return;

	if (idx->mem) {
		free(idx->bwt);
		free(idx->bns->anns);
		free(idx->bns);
		if (!idx->is_shm) free(idx->mem);
	} else {
		if (idx->bwt) bwt_destroy(idx->bwt);
		if (idx->bns) bns_destroy(idx->bns);
		free(idx->pac); // free(NULL) is a no-op
	}
	free(idx);
}
|
||||
|
||||
/**
 * Build an index view on top of one contiguous memory block.
 *
 * The block is walked in this order: bwt_t header, BWT words (bwt_size * 4
 * bytes), sampled suffix array (n_sa entries), bntseq_t header, ambiguity
 * records (n_holes), annotation records (n_seqs), the name/anno strings of
 * each annotation, and finally the packed sequence (l_pac/4+1 bytes). The two
 * headers and the annotation array are copied to the heap; everything else
 * points into mem.
 *
 * @param l_mem  expected size of the block; checked with assert()
 * @param mem    the block; ownership is recorded in idx->mem
 * @param idx    index to fill in
 * @return       always 0
 */
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
{
	int64_t off = 0, sz;
	int i;

	// generate idx->bwt
	sz = sizeof(bwt_t);
	idx->bwt = malloc(sz);
	memcpy(idx->bwt, mem + off, sz);
	off += sz;

	sz = idx->bwt->bwt_size * 4;
	idx->bwt->bwt = (uint32_t*)(mem + off);
	off += sz;

	sz = idx->bwt->n_sa * sizeof(bwtint_t);
	idx->bwt->sa = (bwtint_t*)(mem + off);
	off += sz;

	// generate idx->bns and idx->pac
	sz = sizeof(bntseq_t);
	idx->bns = malloc(sz);
	memcpy(idx->bns, mem + off, sz);
	off += sz;

	sz = idx->bns->n_holes * sizeof(bntamb1_t);
	idx->bns->ambs = (bntamb1_t*)(mem + off);
	off += sz;

	sz = idx->bns->n_seqs * sizeof(bntann1_t);
	idx->bns->anns = malloc(sz);
	memcpy(idx->bns->anns, mem + off, sz);
	off += sz;

	// the copied annotations still carry stale pointers; re-point them at the
	// NUL-terminated strings stored in the block
	for (i = 0; i < idx->bns->n_seqs; ++i) {
		idx->bns->anns[i].name = (char*)(mem + off);
		off += strlen(idx->bns->anns[i].name) + 1;
		idx->bns->anns[i].anno = (char*)(mem + off);
		off += strlen(idx->bns->anns[i].anno) + 1;
	}

	idx->pac = (uint8_t*)(mem + off);
	off += idx->bns->l_pac/4+1;
	assert(off == l_mem);

	idx->l_mem = off;
	idx->mem = mem;
	return 0;
}
|
||||
|
||||
/**
 * Pack a component-wise index into one contiguous memory block.
 *
 * The block is grown with realloc() starting from the BWT array itself, and
 * the remaining components are appended in the order bwa_mem2idx() expects.
 * Every original component is freed as soon as it has been copied, and the
 * index is finally rebuilt on top of the block with bwa_mem2idx().
 *
 * @param idx  index with bwt, bns and pac all loaded; modified in place
 * @return     the return value of bwa_mem2idx()
 */
int bwa_idx2mem(bwaidx_t *idx)
{
	int i;
	int64_t off, sz, bns_extra;
	uint8_t *mem;

	// copy idx->bwt: reuse the BWT allocation and shift it up to make room
	// for the bwt_t header in front of it
	sz = idx->bwt->bwt_size * 4;
	mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + sz);
	idx->bwt->bwt = 0;
	memmove(mem + sizeof(bwt_t), mem, sz);
	memcpy(mem, idx->bwt, sizeof(bwt_t));
	off = sizeof(bwt_t) + sz;

	sz = idx->bwt->n_sa * sizeof(bwtint_t);
	mem = realloc(mem, off + sz);
	memcpy(mem + off, idx->bwt->sa, sz);
	off += sz;
	free(idx->bwt->sa);
	free(idx->bwt);
	idx->bwt = 0;

	// copy idx->bns
	bns_extra = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
	for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
		bns_extra += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
	mem = realloc(mem, off + sizeof(bntseq_t) + bns_extra);

	sz = sizeof(bntseq_t);
	memcpy(mem + off, idx->bns, sz);
	off += sz;

	sz = idx->bns->n_holes * sizeof(bntamb1_t);
	memcpy(mem + off, idx->bns->ambs, sz);
	off += sz;
	free(idx->bns->ambs);

	sz = idx->bns->n_seqs * sizeof(bntann1_t);
	memcpy(mem + off, idx->bns->anns, sz);
	off += sz;

	for (i = 0; i < idx->bns->n_seqs; ++i) {
		sz = strlen(idx->bns->anns[i].name) + 1;
		memcpy(mem + off, idx->bns->anns[i].name, sz);
		off += sz;
		sz = strlen(idx->bns->anns[i].anno) + 1;
		memcpy(mem + off, idx->bns->anns[i].anno, sz);
		off += sz;
		free(idx->bns->anns[i].name);
		free(idx->bns->anns[i].anno);
	}
	free(idx->bns->anns);

	// copy idx->pac
	sz = idx->bns->l_pac/4+1;
	mem = realloc(mem, off + sz);
	memcpy(mem + off, idx->pac, sz);
	off += sz;
	free(idx->bns);
	idx->bns = 0;
	free(idx->pac);
	idx->pac = 0;

	return bwa_mem2idx(off, mem, idx);
}
|
||||
|
||||
/***********************
|
||||
* SAM header routines *
|
||||
***********************/
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
|
||||
{
|
||||
int i, n_HD = 0, n_SQ = 0;
|
||||
extern char *bwa_pg;
|
||||
|
||||
if (hdr_line) {
|
||||
// check for HD line
|
||||
const char *p = hdr_line;
|
||||
if ((p = strstr(p, "@HD")) != 0) {
|
||||
++n_HD;
|
||||
}
|
||||
// check for SQ lines
|
||||
p = hdr_line;
|
||||
while ((p = strstr(p, "@SQ\t")) != 0) {
|
||||
if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
|
||||
p += 4;
|
||||
}
|
||||
}
|
||||
if (n_SQ == 0) {
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len);
|
||||
if (bns->anns[i].is_alt) err_printf("\tAH:*\n");
|
||||
else err_fputc('\n', stdout);
|
||||
}
|
||||
} else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
|
||||
if (n_HD == 0) {
|
||||
err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
|
||||
}
|
||||
if (hdr_line) err_printf("%s\n", hdr_line);
|
||||
if (bwa_pg) err_printf("%s\n", bwa_pg);
|
||||
}
|
||||
|
||||
/**
 * Expand backslash escapes in place.
 *
 * "\t", "\n", "\r" and "\\" are replaced by the character they denote. Any
 * other escape sequence is dropped, backslash and following character both.
 * A lone backslash at the very end of the string is dropped as well.
 *
 * @param s  NUL-terminated string, modified in place (it can only shrink)
 * @return   s
 */
static char *bwa_escape(char *s)
{
	char *p, *q;
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p;
		// A trailing backslash leaves p on the terminator; stop here, otherwise
		// the loop increment would step past the end of the string.
		if (*p == '\0') break;
		if (*p == 't') *q++ = '\t';
		else if (*p == 'n') *q++ = '\n';
		else if (*p == 'r') *q++ = '\r';
		else if (*p == '\\') *q++ = '\\';
	}
	*q = '\0';
	return s;
}
|
||||
|
||||
char *bwa_set_rg(const char *s)
|
||||
{
|
||||
char *p, *q, *r, *rg_line = 0;
|
||||
memset(bwa_rg_id, 0, 256);
|
||||
if (strstr(s, "@RG") != s) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
if (strstr(s, "\t") != NULL) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
rg_line = strdup(s);
|
||||
bwa_escape(rg_line);
|
||||
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
if (q - p + 1 > 256) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
return rg_line;
|
||||
|
||||
err_set_rg:
|
||||
free(rg_line);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Append one header line to an accumulated header string.
 *
 * Lines that are NULL or do not begin with '@' are ignored. Otherwise the
 * line is appended to hdr after a newline separator (or becomes the header
 * when hdr is NULL), and backslash escapes in the newly added part are
 * expanded.
 *
 * @param s    header line to add
 * @param hdr  current header (heap-allocated) or NULL
 * @return     the possibly reallocated header; hdr itself when s is ignored
 */
char *bwa_insert_header(const char *s, char *hdr)
{
	int offset;

	if (!s || *s != '@') return hdr;

	if (hdr == 0) {
		hdr = strdup(s);
		offset = 0;
	} else {
		offset = strlen(hdr);
		hdr = realloc(hdr, offset + strlen(s) + 2); // + separator + terminator
		hdr[offset++] = '\n';
		strcpy(hdr + offset, s);
	}
	bwa_escape(hdr + offset); // only unescape the part that was just added
	return hdr;
}
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#ifndef BWA_H_
|
||||
#define BWA_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
|
||||
#define BWA_IDX_BWT 0x1
|
||||
#define BWA_IDX_BNS 0x2
|
||||
#define BWA_IDX_PAC 0x4
|
||||
#define BWA_IDX_ALL 0x7
|
||||
|
||||
#define BWA_CTL_SIZE 0x10000
|
||||
|
||||
#define BWTALGO_AUTO 0
|
||||
#define BWTALGO_RB2 1
|
||||
#define BWTALGO_BWTSW 2
|
||||
#define BWTALGO_IS 3
|
||||
|
||||
#define BWA_DBG_QNAME 0x1
|
||||
|
||||
// A loaded BWA index: the three components plus bookkeeping for the case where
// they all live in one contiguous memory block.
typedef struct {
|
||||
bwt_t *bwt; // FM-index
|
||||
bntseq_t *bns; // information on the reference sequences
|
||||
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
|
||||
|
||||
int is_shm; // when non-zero, bwa_idx_destroy() leaves mem alone instead of freeing it
|
||||
int64_t l_mem; // size of mem in bytes, as set by bwa_mem2idx()
|
||||
uint8_t *mem; // contiguous block holding the packed index; 0 when the components are allocated separately
|
||||
} bwaidx_t;
|
||||
|
||||
// One input read as produced by bseq_read().
typedef struct {
|
||||
int l_seq, id; // l_seq: length of seq; id: position of the record within its batch
|
||||
char *name, *comment, *seq, *qual, *sam; // comment and qual are NULL when empty; sam is not set by bseq_read() (presumably filled in later with the SAM output -- confirm against callers)
|
||||
} bseq1_t;
|
||||
|
||||
extern int bwa_verbose, bwa_dbg;
|
||||
extern char bwa_rg_id[256];
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
|
||||
|
||||
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
|
||||
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size);
|
||||
|
||||
char *bwa_idx_infer_prefix(const char *hint);
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
|
||||
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
|
||||
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
||||
void bwa_idx_destroy(bwaidx_t *idx);
|
||||
int bwa_idx2mem(bwaidx_t *idx);
|
||||
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
|
||||
char *bwa_set_rg(const char *s);
|
||||
char *bwa_insert_header(const char *s, char *hdr);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
## Introduction
|
||||
|
||||
Bwakit is a self-consistent installation-free package of scripts and precompiled
|
||||
binaries, providing an end-to-end solution to read mapping. In addition to the
|
||||
basic mapping functionality implemented in bwa, bwakit is able to generate
|
||||
a proper human reference genome and to take advantage of ALT contigs, if present,
|
||||
to improve read mapping and to perform HLA typing for high-coverage human data.
|
||||
It can remap name- or coordinate-sorted BAM with read group and barcode
|
||||
information retained. Bwakit also *optionally* trims adapters (via
|
||||
[trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final
|
||||
alignment (via [samtools][smtl]).
|
||||
|
||||
Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human
|
||||
reference genomes, and `run-bwamem` which prints mapping command lines on the
|
||||
standard output that can be piped to `sh` to execute. The two scripts will call
|
||||
other programs or use data in `bwa.kit`. The following shows an example of
|
||||
how to use bwakit:
|
||||
|
||||
```sh
|
||||
# Download the bwakit-0.7.12 binary package (download link may change)
|
||||
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
|
||||
| bzip2 -dc | tar xf -
|
||||
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
|
||||
bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
|
||||
bwa.kit/bwa index hs38DH.fa # create BWA index
|
||||
# mapping
|
||||
bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh
|
||||
```
|
||||
|
||||
The last mapping command line will generate the following files:
|
||||
|
||||
* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this
|
||||
file, one read may be placed on multiple overlapping ALT contigs at the same
|
||||
time even if the read is mapped better to some contigs than others. This makes
|
||||
it possible to analyze each contig independent of others.
|
||||
|
||||
* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes.
|
||||
|
||||
* `out.hla.all`: other possible genotypes on the six HLA genes.
|
||||
|
||||
* `out.log.*`: bwa-mem, samblaster and HLA typing log files.
|
||||
|
||||
Bwakit can be [downloaded here][res]. It is only available for x86_64-linux. The
|
||||
scripts in the package are available in the [bwa/bwakit][kit] directory.
|
||||
Packaging is done manually for now.
|
||||
|
||||
## Limitations
|
||||
|
||||
* HLA typing only works for high-coverage human data. The typing accuracy can
|
||||
still be improved. We encourage researchers to develop better HLA typing tools
|
||||
based on the intermediate output of bwakit (for each HLA gene included in the
|
||||
index, bwakit writes all reads matching it in a separate file).
|
||||
|
||||
* Duplicate marking only works when all reads from a single paired-end library
|
||||
are provided as the input. This limitation is the necessary tradeoff of fast
|
||||
MarkDuplicate provided by samblaster.
|
||||
|
||||
* The adapter trimmer is chosen as it is fast, pipe friendly and does not
|
||||
discard reads. However, it is conservative and suboptimal. If this is a
|
||||
concern, it is recommended to preprocess input reads with a more sophisticated
|
||||
adapter trimmer. We also hope existing trimmers can be modified to operate on
|
||||
an interleaved FASTQ stream. We will replace trimadap once a better trimmer
|
||||
meets our needs.
|
||||
|
||||
* Bwakit can be memory demanding depending on the functionality invoked. For 30X
|
||||
human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses
|
||||
close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB.
|
||||
In the current setting, sorting uses about 10GB.
|
||||
|
||||
|
||||
## Package Contents
|
||||
```
|
||||
bwa.kit
|
||||
|-- README.md This README file.
|
||||
|-- run-bwamem *Entry script* for the entire mapping pipeline.
|
||||
|-- bwa *BWA binary*
|
||||
|-- k8 Interpreter for *.js scripts.
|
||||
|-- bwa-postalt.js Post-process alignments to ALT contigs/decoys/HLA genes.
|
||||
|-- htsbox Used by run-bwamem for shuffling BAMs and BAM=>FASTQ.
|
||||
|-- samblaster MarkDuplicates for reads from the same library. v0.1.20
|
||||
|-- samtools SAMtools for sorting and SAM=>BAM conversion. v1.1
|
||||
|-- seqtk For FASTQ manipulation.
|
||||
|-- trimadap Trim Illumina PE sequencing adapters.
|
||||
|
|
||||
|-- run-gen-ref *Entry script* for generating human reference genomes.
|
||||
|-- resource-GRCh38 Resources for generating GRCh38
|
||||
| |-- hs38DH-extra.fa Decoy and HLA gene sequences. Used by run-gen-ref.
|
||||
| `-- hs38DH.fa.alt ALT-to-GRCh38 alignment. Used by run-gen-ref.
|
||||
|
|
||||
|-- run-HLA HLA typing for sequences extracted by bwa-postalt.js.
|
||||
|-- typeHLA.sh Type one HLA-gene. Called by run-HLA.
|
||||
|-- typeHLA.js HLA typing from exon-to-contig alignment. Used by typeHLA.sh.
|
||||
|-- typeHLA-selctg.js Select contigs overlapping HLA exons. Used by typeHLA.sh.
|
||||
|-- fermi2.pl Fermi2 wrapper. Used by typeHLA.sh for de novo assembly.
|
||||
|-- fermi2 Fermi2 binary. Used by fermi2.pl.
|
||||
|-- ropebwt2 RopeBWT2 binary. Used by fermi2.pl.
|
||||
|-- resource-human-HLA Resources for HLA typing
|
||||
| |-- HLA-ALT-exons.bed Exonic regions of HLA ALT contigs. Used by typeHLA.sh.
|
||||
| |-- HLA-CDS.fa CDS of HLA-{A,B,C,DQA1,DQB1,DRB1} genes from IMGT/HLA-3.18.0.
|
||||
| |-- HLA-ALT-type.txt HLA types for each HLA ALT contig. Not used.
|
||||
| `-- HLA-ALT-idx BWA indices of each HLA ALT contig. Used by typeHLA.sh
|
||||
| `-- (...)
|
||||
|
|
||||
`-- doc BWA documentation
|
||||
|-- bwa.1 Manpage
|
||||
|-- NEWS.md Release Notes
|
||||
|-- README.md GitHub README page
|
||||
`-- README-alt.md Documentation for ALT mapping
|
||||
```
|
||||
|
||||
[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
|
||||
[sb]: https://github.com/GregoryFaust/samblaster
|
||||
[ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c
|
||||
[smtl]: http://www.htslib.org
|
||||
[kit]: https://github.com/lh3/bwa/tree/master/bwakit
|
||||
|
|
@ -0,0 +1,524 @@
|
|||
/*****************************************************************
|
||||
* The K8 Javascript interpreter is required to run this script. *
|
||||
* *
|
||||
* Source code: https://github.com/attractivechaos/k8 *
|
||||
* Binary: http://sourceforge.net/projects/lh3/files/k8/ *
|
||||
* *
|
||||
* Data file used for generating GRCh38 ALT alignments: *
|
||||
* *
|
||||
* http://sourceforge.net/projects/bio-bwa/files/ *
|
||||
*****************************************************************/
|
||||
|
||||
/******************
|
||||
*** From k8.js ***
|
||||
******************/
|
||||
|
||||
/**
 * Parse command-line options. A BSD getopt() clone in javascript.
 *
 * Scanning state is stored on the function object itself and persists
 * between calls:
 *   getopt.ind    index in args of the argument currently being scanned
 *   getopt.arg    argument of the option just returned (null if it takes none)
 *   getopt.place  offset of the next option letter inside args[getopt.ind],
 *                 or -1 when scanning must restart at a new argument
 *
 * @param args  array of command-line argument strings
 * @param ostr  option letters; a letter followed by ':' takes an argument
 * @return the option letter found; '?' for an unknown option (or a missing
 *         argument when ostr does not start with ':'); ':' for a missing
 *         argument when ostr starts with ':'; null when scanning stops at a
 *         non-option argument, at "--", at a bare '-', or at the end of args
 */
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
		// The unknown letter was the last one in this argument: move on to the
		// next argument. Without this the next call would read '' past the end
		// of the string, which indexOf() "finds" at position 0 of ostr.
		if (getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place); // argument glued to the letter, e.g. "-pfoo"
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}
|
||||
|
||||
// Reverse the content of a Bytes object in place.
Bytes.prototype.reverse = function()
{
	// walk inward from both ends, exchanging the two elements at each step
	for (var lo = 0, hi = this.length - 1; lo < hi; ++lo, --hi) {
		var held = this[lo];
		this[lo] = this[hi];
		this[hi] = held;
	}
}
|
||||
|
||||
// Reverse-complement a DNA sequence stored in a Bytes object, in place.
// The 256-entry complement table is built on first use and cached on
// Bytes.rctab; byte values that are not IUPAC letters map to 0.
Bytes.prototype.revcomp = function()
{
	var tab = Bytes.rctab;
	if (tab == null) {
		var from = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
		var to = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
		tab = [];
		for (var c = 0; c < 256; ++c) tab[c] = 0;
		for (var k = 0; k < from.length; ++k)
			tab[from.charCodeAt(k)] = to.charCodeAt(k);
		Bytes.rctab = tab;
	}
	// complement while swapping the two ends towards the middle
	var lo = 0, hi = this.length - 1;
	for (; lo < hi; ++lo, --hi) {
		var right = this[hi];
		this[hi] = tab[this[lo]];
		this[lo] = tab[right];
	}
	// odd length: the middle base stays in place but still needs complementing
	if (lo == hi) this[lo] = tab[this[lo]];
}
|
||||
|
||||
/**
 * Create an index for a list of intervals for fast interval queries; ported
 * from bedidx.c in samtools.
 *
 * @param intv  array of intervals; element [0] is the start and [1] the end
 *              (half-open) of each interval, further elements are carried
 *              along untouched. The array is sorted in place by start.
 * @param bits  the coordinate space is cut into bins of 1<<bits; default 13
 * @return a function(_b, _e) that returns the elements of intv overlapping
 *         the half-open query [_b, _e), in sorted order
 */
function intv_ovlp(intv, bits)
{
	if (typeof bits == "undefined") bits = 13;
	intv.sort(function(a,b) {return a[0]-b[0];});
	// create the index: idx[k] is the first interval (in sorted order) touching bin k
	var idx = [], max = 0;
	for (var i = 0; i < intv.length; ++i) {
		var b = intv[i][0]>>bits;
		var e = (intv[i][1]-1)>>bits;
		for (var j = b; j <= e; ++j)
			if (idx[j] == null) idx[j] = i;
		max = max > e? max : e;
	}
	// closure
	return function(_b, _e) {
		var x = _b >> bits;
		if (x > max) return [];
		var off = idx[x];
		if (off == null) {
			// Nothing touches the bin holding _b, so fall back to the closest
			// populated bin BELOW it (as bedidx.c does). Starting the search
			// from the bin of _e instead can land on a later bin and skip
			// intervals that lie between _b and that bin.
			var i;
			for (i = x - 1; i >= 0; --i)
				if (idx[i] != null) break;
			off = i < 0? 0 : idx[i];
		}
		var ovlp = [];
		for (var i = off; i < intv.length && intv[i][0] < _e; ++i)
			if (intv[i][1] > _b) ovlp.push(intv[i]);
		return ovlp;
	}
}
|
||||
|
||||
var re_cigar = /(\d+)([MIDSHN])/g;
|
||||
|
||||
/******************************
|
||||
*** Generate ALT alignment ***
|
||||
******************************/
|
||||
|
||||
// Given a position on ALT and the ALT-to-REF CIGAR (an array of [op, len]
// pairs), return the matching offset on REF counted from the start of the
// alignment. A position inside an insertion maps to the REF offset reached so
// far; a position inside a clip, or beyond the CIGAR, yields -1.
function cigar2pos(cigar, pos)
{
	var ref = 0, qry = 0; // lengths consumed so far on REF and on ALT
	for (var k = 0; k < cigar.length; ++k) {
		var op = cigar[k][0], n = cigar[k][1];
		var inside = (qry <= pos && pos < qry + n); // does this operation cover pos on ALT?
		switch (op) {
		case 'M':
			if (inside) return ref + (pos - qry);
			ref += n; qry += n;
			break;
		case 'D':
			ref += n;
			break;
		case 'I':
			if (inside) return ref;
			qry += n;
			break;
		case 'S':
		case 'H':
			if (inside) return -1;
			qry += n;
			break;
		}
	}
	return -1;
}
|
||||
|
||||
// Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5]
// ($opt supplies the scoring parameters a, b, o and e).
// Return an object with: ctg, start/end (0-based, end exclusive), rev, cigar,
// NM (raised to at least the total gap length), hard (true if hard clipped),
// score (recomputed from the CIGAR and NM) and l_query.
function parse_hit(s, opt)
{
	var strand_pos = s[1];
	var h = {
		ctg: s[0],
		start: parseInt(strand_pos.substr(1)) - 1,
		rev: (strand_pos.charAt(0) == '-'),
		cigar: s[2],
		NM: parseInt(s[3]),
		hard: false
	};

	// tally the CIGAR operations
	var l_match = 0, l_skip = 0, l_clip = 0;
	var l_ins = 0, n_ins = 0, l_del = 0, n_del = 0;
	var m;
	while ((m = re_cigar.exec(h.cigar)) != null) {
		var l = parseInt(m[1]);
		switch (m[2]) {
		case 'M': l_match += l; break;
		case 'D': ++n_del; l_del += l; break;
		case 'I': ++n_ins; l_ins += l; break;
		case 'N': l_skip += l; break;
		case 'H': h.hard = true; l_clip += l; break;
		case 'S': l_clip += l; break;
		}
	}

	var l_gap = l_del + l_ins;
	h.end = h.start + l_match + l_del + l_skip;
	if (!(h.NM > l_gap)) h.NM = l_gap; // NM can never be smaller than the gap length
	var n_mm = h.NM - l_del - l_ins;   // edits that are not gaps, i.e. mismatches
	var raw = opt.a * l_match - (opt.a + opt.b) * n_mm - opt.o * (n_del + n_ins) - opt.e * l_gap;
	h.score = Math.floor(raw / opt.a + .499);
	h.l_query = l_match + l_ins + l_clip;
	return h;
}
|
||||
|
||||
// Output the buffered alignments of one read: print every record in buf2 as a
// tab-joined line and, when HLA output files are open (fp_hla != null), append
// the read taken from the first record as FASTQ to the file of every gene
// listed in hla.
function print_buffer(buf2, fp_hla, hla) // output alignments
{
	if (buf2.length == 0) return;
	for (var i = 0; i < buf2.length; ++i)
		print(buf2[i].join("\t"));
	if (fp_hla != null) {
		// read name suffixed with the two read-number flag bits (0x40/0x80) and the strand
		var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+');
		for (var x in hla) {
			// only write when a file was opened for this gene (a stray ';'
			// used to make this check a no-op)
			if (fp_hla[x] != null)
				fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n');
		}
	}
}
|
||||
|
||||
// Collect reads hit to HLA genes: query the interval index of contig $ctg
// (idx[ctg] is an overlap function) with [start, end) and, for every returned
// interval whose name (element 2) looks like "HLA-<gene>*<digits>", set
// hla[<"HLA-<gene>">] = true.
function collect_hla_hits(idx, ctg, start, end, hla)
{
	var query = idx[ctg];
	if (query == null) return; // no interval index for this contig
	var re_gene = /^(HLA-[^\s\*]+)\*\d+/;
	var found = query(start, end);
	for (var k = 0; k < found.length; ++k) {
		var m = re_gene.exec(found[k][2]);
		if (m != null) hla[m[1]] = true;
	}
}
|
||||
|
||||
// Entry point of the script.
//
// Reads the ALT-to-REF alignment (first file), then streams the read
// alignment SAM (second file, or the default File() when absent). For every
// alignment line having ALT hits (reported hit plus XA hits) it lifts the hits
// to the primary assembly, groups overlapping hits, re-estimates mapQ, adjusts
// the mapQ of previously buffered lines of the same read, and emits extra
// supplementary lines for the ALT hits in the reported group. With -p, reads
// overlapping HLA genes are also written to per-gene FASTQ files.
//
// $args is the command-line argument array; see the usage message below.
function bwa_postalt(args)
{
	var version = "r985";
	var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 };

	while ((c = getopt(args, 'vp:r:')) != null) {
		if (c == 'p') opt.pre = getopt.arg;
		else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
		else if (c == 'v') { print(version); exit(0); }
	}
	if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;

	if (args.length == getopt.ind) {
		print("");
		print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
		print("Options: -p STR prefix of output files containting sequences matching HLA genes [null]");
		print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
		print(" -v show version number");
		print("");
		print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
		print(" the primary assembly, groups them and then estimates mapQ across groups. If");
		print(" a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
		print(" smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
		print(" multiple ALT hits are lifted to the same position, they will yield new SAM");
		print(" lines with the same mapQ.");
		print("");
		exit(1);
	}

	var aux = new Bytes(); // used for reverse and reverse complement
	var buf = new Bytes(); // line reading buffer

	// read ALT-to-REF alignment
	// intv_alt: per ALT contig, intervals in ALT coordinates; intv_pri: per primary contig, intervals in REF coordinates
	var intv_alt = {}, intv_pri = {}, hla_ctg = {}, is_alt = {}, hla_chr = null;
	var file = new File(args[getopt.ind]);
	while (file.readline(buf) >= 0) {
		var line = buf.toString();
		if (line.charAt(0) == '@') continue;
		var t = line.split("\t");
		if (t.length < 11) continue; // incomplete lines
		is_alt[t[0]] = true;
		var pos = parseInt(t[3]) - 1;
		var flag = parseInt(t[1]);
		if ((flag&4) || t[2] == '*') continue;
		var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
		if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
			if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
			++hla_ctg[m[1]];
			hla_chr = t[2];
		}
		while ((m = re_cigar.exec(t[5])) != null) {
			var l = parseInt(m[1]);
			cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
			if (m[2] == 'M') l_qaln += l, l_tlen += l;
			else if (m[2] == 'I') l_qaln += l;
			else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
			else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
		}
		var j = flag&16? cigar.length-1 : 0;
		var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
		if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
		// NOTE(review): pos is already 0-based here, so storing "pos - 1" as the lift offset looks like a possible off-by-one -- confirm before changing
		intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
		if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
		intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
	}
	file.close();
	var idx_alt = {}, idx_pri = {};
	for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
	for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);

	// initialize the list of HLA contigs
	var fp_hla = null;
	if (opt.pre) {
		fp_hla = {};
		for (var h in hla_ctg)
			fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
	}

	// process SAM
	var buf2 = [], hla = {};
	file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
	while (file.readline(buf) > 0) {
		var m, line = buf.toString();

		if (line.charAt(0) == '@') { // print and then skip the header line
			print(line);
			continue;
		}

		var t = line.split("\t");
		t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]);

		// print bufferred reads
		if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
			print_buffer(buf2, fp_hla, hla);
			buf2 = [], hla = {};
		}

		// skip unmapped lines
		if (t[1]&4) {
			buf2.push(t);
			continue;
		}

		// parse the reported hit
		var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
		var flag = t[1];
		var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
		if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla);

		if (h.hard) { // the following does not work with hard clipped alignments
			buf2.push(t);
			continue;
		}
		var hits = [h];

		// parse hits in the XA tag
		if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
			var XA_strs = m[1].split(";");
			for (var i = 0; i < XA_strs.length; ++i)
				if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
					hits.push(parse_hit(XA_strs[i].split(","), opt));
		}

		// check if there are ALT hits
		var has_alt = false;
		for (var i = 0; i < hits.length; ++i)
			if (is_alt[hits[i].ctg] != null) {
				has_alt = true;
				break;
			}
		if (!has_alt) {
			buf2.push(t);
			continue;
		}

		// lift mapping positions to the primary assembly
		var n_rpt_lifted = 0, rpt_lifted = null;
		for (var i = 0; i < hits.length; ++i) {
			var a, h = hits[i];

			if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
				continue;

			// find the approximate position on the primary assembly
			var lifted = [];
			for (var j = 0; j < a.length; ++j) {
				var s, e;
				if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
					s = cigar2pos(a[j][6], h.start);
					e = cigar2pos(a[j][6], h.end - 1) + 1;
				} else {
					s = cigar2pos(a[j][6], a[j][2] - h.end);
					e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
				}
				// e is "cigar2pos() + 1", so a failed lookup (-1) shows up as 0, not as a negative value
				if (s < 0 || e <= 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
				s += a[j][5]; e += a[j][5];
				lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
				if (i == 0) ++n_rpt_lifted;
			}
			if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
			if (lifted.length) hits[i].lifted = lifted;
		}

		// prepare for hits grouping
		for (var i = 0; i < hits.length; ++i) { // set keys for sorting
			if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
				hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
			else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
			hits[i].i = i; // keep the original index
		}

		// group hits based on the lifted positions on non-ALT sequences
		if (hits.length > 1) {
			hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
			var last_chr = null, end = 0, g = -1;
			for (var i = 0; i < hits.length; ++i) {
				if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
				else if (hits[i].pstart >= end) ++g;
				hits[i].g = g;
				end = end > hits[i].pend? end : hits[i].pend;
			}
		} else hits[0].g = 0;

		// find the index and group id of the reported hit; find the size of the reported group
		var reported_g = null, reported_i = null, n_group0 = 0;
		if (hits.length > 1) {
			for (var i = 0; i < hits.length; ++i)
				if (hits[i].i == 0)
					reported_g = hits[i].g, reported_i = i;
			for (var i = 0; i < hits.length; ++i)
				if (hits[i].g == reported_g)
					++n_group0;
		} else {
			if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
				buf2.push(t);
				continue;
			}
			reported_g = reported_i = 0, n_group0 = 1;
		}

		// re-estimate mapping quality if necessary
		var mapQ, ori_mapQ = t[4];
		if (n_group0 > 1) {
			var group_max = []; // per group: [best score in the group, group id]
			for (var i = 0; i < hits.length; ++i) {
				var g = hits[i].g;
				if (group_max[g] == null || group_max[g][0] < hits[i].score)
					group_max[g] = [hits[i].score, g];
			}
			if (group_max.length > 1)
				group_max.sort(function(x,y) {return y[0]-x[0]});
			if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
				mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
			} else mapQ = 0;
			mapQ = mapQ < 60? mapQ : 60;
			if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
			else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
		} else mapQ = t[4];

		// find out whether the read is overlapping HLA genes
		if (hits[reported_i].pctg == hla_chr) {
			var rpt_start = 1<<30, rpt_end = 0;
			for (var i = 0; i < hits.length; ++i) {
				var h = hits[i];
				if (h.g == reported_g) {
					rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
					rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
				}
			}
			collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla);
		}

		// adjust the mapQ of the primary hits
		if (n_rpt_lifted <= 1) {
			var l = n_rpt_lifted == 1? rpt_lifted : null;
			for (var i = 0; i < buf2.length; ++i) {
				var s = buf2[i], is_ovlp = true;
				if (l != null) {
					if (l[0] != s[2]) is_ovlp = false; // different chr
					else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
					else {
						// reference span of the buffered line: it must be derived from
						// that line's own CIGAR (s[5]), not from the current line's
						var start = s[3] - 1, end = start;
						while ((m = re_cigar.exec(s[5])) != null)
							if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
								end += parseInt(m[1]);
						if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
					}
				} else is_ovlp = false;
				// get the "pa" tag if present
				var om = -1, pa = 10.;
				for (var j = 11; j < s.length; ++j)
					if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
						om = parseInt(m[1]);
					else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
						pa = parseFloat(m[1]);
				if (is_ovlp) { // overlapping the lifted hit
					if (om > 0) s[4] = om;
					s[4] = s[4] < mapQ? s[4] : mapQ;
				} else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
					if (om < 0) s.push("om:i:" + s[4]);
					s[4] = 0;
				}
			}
		}

		// generate lifted_str
		for (var i = 0; i < hits.length; ++i) {
			if (hits[i].lifted && hits[i].lifted.length) {
				var u = '', lifted = hits[i].lifted;
				for (var j = 0; j < lifted.length; ++j)
					u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
				hits[i].lifted_str = u;
			}
		}

		// stage the reported hit
		t[4] = mapQ;
		if (n_group0 > 1) t.push("om:i:"+ori_mapQ);
		if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
		buf2.push(t);

		// stage the hits generated from the XA tag
		var rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence
		var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null;
		for (var i = 0; i < hits.length; ++i) {
			if (hits[i].g != reported_g || i == reported_i) continue;
			if (idx_alt[hits[i].ctg] == null) continue;
			var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]];
			if (t[6] == '=' && s[2] != t[2]) s[6] = t[2];
			// print sequence/quality and set the rev flag
			if (hits[i].rev == hits[reported_i].rev) {
				s.push(t[9], t[10]);
				s[1] = flag | 0x800;
			} else { // we need to write the reverse sequence
				if (rs == null || rq == null) {
					aux.length = 0;
					aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
					aux.set(t[10],0); aux.reverse(); rq = aux.toString();
				}
				s.push(rs, rq);
				s[1] = (flag ^ 0x10) | 0x800;
			}
			s.push("NM:i:" + hits[i].NM);
			if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
			if (rg != null) s.push(rg);
			buf2.push(s);
		}
	}
	print_buffer(buf2, fp_hla, hla);
	file.close();
	if (fp_hla != null)
		for (var h in fp_hla)
			fp_hla[h].close();

	buf.destroy();
	aux.destroy();
}
|
||||
|
||||
bwa_postalt(arguments);
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
#!/bin/bash
# Run typeHLA.sh on every <prefix>.HLA-*.fq file, then print the first GT line
# of each resulting <prefix>.HLA-*.gt file with the leading "GT" replaced by
# the prefix.
#
# Usage: run-HLA [-A] <prefix>
#   -A  passed through unchanged to typeHLA.sh

ctg_opt=""
if [ $# -gt 1 ] && [ "$1" = '-A' ]; then
	ctg_opt="-A"
	shift
fi

if [ $# -eq 0 ]; then
	echo "Usage: $0 <prefix>"
	exit 1
fi

for f in "$1".HLA-*.fq; do
	[ -e "$f" ] || continue # the glob matched nothing and was left as a literal
	gene=$(echo "$f" | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/')
	echo -e "\n*** Processing gene $gene...\n" >&2
	# $ctg_opt is deliberately unquoted: when empty it must expand to no argument at all
	"$(dirname "$0")"/typeHLA.sh $ctg_opt "$1" "$gene"
done

# first GT line of each genotype file, in glob (sorted) order
for g in "$1".HLA-*.gt; do
	[ -e "$g" ] || continue
	grep ^GT "$g" | head -1
done | sed "s,^GT,$1,"
|
||||
|
|
@ -0,0 +1,187 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Std;
|
||||
|
||||
# Command-line options; a single thread unless -t is given.
my %opts = (t=>1);
getopts("MPSadskHo:R:x:t:", \%opts);
# The option letter accepted above is the upper-case -P (and the warning text
# further down calls it "-P"), but the script tests $opts{p}. Mirror the value
# so that -P actually takes effect.
$opts{p} = $opts{P} if defined($opts{P});
|
||||
|
||||
die('
|
||||
Usage: run-bwamem [options] <idxbase> <file1> [file2]
|
||||
|
||||
Options: -o STR prefix for output files [inferred from input]
|
||||
-R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
|
||||
-x STR read type: pacbio, ont2d or intractg [default]
|
||||
intractg: intra-species contig (kb query, highly similar)
|
||||
pacbio: pacbio subreads (~10kb query, high error rate)
|
||||
ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
|
||||
-t INT number of threads [1]
|
||||
|
||||
-H apply HLA typing
|
||||
-a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
|
||||
-d mark duplicate (via samblaster)
|
||||
-S for BAM input, don\'t shuffle
|
||||
-s sort the output alignment (via samtools; requring more RAM)
|
||||
-k keep temporary files generated by typeHLA
|
||||
-M mark shorter split hits as secondary
|
||||
|
||||
Examples:
|
||||
|
||||
* Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
|
||||
|
||||
run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
|
||||
|
||||
Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
|
||||
with the quality of input. It is only intended for research purpose, not for diagnostic.
|
||||
|
||||
* Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
|
||||
sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
|
||||
the two types. Specifying -R stops read group transfer.
|
||||
|
||||
run-bwamem -sao prefix hs38DH.fa old-srt.bam
|
||||
|
||||
Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
|
||||
mapping pipeline better. It is conservative and suboptimal. A more sophisticated
|
||||
trimmer is recommended if this becomes a concern.
|
||||
|
||||
* Remap name-grouped BAM and mark duplicates:
|
||||
|
||||
run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
|
||||
|
||||
Note: streamed duplicate marking requires all reads from a single paired-end library
|
||||
to be aligned at the same time.
|
||||
|
||||
Output files:
|
||||
|
||||
{-o}.aln.bam - final alignment
|
||||
{-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
|
||||
{-o}.hla.all - additional HLA genotypes consistent with data
|
||||
{-o}.log.* - log files
|
||||
|
||||
') if @ARGV < 2;
|
||||
|
||||
my $idx = $ARGV[0];
|
||||
|
||||
my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
|
||||
my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
|
||||
$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
|
||||
die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
|
||||
|
||||
die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
|
||||
unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
|
||||
|
||||
# SAM/BAM input carries both ends of a pair in a single file, so any further
# sequence file is dropped.
if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
	warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
	# Keep <idxbase> and the first input file only. ("@ARGV = 2" would instead
	# replace the whole argument list with the single element 2.)
	splice(@ARGV, 2);
}
|
||||
|
||||
if (defined($opts{p}) && @ARGV >= 3) {
|
||||
warn("WARNING: option -P is ignored as there are two input sequence files.\n");
|
||||
delete $opts{p};
|
||||
}
|
||||
|
||||
my $prefix;
|
||||
if (defined $opts{o}) {
|
||||
$prefix = $opts{o};
|
||||
} elsif (@ARGV >= 3) {
|
||||
my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
|
||||
my $i;
|
||||
for ($i = 0; $i < $len; ++$i) {
|
||||
last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
|
||||
}
|
||||
$prefix = substr($ARGV[1], 0, $i) if $i > 0;
|
||||
} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
|
||||
$prefix = $1;
|
||||
}
|
||||
die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
|
||||
|
||||
my $size = 0;
|
||||
my $comp_ratio = 3.;
|
||||
for my $f (@ARGV[1..$#ARGV]) {
|
||||
my @a = stat($f);
|
||||
my $s = $a[7];
|
||||
die("ERROR: failed to read file $f\n") if !defined($s);
|
||||
$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
|
||||
$size += int($s) + 1;
|
||||
}
|
||||
|
||||
my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
|
||||
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
|
||||
|
||||
if (defined($opts{x})) {
|
||||
delete($opts{d}); delete($opts{a}); delete $opts{p};
|
||||
}
|
||||
|
||||
# for BAM input, find @RG header lines
|
||||
my @RG_lines = ();
|
||||
if ($is_bam && !defined($opts{R})) {
|
||||
my $fh;
|
||||
open($fh, "$root/samtools view -H $ARGV[1] |") || die;
|
||||
while (<$fh>) {
|
||||
chomp;
|
||||
if (/^\@RG\t/) {
|
||||
s/\t/\\t/g;
|
||||
push(@RG_lines, "-H'$_'");
|
||||
}
|
||||
}
|
||||
close($fh);
|
||||
}
|
||||
|
||||
warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0;
|
||||
|
||||
my $cmd = '';
|
||||
if ($is_bam) {
|
||||
my $cmd_sam2bam = "cat $ARGV[1] \\\n";
|
||||
my $ntmps = int($size / 4e9) + 1;
|
||||
my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
|
||||
my $bam2fq_opt = @RG_lines > 0? " -t" : "";
|
||||
my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n";
|
||||
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
|
||||
} elsif (@ARGV >= 3) {
|
||||
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
|
||||
} else {
|
||||
$cmd = "cat $ARGV[1] \\\n";
|
||||
}
|
||||
|
||||
my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . (defined($opts{M})? "-M " : "");
|
||||
$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
|
||||
|
||||
$cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
|
||||
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
|
||||
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
|
||||
|
||||
my $has_hla = 0;
|
||||
if (-f "$ARGV[0].alt" && !defined($opts{P})) {
|
||||
my $fh;
|
||||
open($fh, "$ARGV[0].alt") || die;
|
||||
while (<$fh>) {
|
||||
$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
|
||||
}
|
||||
close($fh);
|
||||
my $hla_pre = $has_hla? "-p $prefix.hla " : "";
|
||||
$cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
|
||||
}
|
||||
|
||||
my $t_sort = $opts{t} < 4? $opts{t} : 4;
|
||||
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - -o $prefix.aln.bam;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";
|
||||
|
||||
if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
|
||||
$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
|
||||
$cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
|
||||
$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
|
||||
}
|
||||
|
||||
print $cmd;
|
||||
|
||||
# Look up an executable called $file in a colon-separated directory list.
# The list is the second argument when one is passed, otherwise $ENV{PATH}.
# Returns "dir/file" for the first directory holding an executable entry of
# that name, and an empty return when the list is undefined or nothing matches.
sub which
{
	my ($file, @rest) = @_;
	my $path = @rest ? $rest[0] : $ENV{PATH};
	return unless defined($path);
	for my $dir (split(":", $path)) {
		$dir =~ s/\/$//;	# drop a single trailing slash before joining
		my $candidate = "$dir/$file";
		return $candidate if -x $candidate;
	}
	return;
}
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
#!/bin/bash
|
||||
|
||||
root=`dirname $0`
|
||||
|
||||
url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
|
||||
url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>"
|
||||
echo "Analysis sets:"
|
||||
echo " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
|
||||
echo " hs38a hs38 plus ALT contigs"
|
||||
echo " hs38DH hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)"
|
||||
echo " hs37 primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome"
|
||||
echo " hs37d5 hs37 plus decoy contigs (used by 1000g phase 3)"
|
||||
echo ""
|
||||
echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional"
|
||||
echo " sequences and ALT-to-REF mapping included in the bwa.kit package."
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# Download and assemble the requested analysis set into $1.fa. The ALT-aware
# sets (hs38DH, hs38a) also get a $1.fa.alt file unless one already exists.
# "$1" is quoted so that an empty argument is reported as an unknown build
# instead of breaking the test expression.
if [ "$1" == "hs38DH" ]; then
	(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38DH-extra.fa) > $1.fa
	[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38DH.fa.alt $1.fa.alt
elif [ "$1" == "hs38a" ]; then
	wget -O- $url38 | gzip -dc > $1.fa
	[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38DH.fa.alt > $1.fa.alt
elif [ "$1" == "hs38" ]; then
	# keep only the records whose header does not contain "_alt"
	wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
elif [ "$1" == "hs37d5" ]; then
	wget -O- $url37d5 | gzip -dc > $1.fa 2>/dev/null
elif [ "$1" == "hs37" ]; then
	# keep everything except the record whose header starts with ">hs37d5"
	wget -O- $url37d5 | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > $1.fa
else
	echo "ERROR: unknown genome build"
	# Nothing was generated: stop with a failure status rather than falling
	# through to the "bwa index" hint for a file that does not exist.
	exit 1
fi
|
||||
|
||||
[ ! -f $1.fa.bwt ] && echo -e "\nPlease run 'bwa index $1.fa'...\n"
|
||||
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
var min_ovlp = 30;
|
||||
|
||||
if (arguments.length < 3) {
|
||||
print("Usage: k8 selctg.js <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp="+min_ovlp+"]");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]);
|
||||
var gene = arguments[0];
|
||||
|
||||
var buf = new Bytes();
|
||||
|
||||
var h = {};
|
||||
var file = new File(arguments[1]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (t[3] != gene) continue;
|
||||
if (h[t[0]] == null) h[t[0]] = [];
|
||||
h[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
|
||||
}
|
||||
file.close();
|
||||
|
||||
var s = {}, re = /(\d+)([MIDSHN])/g;
|
||||
file = new File(arguments[2]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var line = buf.toString();
|
||||
var m, t = line.split("\t");
|
||||
var x = h[t[2]];
|
||||
if (x == null) continue;
|
||||
|
||||
var start = parseInt(t[3]) - 1, end = start;
|
||||
while ((m = re.exec(t[5])) != null) // parse CIGAR to get the end position
|
||||
if (m[2] == 'M' || m[2] == 'D')
|
||||
end += parseInt(m[1]);
|
||||
|
||||
var max_ovlp = 0;
|
||||
for (var i = 0; i < x.length; ++i) {
|
||||
var max_left = x[i][0] > start? x[i][0] : start;
|
||||
var min_rght = x[i][1] < end ? x[i][1] : end;
|
||||
max_ovlp = max_ovlp > min_rght - max_left? max_ovlp : min_rght - max_left;
|
||||
}
|
||||
|
||||
var AS = null, XS = null;
|
||||
if ((m = /AS:i:(\d+)/.exec(line)) != null) AS = parseInt(m[1]);
|
||||
if ((m = /XS:i:(\d+)/.exec(line)) != null) XS = parseInt(m[1]);
|
||||
|
||||
if (s[t[0]] == null) s[t[0]] = [];
|
||||
s[t[0]].push([AS, XS, max_ovlp]);
|
||||
}
|
||||
file.close();
|
||||
|
||||
buf.destroy();
|
||||
|
||||
// Print the name of every contig that passes the filter. A contig is rejected
// when, among its hits tied for the highest AS, the first hit after sorting
// overlaps the target exons by less than min_ovlp or any tied hit has AS == XS.
for (var ctg_name in s) {
	var hits = s[ctg_name];
	hits.sort(function(p, q) { return q[0] - p[0]; }); // descending AS
	var keep = true;
	for (var k = 0; k < hits.length && hits[k][0] == hits[0][0]; ++k) {
		if (hits[0][2] < min_ovlp || hits[k][0] == hits[k][1]) keep = false;
	}
	if (keep) print(ctg_name);
}
|
||||
|
|
@ -0,0 +1,496 @@
|
|||
/*****************************************************************
|
||||
* The K8 Javascript interpreter is required to run this script. *
|
||||
* *
|
||||
* Source code: https://github.com/attractivechaos/k8 *
|
||||
* Binary: http://sourceforge.net/projects/lh3/files/k8/ *
|
||||
*****************************************************************/
|
||||
|
||||
var getopt = function(args, ostr) {
|
||||
var oli; // option letter list index
|
||||
if (typeof(getopt.place) == 'undefined')
|
||||
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
|
||||
if (getopt.place == -1) { // update scanning pointer
|
||||
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
|
||||
++getopt.ind;
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
|
||||
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
|
||||
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
|
||||
if (getopt.place < 0) ++getopt.ind;
|
||||
return '?';
|
||||
}
|
||||
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
|
||||
getopt.arg = null;
|
||||
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
|
||||
} else { // need an argument
|
||||
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
|
||||
getopt.arg = args[getopt.ind].substr(getopt.place);
|
||||
else if (args.length <= ++getopt.ind) { // no arg
|
||||
getopt.place = -1;
|
||||
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
|
||||
return '?';
|
||||
} else getopt.arg = args[getopt.ind]; // white space
|
||||
getopt.place = -1;
|
||||
++getopt.ind;
|
||||
}
|
||||
return optopt;
|
||||
}
|
||||
|
||||
/************************
|
||||
* Command line parsing *
|
||||
************************/
|
||||
|
||||
var ver = "r19";
|
||||
var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false;
|
||||
|
||||
// parse command line options
|
||||
while ((c = getopt(arguments, "vdl:n:f:")) != null) {
|
||||
if (c == 'l') thres_len = parseInt(getopt.arg);
|
||||
else if (c == 'n') thres_nm = parseInt(getopt.arg);
|
||||
else if (c == 'd') dbg = true;
|
||||
else if (c == 'f') thres_frac = parseFloat(getopt.arg);
|
||||
else if (c == 'v') { print(ver); exit(0); }
|
||||
}
|
||||
// No positional argument left after option parsing: print the help text and
// exit with status 1. Each bracketed value is the current setting of the
// variable that the corresponding option assigns.
if (arguments.length == getopt.ind) {
	print("");
	print("Usage: k8 typeHLA.js [options] <exon-to-contig.sam>\n");
	print("Options: -n INT drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]");
	print(" -l INT drop a contig if its match too short ["+thres_len+"]");
	// -f assigns thres_frac, so report thres_frac here (not thres_ratio, which no option sets)
	print(" -f FLOAT drop inconsistent contigs if their length <FLOAT fraction of total length ["+thres_frac.toFixed(2)+"]");
	print(" -d output extra info for debugging");
	print(" -v show version number");
	print("");
	print("Note: The output is TAB delimited with each GT line consisting of allele1, allele2,");
	print(" #mismatches/gaps on primary exons, #mismatches/gaps on other exons and #exons");
	print(" used in typing. If unusure, use the first GT line as the final genotype.\n");
	exit(1);
}
|
||||
|
||||
/*********************************
|
||||
* Read gene-to-contig alignment *
|
||||
*********************************/
|
||||
|
||||
var file = new File(arguments[getopt.ind]);
|
||||
var buf = new Bytes();
|
||||
var re_cigar = /(\d+)([MIDSH])/g;
|
||||
|
||||
var len = {}, list = [], gcnt = [];
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, mm, line = buf.toString();
|
||||
var t = line.split("\t");
|
||||
var flag = parseInt(t[1]);
|
||||
// SAM header
|
||||
if (t[0].charAt(0) == '@') {
|
||||
if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null)
|
||||
len[mm[1]] = parseInt(m[1]);
|
||||
continue;
|
||||
}
|
||||
// parse gene name and exon number
|
||||
var gene = null, exon = null;
|
||||
if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) {
|
||||
gene = m[1], exon = parseInt(m[2]) - 1;
|
||||
if (gcnt[exon] == null) gcnt[exon] = {};
|
||||
gcnt[exon][gene] = true;
|
||||
}
|
||||
if (gene == null || exon == null || t[2] == '*') continue;
|
||||
// parse clipping and aligned length
|
||||
var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0];
|
||||
while ((m = re_cigar.exec(t[5])) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
if (m[2] == 'M') x += l, te += l;
|
||||
else if (m[2] == 'I') x += l;
|
||||
else if (m[2] == 'D') te += l;
|
||||
else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l;
|
||||
}
|
||||
var tl = len[t[2]];
|
||||
var left = ts < clip[0]? ts : clip[0];
|
||||
var right = tl - te < clip[1]? tl - te : clip[1];
|
||||
var qs, qe, ql = clip[0] + x + clip[1];
|
||||
if (flag & 16) qs = clip[1], qe = ql - clip[0];
|
||||
else qs = clip[0], qe = ql - clip[1];
|
||||
var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
|
||||
list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
|
||||
/**************************************
|
||||
* Prepare data structures for typing *
|
||||
**************************************/
|
||||
|
||||
// identify the primary exons, the exons associated with most genes
|
||||
var pri_exon = [], n_pri_exons;
|
||||
{
|
||||
var cnt = [], max = 0;
|
||||
// count the number of genes per exon and track the max
|
||||
for (var e = 0; e < gcnt.length; ++e) {
|
||||
if (gcnt[e] != null) {
|
||||
var c = 0, h = gcnt[e];
|
||||
for (var x in h) ++c;
|
||||
cnt[e] = c;
|
||||
max = max > c? max : c;
|
||||
} else cnt[e] = 0;
|
||||
}
|
||||
warn("- Number of genes for each exon: [" +cnt.join(",") + "]");
|
||||
// find primary exons
|
||||
var pri_list = [];
|
||||
for (var e = 0; e < cnt.length; ++e) {
|
||||
if (cnt[e] == max) pri_list.push(e + 1);
|
||||
pri_exon[e] = cnt[e] == max? 1 : 0;
|
||||
}
|
||||
warn("- List of primary exon(s): ["+pri_list.join(",")+"]");
|
||||
n_pri_exons = pri_list.length;
|
||||
}
|
||||
|
||||
// convert strings to integers (for performance)
|
||||
var ghash = {}, glist = [], chash = {}, clist = [], elist = [];
|
||||
for (var i = 0; i < list.length; ++i) {
|
||||
if (ghash[list[i][1]] == null) {
|
||||
ghash[list[i][1]] = glist.length;
|
||||
glist.push(list[i][1]);
|
||||
}
|
||||
if (chash[list[i][0]] == null) {
|
||||
chash[list[i][0]] = clist.length;
|
||||
clist.push(list[i][0]);
|
||||
}
|
||||
var g = ghash[list[i][1]];
|
||||
if (elist[g] == null) elist[g] = {};
|
||||
elist[g][list[i][2]] = true;
|
||||
}
|
||||
|
||||
// extract the 3rd and 4th digits
|
||||
var gsub = [], gsuf = [];
|
||||
for (var i = 0; i < glist.length; ++i) {
|
||||
var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]);
|
||||
gsub[i] = parseInt(m[1]);
|
||||
gsuf[i] = /[A-Z]$/.test(glist[i])? 1 : 0;
|
||||
}
|
||||
|
||||
/*************************************************
|
||||
* Collect genes with perfect matches on primary *
|
||||
*************************************************/
|
||||
|
||||
// collect exons with fully covered by perfect match(es)
|
||||
var perf_exons = [];
|
||||
|
||||
// Record a gene as perfectly covering an exon.
//   matches: array of [query start, query end, query length, penalty] for one
//            gene/exon pair; sorted in place by query start
//   last:    [gene name, exon index] that the matches belong to
// Hits with a positive penalty are ignored. The remaining intervals are merged
// and their total length summed; when that sum equals the query length of the
// first sorted hit, the gene's integer id (ghash) is appended to
// perf_exons[exon index].
function push_perf_exons(matches, last)
{
	matches.sort(function(p, q) { return p[0] - q[0]; });
	var covered = 0, seg_beg = 0, seg_end = 0;
	for (var k = 0; k < matches.length; ++k) {
		var hit = matches[k];
		if (hit[3] > 0) continue;
		if (hit[0] <= seg_end) { // overlaps or touches the open segment: extend it
			seg_end = seg_end > hit[1]? seg_end : hit[1];
		} else { // gap: close the open segment and start a new one
			covered += seg_end - seg_beg;
			seg_beg = hit[0];
			seg_end = hit[1];
		}
	}
	covered += seg_end - seg_beg;
	if (matches[0][2] != covered) return;
	var exon = last[1];
	if (perf_exons[exon] == null) perf_exons[exon] = [];
	perf_exons[exon].push(ghash[last[0]]);
}
|
||||
|
||||
var last = [null, -1], matches = [];
|
||||
for (var i = 0; i < list.length; ++i) {
|
||||
var li = list[i];
|
||||
if (last[0] != li[1] || last[1] != li[2]) {
|
||||
if (matches.length) push_perf_exons(matches, last);
|
||||
matches = [];
|
||||
last = [li[1], li[2]];
|
||||
}
|
||||
matches.push([li[7], li[8], li[9], li[5]+li[6]]);
|
||||
}
|
||||
if (matches.length) push_perf_exons(matches, last);
|
||||
|
||||
// for each gene, count how many primary exons are perfect
|
||||
var pg_aux_cnt = {};
|
||||
for (var e = 0; e < perf_exons.length; ++e) {
|
||||
if (!pri_exon[e]) continue;
|
||||
var pe = perf_exons[e];
|
||||
var n = pe? pe.length : 0;
|
||||
for (var i = 0; i < n; ++i) {
|
||||
var g = pe[i];
|
||||
if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1;
|
||||
else ++pg_aux_cnt[g];
|
||||
}
|
||||
}
|
||||
|
||||
// find genes with perfect matches on the primary exons
|
||||
var perf_genes = [];
|
||||
for (var g in pg_aux_cnt)
|
||||
if (pg_aux_cnt[g] == n_pri_exons)
|
||||
perf_genes.push(parseInt(g));
|
||||
warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)");
|
||||
|
||||
var h_perf_genes = {};
|
||||
for (var i = 0; i < perf_genes.length; ++i) {
|
||||
if (dbg) print("PG", glist[perf_genes[i]]);
|
||||
h_perf_genes[perf_genes[i]] = true;
|
||||
}
|
||||
|
||||
/*******************
|
||||
* Filter hit list *
|
||||
*******************/
|
||||
|
||||
// reorganize hits to exons
|
||||
// Group hits by exon index.
//   list:      hit records; fields 0/1 are the contig and gene names, 2 the exon
//              index, 3/4 the target start/end, 5 and 6 two penalty terms
//   flt_flag:  optional array indexed by contig id; contigs whose flag equals 1
//              are skipped
//   perf_hash: optional map of gene id -> truthy; when given, only those genes
//              are kept
// Returns an array indexed by exon whose entries are lists of
// [contig id, gene id, summed penalty, aligned target length].
function list2exons(list, flt_flag, perf_hash)
{
	var by_exon = [];
	for (var k = 0; k < list.length; ++k) {
		var hit = list[k];
		var ctg_id = chash[hit[0]], gene_id = ghash[hit[1]];
		var filtered = flt_flag != null && flt_flag[ctg_id] == 1;
		var not_perfect = perf_hash != null && !perf_hash[gene_id];
		if (filtered || not_perfect) continue;
		var e = hit[2];
		if (by_exon[e] == null) by_exon[e] = [];
		by_exon[e].push([ctg_id, gene_id, hit[5] + hit[6], hit[4] - hit[3]]);
	}
	return by_exon;
}
|
||||
|
||||
var exons = list2exons(list), flt_flag = [], ovlp_len = [];
|
||||
for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0;
|
||||
for (var e = 0; e < exons.length; ++e) {
|
||||
if (!pri_exon[e]) continue;
|
||||
var ee = exons[e];
|
||||
var max_len = [];
|
||||
for (var c = 0; c < clist.length; ++c) max_len[c] = 0;
|
||||
for (var i = 0; i < ee.length; ++i) {
|
||||
var l = ee[i][3] - ee[i][2];
|
||||
if (l < 1) l = 1;
|
||||
if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l;
|
||||
}
|
||||
for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c];
|
||||
for (var i = 0; i < ee.length; ++i)
|
||||
flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 1 : 1<<1;
|
||||
}
|
||||
|
||||
var l_cons = 0, l_incons = 0;
|
||||
for (var c = 0; c < clist.length; ++c)
|
||||
if (flt_flag[c]&2) l_cons += ovlp_len[c];
|
||||
else if (flt_flag[c] == 1) l_incons += ovlp_len[c];
|
||||
|
||||
warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons);
|
||||
var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac);
|
||||
|
||||
/********************************
|
||||
* Core function for genotyping *
|
||||
********************************/
|
||||
|
||||
function type_gene(perf_mode)
|
||||
{
|
||||
if (perf_mode) {
|
||||
var flt_list = [];
|
||||
for (var c = 0; c < clist.length; ++c)
|
||||
if (flt_flag[c] == 1) flt_list.push(clist[c]);
|
||||
warn(" - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]");
|
||||
exons = list2exons(list, flt_flag, h_perf_genes);
|
||||
} else exons = list2exons(list);
|
||||
|
||||
/***********************
|
||||
* Score each genotype *
|
||||
***********************/
|
||||
|
||||
// initialize genotype scores
|
||||
var pair = [];
|
||||
for (var i = 0; i < glist.length; ++i) {
|
||||
pair[i] = [];
|
||||
for (var j = 0; j <= i; ++j)
|
||||
pair[i][j] = 0;
|
||||
}
|
||||
|
||||
// these two arrays are used to output debugging information
|
||||
var score = [], ctg = [];
|
||||
|
||||
function type_exon(e, gt_list)
|
||||
{
|
||||
// Fold the mismatch count m of one exon into a packed genotype score x.
// Layout of x: bits 22 and up hold the sum over primary exons, bits 14-21 the
// sum over all exons (both saturate at 0xff), and the low 14 bits hold two
// counters: each call adds 1<<6 to them, and is_pri is OR-ed into that
// increment.
function update_pair(x, m, is_pri)
{
	function capped(v) { return v < 0xff? v : 0xff; }
	var all_nm = capped(((x >> 14) & 0xff) + m);
	var pri_nm = is_pri? capped((x >> 22) + m) : x >> 22;
	var counters = (x & 0x3fff) + ((1 << 6) | is_pri);
	return (pri_nm << 22) | (all_nm << 14) | counters;
}
|
||||
|
||||
score[e] = []; ctg[e] = [];
|
||||
if (exons[e] == null) return;
|
||||
var ee = exons[e], is_pri = pri_exon[e]? 1 : 0;
|
||||
// find contigs and genes associated with the current exon
|
||||
var ch = {}, gh = {};
|
||||
for (var i = 0; i < ee.length; ++i)
|
||||
if (elist[ee[i][1]][e] != null)
|
||||
ch[ee[i][0]] = true, gh[ee[i][1]] = true;
|
||||
var ga = [], ca = ctg[e];
|
||||
for (var c in ch) ca.push(parseInt(c));
|
||||
for (var g in gh) ga.push(parseInt(g));
|
||||
var named_ca = [];
|
||||
for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]);
|
||||
warn(" - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])...");
|
||||
// set unmapped entries to high mismatch
|
||||
var sc = score[e];
|
||||
for (var k = 0; k < ga.length; ++k) {
|
||||
var g = ga[k];
|
||||
if (sc[g] == null) sc[g] = [];
|
||||
for (var i = 0; i < ca.length; ++i)
|
||||
sc[g][ca[i]] = 0xff;
|
||||
}
|
||||
// convert representation again and compute max_len[]
|
||||
var max_len = [];
|
||||
for (var i = 0; i < ee.length; ++i) {
|
||||
var c = ee[i][0], g = ee[i][1];
|
||||
if (gh[g] == null || ch[c] == null) continue;
|
||||
sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2];
|
||||
if (max_len[c] == null) max_len[c] = 0;
|
||||
max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3];
|
||||
}
|
||||
// drop mismapped contigs
|
||||
var max_max_len = 0;
|
||||
for (var k = 0; k < ca.length; ++k)
|
||||
max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]];
|
||||
var dropped = [];
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var min = 0x7fffffff, c = ca[k];
|
||||
for (var i = 0; i < ga.length; ++i) {
|
||||
var g = ga[i];
|
||||
min = min < sc[g][c]? min : sc[g][c];
|
||||
}
|
||||
dropped[c] = min > thres_nm? true : false;
|
||||
if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true;
|
||||
if (dropped[c]) warn(" . Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")");
|
||||
}
|
||||
// fill the pair array
|
||||
if (gt_list == null) {
|
||||
for (var i = 0; i < ga.length; ++i) {
|
||||
var m = 0, gi = ga[i], g1 = sc[gi];
|
||||
// homozygous
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) m += g1[c];
|
||||
}
|
||||
pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri);
|
||||
// heterozygous
|
||||
for (var j = i + 1; j < ga.length; ++j) {
|
||||
var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0];
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) {
|
||||
m += g1[c] < g2[c]? g1[c] : g2[c];
|
||||
++a[g1[c]<g2[c]? 0:1];
|
||||
}
|
||||
}
|
||||
if (a[0] == 0 || a[1] == 0) m = 0xff; // if all contigs are assigned to one gene, it is not good
|
||||
if (gi < gj) pair[gj][gi] = update_pair(pair[gj][gi], m, is_pri);
|
||||
else pair[gi][gj] = update_pair(pair[gi][gj], m, is_pri);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
var tmp_pairs = [], min = 0xff;
|
||||
for (var i = 0; i < gt_list.length; ++i) {
|
||||
var gt = gt_list[i], m = 0;
|
||||
var g1 = sc[gt[0]], g2 = sc[gt[1]], a = [0, 0];
|
||||
if (g1 == null || g2 == null) continue;
|
||||
if (gt[0] == gt[1]) {
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) m += g1[c];
|
||||
}
|
||||
} else {
|
||||
var a = [0, 0];
|
||||
for (k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) {
|
||||
m += g1[c] < g2[c]? g1[c] : g2[c];
|
||||
++a[g1[c]<g2[c]? 0:1];
|
||||
}
|
||||
}
|
||||
if (a[0] == 0 || a[1] == 0) m = 0xff;
|
||||
}
|
||||
tmp_pairs.push([gt[0], gt[1], m]);
|
||||
min = min < m? min : m;
|
||||
}
|
||||
if (min < 0xff) {
|
||||
for (var i = 0; i < tmp_pairs.length; ++i) {
|
||||
var t = tmp_pairs[i];
|
||||
pair[t[0]][t[1]] = update_pair(pair[t[0]][t[1]], t[2], is_pri);
|
||||
}
|
||||
} else warn(" . Skipped exon " +(e+1)+ " as the assembly may be incomplete");
|
||||
}
|
||||
}
|
||||
|
||||
// type primary exons
|
||||
warn(" - Processing primary exon(s)...");
|
||||
for (var e = 0; e < exons.length; ++e)
|
||||
if (pri_exon[e]) type_exon(e);
|
||||
|
||||
// generate the list of best genotypes on primary exons
|
||||
var min_nm_pri = 0x7fffffff;
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons)
|
||||
min_nm_pri = min_nm_pri < pair[i][j]>>22? min_nm_pri : pair[i][j]>>22;
|
||||
|
||||
var gt_list = [];
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri)
|
||||
gt_list.push([i, j]);
|
||||
|
||||
warn(" - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri);
|
||||
|
||||
// type other exons
|
||||
warn(" - Processing other exon(s)...");
|
||||
for (var e = 0; e < exons.length; ++e)
|
||||
if (!pri_exon[e]) type_exon(e, gt_list);
|
||||
|
||||
/*****************************
|
||||
* Choose the best genotypes *
|
||||
*****************************/
|
||||
|
||||
// genotyping
|
||||
var min_nm = 0x7fffffff;
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons)
|
||||
min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14;
|
||||
|
||||
var out = [];
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1)
|
||||
out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]);
|
||||
|
||||
out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? a[2]-b[2] : a[3]-b[3]});
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/**********************
|
||||
* Perform genotyping *
|
||||
**********************/
|
||||
|
||||
warn("- Typing in the imperfect mode...");
|
||||
var rst = type_gene(false);
|
||||
if (attempt_perf) {
|
||||
warn("- Typing in the perfect mode...");
|
||||
var rst_perf = type_gene(true);
|
||||
warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]");
|
||||
if (rst_perf[0][0] < rst[0][0]) {
|
||||
warn("- Chose the result from the perfect mode");
|
||||
rst = rst_perf;
|
||||
} else warn("- Chose the result from the imperfect mode");
|
||||
} else warn("- Perfect mode is not attempted");
|
||||
|
||||
/**********
|
||||
* Output *
|
||||
**********/
|
||||
|
||||
for (var i = 0; i < rst.length; ++i)
|
||||
print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]);
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
#!/bin/bash
|
||||
|
||||
is_ctg=0
|
||||
|
||||
if [ $# -gt 1 ] && [ $1 == '-A' ]; then
|
||||
is_ctg=1
|
||||
shift
|
||||
fi
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
echo "Usage: $0 [-A] <prefix> <gene>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
preres="resource-human-HLA"
|
||||
root=`dirname $0`
|
||||
pre=$1.$2
|
||||
touch $pre.gt
|
||||
|
||||
if [ ! -s $pre.fq ]; then
|
||||
echo '** Empty input file. Abort!' >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Produce $pre.tmp.mag.gz, the contig file read by the following steps:
# either assemble the reads de novo, or (with -A) reuse the input as contigs.
if [ $is_ctg -eq 0 ]; then
	echo "** De novo assembling..." >&2
	# mean of column 2 of "seqtk comp", rounded; passed to fermi2.pl as -l
	len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'`
	$root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak
	make -f $pre.tmp.mak >&2
	cp $pre.tmp.mag.gz $pre.mag.gz
else
	rm -f $pre.tmp.mag.gz
	# A symlink target is resolved relative to the directory holding the link.
	# Both files live in the same directory, so link to the bare file name;
	# using "$pre.fq" as the target dangles when the prefix has a relative
	# directory component.
	ln -s "`basename $pre.fq`" $pre.tmp.mag.gz
fi
|
||||
|
||||
echo "** Selecting contigs overlapping target exons..." >&2
|
||||
(ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz
|
||||
$root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz
|
||||
|
||||
echo "** Mapping exons to de novo contigs..." >&2
|
||||
$root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null
|
||||
$root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz
|
||||
|
||||
echo "** Typing..." >&2
|
||||
$root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt
|
||||
|
||||
# delete temporary files
|
||||
rm -f $pre.tmp.*
|
||||
[ $is_ctg -eq 1 ] && rm -f $pre.mag.gz
|
||||
|
|
@ -0,0 +1,213 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#ifndef BWAMEM_H_
|
||||
#define BWAMEM_H_
|
||||
|
||||
#include "bwt.h"
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
|
||||
#define MEM_MAPQ_COEF 30.0
|
||||
#define MEM_MAPQ_MAX 60
|
||||
|
||||
struct __smem_i;
|
||||
typedef struct __smem_i smem_i;
|
||||
|
||||
#define MEM_F_PE 0x2
|
||||
#define MEM_F_NOPAIRING 0x4
|
||||
#define MEM_F_ALL 0x8
|
||||
#define MEM_F_NO_MULTI 0x10
|
||||
#define MEM_F_NO_RESCUE 0x20
|
||||
#define MEM_F_REF_HDR 0x100
|
||||
#define MEM_F_SOFTCLIP 0x200
|
||||
#define MEM_F_SMARTPE 0x400
|
||||
#define MEM_F_PRIMARY5 0x800
|
||||
#define MEM_F_KEEP_SUPP_MAPQ 0x1000
|
||||
#define MEM_F_XB 0x2000
|
||||
|
||||
// Options record (mem_opt_t): scoring parameters, seeding/chaining thresholds,
// threading/batch settings and output limits, as described field by field below.
// NOTE(review): default values are not set in this header - confirm where they
// are initialized before relying on any particular value.
typedef struct {
|
||||
int a, b; // match score and mismatch penalty
|
||||
int o_del, e_del; // presumably gap open/extension penalties for deletions - TODO confirm
|
||||
int o_ins, e_ins; // presumably gap open/extension penalties for insertions - TODO confirm
|
||||
int pen_unpaired; // phred-scaled penalty for unpaired reads
|
||||
int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
|
||||
int w; // band width
|
||||
int zdrop; // Z-dropoff
|
||||
|
||||
uint64_t max_mem_intv;
|
||||
|
||||
int T; // output score threshold; only affecting output
|
||||
int flag; // see MEM_F_* macros
|
||||
int min_seed_len; // minimum seed length
|
||||
int min_chain_weight;
|
||||
int max_chain_extend;
|
||||
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
||||
int split_width; // split into a seed if its occurrence is smaller than this value
|
||||
int max_occ; // skip a seed if its occurrence is larger than this value
|
||||
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
|
||||
int n_threads; // number of threads
|
||||
int chunk_size; // process chunk_size-bp sequences in a batch
|
||||
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
|
||||
float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
|
||||
float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
|
||||
float mask_level_redun;
|
||||
float mapQ_coef_len;
|
||||
int mapQ_coef_fac;
|
||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all (NOTE(review): the _alt variant presumably applies to ALT hits - confirm)
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
} mem_opt_t;
|
||||
|
||||
/* One aligned region of a query. */
typedef struct {
    int64_t rb;         // [rb,re): reference interval of the alignment
    int64_t re;
    int qb;             // [qb,qe): query interval of the alignment
    int qe;
    int rid;            // reference seq ID
    int score;          // best local SW score
    int truesc;         // actual score of the aligned region; possibly smaller than $score
    int sub;            // 2nd best SW score
    int alt_sc;
    int csub;           // SW score of a tandem hit
    int sub_n;          // approximate number of suboptimal hits
    int w;              // actual band width used in extension
    int seedcov;        // length of regions covered by seeds
    int secondary;      // index of the parent hit shadowing the current hit; <0 if primary
    int secondary_all;
    int seedlen0;       // length of the starting seed
    int n_comp:30;      // number of sub-alignments chained together
    int is_alt:2;
    float frac_rep;
    uint64_t hash;
} mem_alnreg_t;

/* Growable array of aligned regions: n used, m allocated, a the storage. */
typedef struct {
    size_t n, m;
    mem_alnreg_t *a;
} mem_alnreg_v;
|
||||
|
||||
/* Insert-size statistics for one read-pair orientation. */
typedef struct {
    int low;        // lower bound within which a read pair is considered properly paired
    int high;       // upper bound of the same interval
    int failed;     // non-zero if the orientation is not supported by sufficient data
    double avg;     // mean of the insert size distribution
    double std;     // standard deviation of the insert size distribution
} mem_pestat_t;
|
||||
|
||||
/* Final alignment record; this struct is only used for the convenience of the API. */
typedef struct {
    int64_t pos;        // forward strand 5'-end mapping position
    int rid;            // reference sequence index in bntseq_t; <0 for unmapped
    int flag;           // extra flag
    uint32_t is_rev:1;  // whether on the reverse strand
    uint32_t is_alt:1;
    uint32_t mapq:8;    // mapping quality
    uint32_t NM:22;     // edit distance
    int n_cigar;        // number of CIGAR operations
    uint32_t *cigar;    // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
    char *XA;           // alternative mappings

    int score;
    int sub;
    int alt_sc;
} mem_aln_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
smem_i *smem_itr_init(const bwt_t *bwt);
|
||||
void smem_itr_destroy(smem_i *itr);
|
||||
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
||||
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
|
||||
const bwtintv_v *smem_next(smem_i *itr);
|
||||
|
||||
mem_opt_t *mem_opt_init(void);
|
||||
void mem_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
|
||||
/**
|
||||
* Align a batch of sequences and generate the alignments in the SAM format
|
||||
*
|
||||
* This routine requires $seqs[i].{l_seq,seq,name} and writes $seqs[i].sam.
|
||||
* Note that $seqs[i].sam may consist of several SAM lines if the
|
||||
* corresponding sequence has multiple primary hits.
|
||||
*
|
||||
* In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
|
||||
* sequences must be interleaved: $n must be an even number and the 2i-th
|
||||
* sequence and the (2i+1)-th sequence constitute a read pair. In this
|
||||
* mode, there should be enough (typically >50) unique pairs for the
|
||||
* routine to infer the orientation and insert size.
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param bwt FM-index of the reference sequence
|
||||
* @param bns Information of the reference
|
||||
* @param pac 2-bit encoded reference
|
||||
* @param n number of query sequences
|
||||
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
|
||||
* @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
|
||||
* corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
|
||||
*/
|
||||
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
|
||||
|
||||
/**
|
||||
* Find the aligned regions for one query sequence
|
||||
*
|
||||
* Note that this routine does not generate CIGAR. CIGAR should be
|
||||
* generated later by mem_reg2aln() below.
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param bwt FM-index of the reference sequence
|
||||
* @param bns Information of the reference
|
||||
* @param pac 2-bit encoded reference
|
||||
* @param l_seq length of query sequence
|
||||
* @param seq query sequence
|
||||
*
|
||||
* @return list of aligned regions.
|
||||
*/
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq);
|
||||
|
||||
/**
|
||||
* Generate CIGAR and forward-strand position from alignment region
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param bns Information of the reference
|
||||
* @param pac 2-bit encoded reference
|
||||
* @param l_seq length of query sequence
|
||||
* @param seq query sequence
|
||||
* @param ar one alignment region
|
||||
*
|
||||
* @return CIGAR, strand, mapping quality and forward-strand position
|
||||
*/
|
||||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
|
||||
mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name);
|
||||
|
||||
/**
|
||||
* Infer the insert size distribution from interleaved alignment regions
|
||||
*
|
||||
* This function can be called after mem_align1(), as long as paired-end
|
||||
* reads are properly interleaved.
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param l_pac length of concatenated reference sequence
|
||||
* @param n number of query sequences; must be an even number
|
||||
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
|
||||
* @param pes inferred insert size distribution (output)
|
||||
*/
|
||||
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,172 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <limits.h>
|
||||
#include "bwa.h"
|
||||
#include "bwamem.h"
|
||||
#include "bntseq.h"
|
||||
#include "kstring.h"
|
||||
|
||||
/***************************
|
||||
* SMEM iterator interface *
|
||||
***************************/
|
||||
|
||||
struct __smem_i {
|
||||
const bwt_t *bwt;
|
||||
const uint8_t *query;
|
||||
int start, len;
|
||||
int min_intv, max_len;
|
||||
uint64_t max_intv;
|
||||
bwtintv_v *matches; // matches; to be returned by smem_next()
|
||||
bwtintv_v *sub; // sub-matches inside the longest match; temporary
|
||||
bwtintv_v *tmpvec[2]; // temporary arrays
|
||||
};
|
||||
|
||||
smem_i *smem_itr_init(const bwt_t *bwt)
|
||||
{
|
||||
smem_i *itr;
|
||||
itr = calloc(1, sizeof(smem_i));
|
||||
itr->bwt = bwt;
|
||||
itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
|
||||
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
|
||||
itr->matches = calloc(1, sizeof(bwtintv_v));
|
||||
itr->sub = calloc(1, sizeof(bwtintv_v));
|
||||
itr->min_intv = 1;
|
||||
itr->max_len = INT_MAX;
|
||||
itr->max_intv = 0;
|
||||
return itr;
|
||||
}
|
||||
|
||||
/**
 * Release an iterator created by smem_itr_init(), including all of its
 * internal vectors. The index the iterator refers to is not freed.
 *
 * @param itr   iterator to free; a null pointer is accepted and ignored,
 *              mirroring free()
 */
void smem_itr_destroy(smem_i *itr)
{
    int i;
    if (itr == 0) return;
    for (i = 0; i < 2; ++i) {
        if (itr->tmpvec[i] == 0) continue;
        free(itr->tmpvec[i]->a); free(itr->tmpvec[i]);
    }
    if (itr->matches) { free(itr->matches->a); free(itr->matches); }
    if (itr->sub) { free(itr->sub->a); free(itr->sub); }
    free(itr);
}
|
||||
|
||||
/**
 * Attach a query to the iterator and rewind it to the first base.
 *
 * @param itr    iterator
 * @param len    length of $query
 * @param query  query sequence; only the pointer is stored
 */
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
{
    itr->len = len;
    itr->query = query;
    itr->start = 0; // the next smem_next() call begins at the first base
}
|
||||
|
||||
/**
 * Override the search parameters set by smem_itr_init().
 *
 * @param itr       iterator
 * @param min_intv  stored as itr->min_intv (forwarded to bwt_smem1a() by smem_next())
 * @param max_len   stored as itr->max_len
 * @param max_intv  stored as itr->max_intv (forwarded to bwt_smem1a() by smem_next())
 */
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
{
    itr->max_intv = max_intv;
    itr->max_len = max_len;
    itr->min_intv = min_intv;
}
|
||||
|
||||
/**
 * Advance the iterator and return the matches found from the current position.
 *
 * All internal vectors are emptied first. Bases with a code above 3
 * (ambiguous bases) are skipped before the search starts.
 *
 * @param itr   iterator with a query attached by smem_set_query()
 *
 * @return      itr->matches (owned by the iterator and overwritten by the
 *              next call), or 0 when the query is exhausted
 */
const bwtintv_v *smem_next(smem_i *itr)
{
    int pos;
    itr->sub->n = 0;
    itr->matches->n = 0;
    itr->tmpvec[1]->n = 0;
    itr->tmpvec[0]->n = 0;
    pos = itr->start;
    if (pos < 0 || pos >= itr->len) return 0;
    for (; pos < itr->len; ++pos) // skip ambiguous bases
        if (itr->query[pos] <= 3) break;
    itr->start = pos;
    if (pos == itr->len) return 0;
    // search for SMEM; bwt_smem1a() returns the position to resume from
    itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, pos, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec);
    return itr->matches;
}
|
||||
|
||||
/***********************
|
||||
*** Extra functions ***
|
||||
***********************/
|
||||
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
|
||||
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
|
||||
extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
|
||||
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
|
||||
mem_alnreg_v ar;
|
||||
char *seq;
|
||||
seq = malloc(l_seq);
|
||||
memcpy(seq, seq_, l_seq); // makes a copy of seq_
|
||||
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
|
||||
mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
|
||||
free(seq);
|
||||
return ar;
|
||||
}
|
||||
|
||||
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
|
||||
{
|
||||
int k = a[i].secondary_all;
|
||||
if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
|
||||
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
|
||||
{
|
||||
int i, k, r, *cnt, tot;
|
||||
kstring_t *aln = 0, str = {0,0,0};
|
||||
char **XA = 0, *has_alt;
|
||||
|
||||
cnt = calloc(a->n, sizeof(int));
|
||||
has_alt = calloc(a->n, 1);
|
||||
for (i = 0, tot = 0; i < a->n; ++i) {
|
||||
r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
|
||||
if (r >= 0) {
|
||||
++cnt[r], ++tot;
|
||||
if (a->a[i].is_alt) has_alt[r] = 1;
|
||||
}
|
||||
}
|
||||
if (tot == 0) goto end_gen_alt;
|
||||
aln = calloc(a->n, sizeof(kstring_t));
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
mem_aln_t t;
|
||||
if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
|
||||
if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
|
||||
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
|
||||
str.l = 0;
|
||||
kputs(bns->anns[t.rid].name, &str);
|
||||
kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str);
|
||||
kputc(',', &str);
|
||||
for (k = 0; k < t.n_cigar; ++k) {
|
||||
kputw(t.cigar[k]>>4, &str);
|
||||
kputc("MIDSHN"[t.cigar[k]&0xf], &str);
|
||||
}
|
||||
kputc(',', &str); kputw(t.NM, &str);
|
||||
if (opt->flag & MEM_F_XB) {
|
||||
kputc(',', &str);
|
||||
kputw(t.score, &str);
|
||||
kputc(',', &str);
|
||||
kputw(t.mapq, &str);
|
||||
}
|
||||
kputc(';', &str);
|
||||
free(t.cigar);
|
||||
kputsn(str.s, str.l, &aln[r]);
|
||||
}
|
||||
XA = calloc(a->n, sizeof(char*));
|
||||
for (k = 0; k < a->n; ++k)
|
||||
XA[k] = aln[k].s;
|
||||
|
||||
end_gen_alt:
|
||||
free(has_alt); free(cnt); free(aln); free(str.s);
|
||||
return XA;
|
||||
}
|
||||
|
|
@ -0,0 +1,419 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "kstring.h"
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "utils.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
|
||||
/* Tuning constants for the insert-size estimation in mem_pestat(). */
#define MIN_RATIO     0.8   /* skip a pair if an end's cal_sub() score exceeds this fraction of its best score */
#define MIN_DIR_CNT   10    /* an orientation with fewer candidate pairs is marked failed */
#define MIN_DIR_RATIO 0.05  /* an orientation with fewer than this fraction of the largest count is marked failed */
#define OUTLIER_BOUND 2.0   /* inter-quartile multiplier bounding the values used for mean and std.dev */
#define MAPPING_BOUND 3.0   /* inter-quartile multiplier for the proper-pair boundaries */
#define MAX_STDDEV    4.0   /* proper-pair boundaries are widened to at least avg +/- this many std.dev */
|
||||
|
||||
/*
 * Classify the relative placement of two positions given in the doubled
 * coordinate space [0, 2*l_pac), where a position >= l_pac counts as being
 * on the other strand.
 *
 * b2 is first projected onto the strand of b1; *dist receives the absolute
 * distance between b1 and that projection. The return value is a 2-bit
 * code: bit 0 starts as 1 when the strands differ, and the whole code is
 * XOR-ed with 3 when the projected b2 does not lie beyond b1.
 */
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
{
    const int same_strand = (b1 >= l_pac) == (b2 >= l_pac);
    int64_t mate = b2; // coordinate of read 2 on the read 1 strand
    int dir = same_strand ? 0 : 1;

    if (!same_strand) mate = (l_pac<<1) - 1 - b2;
    if (mate > b1) {
        *dist = mate - b1;
    } else {
        *dist = b1 - mate;
        dir ^= 3;
    }
    return dir;
}
|
||||
|
||||
/*
 * Score of the first hit after r->a[0] whose query interval overlaps that of
 * r->a[0] by at least opt->mask_level times the shorter of the two
 * intervals. If no such hit exists, opt->min_seed_len * opt->a is returned.
 * mem_pestat() compares this value with the best score to choose unique
 * alignments.
 */
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
{
    int j;
    for (j = 1; j < r->n; ++j) { // choose unique alignment
        const mem_alnreg_t *top = &r->a[0], *cur = &r->a[j];
        int ov_beg = cur->qb > top->qb ? cur->qb : top->qb;
        int ov_end = cur->qe < top->qe ? cur->qe : top->qe;
        int len_cur, len_top, shorter;
        if (ov_end <= ov_beg) continue; // no overlap
        len_cur = cur->qe - cur->qb;
        len_top = top->qe - top->qb;
        shorter = len_cur < len_top ? len_cur : len_top;
        if (ov_end - ov_beg >= shorter * opt->mask_level) break; // significant overlap
    }
    if (j < r->n) return r->a[j].score;
    return opt->min_seed_len * opt->a;
}
|
||||
|
||||
/**
 * Infer the insert size distribution from interleaved alignment regions.
 *
 * For every pair (regs[2i], regs[2i+1]) whose two best hits pass the
 * cal_sub() uniqueness test and lie on the same reference sequence, the
 * distance returned by mem_infer_dir() is recorded under the inferred
 * orientation if it is non-zero and at most opt->max_ins. Each orientation
 * with at least MIN_DIR_CNT samples then gets quartile-based boundaries,
 * mean and standard deviation; the others are marked failed.
 *
 * Progress messages are written to stderr.
 *
 * @param opt    alignment parameters
 * @param l_pac  length of concatenated reference sequence
 * @param n      number of query sequences; must be an even number
 * @param regs   region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
 * @param pes    inferred insert size distribution (output); fully overwritten
 */
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{
    int i, d, max;
    uint64_v isize[4];
    memset(pes, 0, 4 * sizeof(mem_pestat_t));
    memset(isize, 0, sizeof(isize)); // size taken from the array itself, not from an unrelated vector type
    for (i = 0; i < n>>1; ++i) {
        int dir;
        int64_t is;
        mem_alnreg_v *r[2];
        r[0] = (mem_alnreg_v*)&regs[i<<1|0];
        r[1] = (mem_alnreg_v*)&regs[i<<1|1];
        if (r[0]->n == 0 || r[1]->n == 0) continue;
        if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
        if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
        if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
        dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
        if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
    }
    // the counts are size_t; cast them so that they match the %ld conversions
    if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, (long)isize[0].n, (long)isize[1].n, (long)isize[2].n, (long)isize[3].n);
    for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
        mem_pestat_t *r = &pes[d];
        uint64_v *q = &isize[d];
        int p25, p50, p75, x;
        if (q->n < MIN_DIR_CNT) {
            fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
            r->failed = 1;
            free(q->a);
            continue;
        } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
        ks_introsort_64(q->n, q->a);
        p25 = q->a[(int)(.25 * q->n + .499)];
        p50 = q->a[(int)(.50 * q->n + .499)];
        p75 = q->a[(int)(.75 * q->n + .499)];
        // first pass: OUTLIER_BOUND inter-quartile ranges around the quartiles select the values used for mean/std
        r->low  = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
        if (r->low < 1) r->low = 1;
        r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
        fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
        fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
        for (i = x = 0, r->avg = 0; i < q->n; ++i)
            if (q->a[i] >= r->low && q->a[i] <= r->high)
                r->avg += q->a[i], ++x;
        r->avg /= x;
        for (i = 0, r->std = 0; i < q->n; ++i)
            if (q->a[i] >= r->low && q->a[i] <= r->high)
                r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
        r->std = sqrt(r->std / x);
        fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
        // second pass: proper-pair boundaries, widened to at least avg +/- MAX_STDDEV * std
        r->low  = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
        r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
        if (r->low  > r->avg - MAX_STDDEV * r->std) r->low  = (int)(r->avg - MAX_STDDEV * r->std + .499);
        if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
        if (r->low < 1) r->low = 1;
        fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
        free(q->a);
    }
    // only the counts are read below; the sample arrays have already been freed
    for (d = 0, max = 0; d < 4; ++d)
        max = max > isize[d].n? max : isize[d].n;
    for (d = 0; d < 4; ++d)
        if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
            pes[d].failed = 1;
            fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
        }
}
|
||||
|
||||
/*
 * Mate rescue: for each orientation that has not failed and has no hit in
 * $ma within the proper-pair distance of $a, align the mate sequence $ms
 * with ksw_align2() against the reference window implied by pes[] and add
 * any sufficiently high-scoring hit to $ma.
 *
 * opt, bns, pac: alignment parameters and reference
 * pes:           insert-size statistics for the four orientations
 * a:             anchoring alignment of one end
 * l_ms, ms:      length and sequence of the mate (bases with code >= 4 are ambiguous)
 * ma:            hits of the mate; updated in place, new hits inserted in
 *                score order and the vector passed to mem_sort_dedup_patch()
 *
 * Returns the number of windows on which the alignment was run.
 */
int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
{
    extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
    int64_t l_pac = bns->l_pac;
    int i, r, skip[4], n = 0, rid;
    for (r = 0; r < 4; ++r)
        skip[r] = pes[r].failed? 1 : 0;
    for (i = 0; i < ma->n; ++i) { // check which orientation has been found
        int64_t dist;
        r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
        if (dist >= pes[r].low && dist <= pes[r].high)
            skip[r] = 1;
    }
    if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW
    for (r = 0; r < 4; ++r) {
        int is_rev, is_larger;
        uint8_t *seq, *rev = 0, *ref = 0;
        int64_t rb, re;
        if (skip[r]) continue;
        rid = -1; // bns_fetch_seq() may be skipped below; never compare against a stale or uninitialised id
        is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
        is_larger = !(r>>1); // whether the mate has larger coordinate
        if (is_rev) {
            rev = malloc(l_ms); // this is the reverse complement of $ms
            for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
            seq = rev;
        } else seq = (uint8_t*)ms;
        if (!is_rev) {
            rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
            re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
        } else {
            rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
            re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
        }
        if (rb < 0) rb = 0;
        if (re > l_pac<<1) re = l_pac<<1;
        if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
        if (ref != 0 && a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening
            kswr_t aln;
            mem_alnreg_t b;
            int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
            aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
            memset(&b, 0, sizeof(mem_alnreg_t));
            if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
                b.rid = a->rid;
                b.is_alt = a->is_alt;
                // map coordinates back to the original mate and the doubled reference space
                b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
                b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
                b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
                b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1;
                b.score = aln.score;
                b.csub = aln.score2;
                b.secondary = -1;
                b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
                kv_push(mem_alnreg_t, *ma, b); // make room for a new element
                // move b s.t. ma is sorted
                for (i = 0; i < ma->n - 1; ++i) // find the insertion point
                    if (ma->a[i].score < b.score) break;
                tmp = i;
                for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
                ma->a[i] = b;
            }
            ++n;
        }
        if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
        free(rev);
        free(ref);
    }
    return n;
}
|
||||
|
||||
/*
 * Pair the primary hits of the two ends.
 *
 * The first n_pri[r] hits of each end are turned into sortable keys
 * (x: reference id and forward position; y: score, hit index, strand bit and
 * read number) and sorted. Every hit is then matched against earlier hits of
 * the other read whose distance lies within [pes[dir].low, pes[dir].high]
 * for an orientation that has not failed; each such pair is scored from the
 * two hit scores plus a term derived from the insert-size deviation.
 *
 * On return z[] holds the hit indices of the best pair (written only when a
 * pair was found), *sub the score of the second best pair and *n_sub the
 * number of other pairs scoring within one mismatch/gap cost of *sub.
 *
 * Returns the score of the best pair, or 0 if no proper pair exists.
 */
int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
{
    pair64_v v, u;
    int r, i, k, ret;
    int last[4]; // last[] keeps the last hit seen for each (direction, read) class
    int64_t l_pac = bns->l_pac;

    kv_init(v); kv_init(u);
    for (r = 0; r < 2; ++r) { // loop through read number
        for (i = 0; i < n_pri[r]; ++i) {
            pair64_t key;
            mem_alnreg_t *e = &a[r].a[i];
            key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
            key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset);
            key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
            kv_push(pair64_t, v, key);
        }
    }
    ks_introsort_128(v.n, v.a);
    for (r = 0; r < 4; ++r) last[r] = -1;
    for (i = 0; i < v.n; ++i) {
        for (r = 0; r < 2; ++r) { // loop through direction
            int dir = r<<1 | (v.a[i].y>>1&1), which;
            if (pes[dir].failed) continue; // invalid orientation
            which = r<<1 | ((v.a[i].y&1)^1);
            if (last[which] < 0) continue; // no previous hits
            // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
            for (k = last[which]; k >= 0; --k) {
                int64_t dist;
                int q;
                double ns;
                pair64_t *p;
                if ((v.a[k].y&3) != which) continue;
                dist = (int64_t)v.a[i].x - v.a[k].x;
                if (dist > pes[dir].high) break;
                if (dist < pes[dir].low) continue;
                ns = (dist - pes[dir].avg) / pes[dir].std;
                q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4)
                if (q < 0) q = 0;
                p = kv_pushp(pair64_t, u);
                p->y = (uint64_t)k<<32 | i;
                p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
            }
        }
        last[v.a[i].y&3] = i;
    }

    ret = 0, *sub = 0, *n_sub = 0;
    if (u.n) { // found at least one proper pair
        int gap_del = opt->o_del + opt->e_del;
        int gap_ins = opt->o_ins + opt->e_ins;
        int tmp = opt->a + opt->b;
        if (tmp <= gap_del) tmp = gap_del;
        if (tmp <= gap_ins) tmp = gap_ins;
        ks_introsort_128(u.n, u.a);
        i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
        z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
        z[v.a[k].y&1] = v.a[k].y<<32>>34;
        ret = u.a[u.n-1].x >> 32;
        *sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
        for (i = (long)u.n - 2; i >= 0; --i)
            if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
    }
    free(u.a); free(v.a);
    return ret;
}
|
||||
|
||||
void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
|
||||
void mem_reorder_primary5(int T, mem_alnreg_v *a);
|
||||
|
||||
// Convert a score difference $diff, scaled by the match score $a, into an integer quality: 6.02 * diff / a, with .499 added before truncation to int
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
|
||||
|
||||
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
|
||||
{
|
||||
extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
|
||||
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
|
||||
extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
|
||||
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
|
||||
|
||||
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
|
||||
kstring_t str;
|
||||
mem_aln_t h[2], g[2], aa[2][2];
|
||||
|
||||
str.l = str.m = 0; str.s = 0;
|
||||
memset(h, 0, sizeof(mem_aln_t) * 2);
|
||||
memset(g, 0, sizeof(mem_aln_t) * 2);
|
||||
n_aa[0] = n_aa[1] = 0;
|
||||
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
|
||||
mem_alnreg_v b[2];
|
||||
kv_init(b[0]); kv_init(b[1]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < a[i].n; ++j)
|
||||
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
|
||||
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
||||
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
|
||||
free(b[0].a); free(b[1].a);
|
||||
}
|
||||
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
|
||||
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
|
||||
if (opt->flag & MEM_F_PRIMARY5) {
|
||||
mem_reorder_primary5(opt->T, &a[0]);
|
||||
mem_reorder_primary5(opt->T, &a[1]);
|
||||
}
|
||||
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
|
||||
// pairing single-end hits
|
||||
if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
|
||||
int is_multi[2], q_pe, score_un, q_se[2];
|
||||
char **XA[2];
|
||||
// check if an end has multiple hits even after mate-SW
|
||||
for (i = 0; i < 2; ++i) {
|
||||
for (j = 1; j < n_pri[i]; ++j)
|
||||
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
|
||||
is_multi[i] = j < n_pri[i]? 1 : 0;
|
||||
}
|
||||
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
|
||||
// compute mapQ for the best SE hit
|
||||
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
|
||||
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
|
||||
subo = subo > score_un? subo : score_un;
|
||||
q_pe = raw_mapq(o - subo, opt->a);
|
||||
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
|
||||
if (q_pe < 0) q_pe = 0;
|
||||
if (q_pe > 60) q_pe = 60;
|
||||
q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
|
||||
// the following assumes no split hits
|
||||
if (o > score_un) { // paired alignment is preferred
|
||||
mem_alnreg_t *c[2];
|
||||
c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
|
||||
for (i = 0; i < 2; ++i) {
|
||||
if (c[i]->secondary >= 0)
|
||||
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
|
||||
q_se[i] = mem_approx_mapq_se(opt, c[i]);
|
||||
}
|
||||
q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;
|
||||
q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
|
||||
extra_flag |= 2;
|
||||
// cap at the tandem repeat score
|
||||
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
|
||||
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
|
||||
} else { // the unpaired alignment is preferred
|
||||
z[0] = z[1] = 0;
|
||||
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
|
||||
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
|
||||
}
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int k = a[i].a[z[i]].secondary_all;
|
||||
if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
|
||||
assert(a[i].a[k].secondary_all < 0);
|
||||
for (j = 0; j < a[i].n; ++j)
|
||||
if (a[i].a[j].secondary_all == k || j == k)
|
||||
a[i].a[j].secondary_all = z[i];
|
||||
a[i].a[z[i]].secondary_all = -1;
|
||||
}
|
||||
}
|
||||
if (!(opt->flag & MEM_F_ALL)) {
|
||||
for (i = 0; i < 2; ++i)
|
||||
XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
|
||||
} else XA[0] = XA[1] = 0;
|
||||
// write SAM
|
||||
for (i = 0; i < 2; ++i) {
|
||||
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
|
||||
h[i].mapq = q_se[i];
|
||||
h[i].flag |= 0x40<<i | extra_flag;
|
||||
h[i].XA = XA[i]? XA[i][z[i]] : 0;
|
||||
aa[i][n_aa[i]++] = h[i];
|
||||
if (n_pri[i] < a[i].n) { // the read has ALT hits
|
||||
mem_alnreg_t *p = &a[i].a[n_pri[i]];
|
||||
if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) continue;
|
||||
g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
|
||||
g[i].flag |= 0x800 | 0x40<<i | extra_flag;
|
||||
g[i].XA = XA[i]? XA[i][n_pri[i]] : 0;
|
||||
aa[i][n_aa[i]++] = g[i];
|
||||
}
|
||||
}
|
||||
for (i = 0; i < n_aa[0]; ++i)
|
||||
mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
|
||||
s[0].sam = strdup(str.s); str.l = 0;
|
||||
for (i = 0; i < n_aa[1]; ++i)
|
||||
mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
|
||||
s[1].sam = str.s;
|
||||
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
||||
// free
|
||||
for (i = 0; i < 2; ++i) {
|
||||
free(h[i].cigar); free(g[i].cigar);
|
||||
if (XA[i] == 0) continue;
|
||||
for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
|
||||
free(XA[i]);
|
||||
}
|
||||
} else goto no_pairing;
|
||||
return n;
|
||||
|
||||
no_pairing:
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int which = -1;
|
||||
if (a[i].n) {
|
||||
if (a[i].a[0].score >= opt->T) which = 0;
|
||||
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
|
||||
which = n_pri[i];
|
||||
}
|
||||
if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
|
||||
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
|
||||
}
|
||||
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
|
||||
int64_t dist;
|
||||
int d;
|
||||
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
|
||||
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
|
||||
}
|
||||
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
|
||||
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
|
||||
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
||||
free(h[0].cigar); free(h[1].cigar);
|
||||
return n;
|
||||
}
|
||||
|
|
@ -0,0 +1,784 @@
|
|||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "bwtaln.h"
|
||||
#include "kvec.h"
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "bwase.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* Counted list of bwtint_t values; this is the value type of the "b128" hash
 * table instantiated with KHASH_INIT further down in this file. */
typedef struct {
|
||||
int n; /* number of entries in a[] */
|
||||
bwtint_t *a; /* the entries themselves */
|
||||
} poslist_t;
|
||||
|
||||
/* Insert-size statistics of one batch of read pairs, filled in by infer_isize().
 * When inference fails, avg and std are set to -1.0 and low/high/high_bayesian to 0. */
typedef struct {
|
||||
double avg, std, ap_prior; /* mean and standard deviation of the insert size; ap_prior is the prior used when re-scoring mate-SW hits (presumably "abnormal pair" prior -- confirm) */
|
||||
bwtint_t low, high, high_bayesian; /* low/high: bounds of the samples used to estimate avg/std; high_bayesian: largest insert size accepted by pairing() */
|
||||
} isize_info_t;
|
||||
|
||||
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
|
||||
#define b128_hash(a) ((uint32_t)(a).x)
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
|
||||
|
||||
/* Scratch data shared by the paired-end routines while one read pair is processed. */
typedef struct {
|
||||
pair64_v arr; /* candidate hits of both ends; x is the reference position, y packs (hit index)<<2 | strand<<1 | end */
|
||||
pair64_v pos[2]; /* NOTE(review): only destroyed in the code visible here; appears to be unused otherwise */
|
||||
kvec_t(bwt_aln1_t) aln[2]; /* alignment records of end 0 and end 1 for the current pair */
|
||||
} pe_data_t;
|
||||
|
||||
#define MIN_HASH_WIDTH 1000
|
||||
|
||||
extern int g_log_n[256]; // in bwase.c
|
||||
static kh_b128_t *g_hash;
|
||||
|
||||
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
|
||||
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
|
||||
bntseq_t *bwa_open_nt(const char *prefix);
|
||||
void bwa_print_sam_SQ(const bntseq_t *bns);
|
||||
|
||||
pe_opt_t *bwa_init_pe_opt()
|
||||
{
|
||||
pe_opt_t *po;
|
||||
po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
|
||||
po->max_isize = 500;
|
||||
po->force_isize = 0;
|
||||
po->max_occ = 100000;
|
||||
po->n_multi = 3;
|
||||
po->N_multi = 10;
|
||||
po->type = BWA_PET_STD;
|
||||
po->is_sw = 1;
|
||||
po->ap_prior = 1e-5;
|
||||
return po;
|
||||
}
|
||||
/*
|
||||
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
|
||||
{
|
||||
const double a = 0.140012;
|
||||
double b, c;
|
||||
b = log(x * (2 - x));
|
||||
c = 2./M_PI/a + b / 2.;
|
||||
return sqrt(sqrt(c * c - b / a) - c);
|
||||
}
|
||||
*/
|
||||
|
||||
// for normal distribution, this is about 3std
|
||||
#define OUTLIER_BOUND 2.0
|
||||
|
||||
/*
 * Estimate the insert-size distribution from the confidently mapped pairs of one batch.
 *
 *   n_seqs    number of read pairs in seqs[0] / seqs[1]
 *   seqs      the two ends; a pair is sampled when both ends have mapQ >= 20
 *             and the outer distance is below 100000
 *   ii        output; on failure avg = std = -1.0 and low = high = high_bayesian = 0
 *   ap_prior  lower bound for ii->ap_prior; also used to place ii->high_bayesian
 *   L         length the prior is normalised by
 *
 * Returns 0 on success and -1 when no usable estimate could be made (too few
 * pairs, inconsistent bounds, or a degenerate distribution).
 */
static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
{
	uint64_t x, *isizes, n_ap = 0;
	int n, i, tot, p25, p75, p50, max_len = 1, fence, degenerate;
	double sq_sum = 0.0, skewness = 0.0, kurtosis = 0.0, y;

	ii->avg = ii->std = -1.0;
	ii->low = ii->high = ii->high_bayesian = 0;
	isizes = (uint64_t*)calloc(n_seqs, 8);
	for (i = 0, tot = 0; i != n_seqs; ++i) {
		bwa_seq_t *p[2];
		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
		if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
			// outer distance: from the leftmost start to the end of the rightmost read
			x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
			if (x < 100000) isizes[tot++] = x;
		}
		if (p[0]->len > max_len) max_len = p[0]->len;
		if (p[1]->len > max_len) max_len = p[1]->len;
	}
	if (tot < 20) {
		fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
		free(isizes);
		return -1;
	}
	ks_introsort_64(tot, isizes);
	p25 = isizes[(int)(tot*0.25 + 0.5)];
	p50 = isizes[(int)(tot*0.50 + 0.5)];
	p75 = isizes[(int)(tot*0.75 + 0.5)];
	// keep only samples inside [p25 - k*IQR, p75 + k*IQR], but never below the read length
	fence = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
	ii->low = fence > max_len? fence : max_len; // ii->low is unsigned
	ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
	if (ii->low > ii->high) {
		fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n");
		free(isizes);
		return -1;
	}
	for (i = 0, x = n = 0; i < tot; ++i)
		if (isizes[i] >= ii->low && isizes[i] <= ii->high)
			++n, x += isizes[i];
	degenerate = (n == 0); // nothing inside the bounds: no mean can be computed
	if (!degenerate) {
		ii->avg = (double)x / n;
		// The squared deviations are summed in a dedicated accumulator that starts at
		// zero; ii->std still holds the -1.0 "unset" marker at this point.
		for (i = 0; i < tot; ++i) {
			if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
				double dev = isizes[i] - ii->avg;
				double sq = dev * dev;
				sq_sum += sq;
				skewness += sq * dev;
				kurtosis += sq * sq;
			}
		}
		kurtosis = kurtosis/n / (sq_sum / n * sq_sum / n) - 3;
		ii->std = sqrt(sq_sum / n); // it would be better as n-1, but n is usually very large
		skewness = skewness / n / (ii->std * ii->std * ii->std);
		// pairing() divides by ii->std, so a zero (or NaN) spread cannot be used
		degenerate = !(ii->std > 0.0);
	}
	fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
	if (degenerate || p75 > 100000) {
		free(isizes);
		ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
		fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
		return -1;
	}
	// smallest y (in units of std) at which the normal tail probability drops below the prior
	for (y = 1.0; y < 10.0; y += 0.01)
		if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
	ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
	for (i = 0; i < tot; ++i)
		if (isizes[i] > ii->high_bayesian) ++n_ap;
	ii->ap_prior = .01 * (n_ap + .01) / tot;
	if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
	free(isizes);
	fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high);
	fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
	fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
	fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y);
	return 0;
}
|
||||
|
||||
/*
 * Choose the best properly oriented pair among the candidate hits in d->arr
 * and update the two reads accordingly.
 *
 *   p     the two ends of the pair; position, strand, edit counts, score,
 *         mapQ, seQ and extra_flag may be modified
 *   d     candidate hits (d->arr, sorted here) and the alignment records
 *         they index (d->aln[end])
 *   opt   paired-end options; only BWA_PET_STD is supported, any other type
 *         terminates the program
 *   s_mm  mismatch penalty, used to decide whether the second-best pair is poor
 *   ii    insert-size estimate; when ii->high is 0 the fixed opt->max_isize
 *         limit is used instead
 *
 * Each candidate pair gets a 64-bit key: the upper 32 bits hold ten times the
 * summed alignment score plus (when an estimate exists) an insert-size
 * penalty, the lower 32 bits a hash of the two positions. Smaller keys win.
 *
 * Returns the number of ends with mapQ > 0 whose position or strand changed.
 */
static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
{
	int i, j, o_n, subo_n, cnt_chg = 0, max_len;
	uint64_t o_score, subo_score;
	pair64_t last_pos[2][2], o_pos[2];
	max_len = p[0]->full_len;
	if (max_len < p[1]->full_len) max_len = p[1]->full_len;

	// here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
#define __pairing_aux(u,v) do { \
		bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \
		if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \
			&& ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
		{ \
			uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \
			s *= 10; \
			if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
			s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \
			if (s>>32 == o_score>>32) ++o_n; \
			else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
			else ++subo_n; \
			if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \
			else if (s < subo_score) subo_score = s; \
		} \
	} while (0)

	// move read q to hit w (if it is not already there) and flag it as properly paired
#define __pairing_aux2(q, w) do { \
		const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \
		(q)->extra_flag |= SAM_FPP; \
		if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \
			(q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \
			(q)->score = r->score; \
			(q)->pos = (w).x; \
			if ((q)->mapQ > 0) ++cnt_chg; \
		} \
	} while (0)

	o_score = subo_score = (uint64_t)-1; // (uint64_t)-1 means "no pair found yet"
	o_n = subo_n = 0;
	ks_introsort_128(d->arr.n, d->arr.a);
	for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
	if (opt->type == BWA_PET_STD) {
		// sweep the sorted hits, remembering the last two forward hits of each end
		for (i = 0; i < d->arr.n; ++i) {
			pair64_t x = d->arr.a[i];
			int strand = x.y>>1&1;
			if (strand == 1) { // reverse strand, then check
				int y = 1 - (x.y&1);
				__pairing_aux(last_pos[y][1], x);
				__pairing_aux(last_pos[y][0], x);
			} else { // forward strand, then push
				last_pos[x.y&1][0] = last_pos[x.y&1][1];
				last_pos[x.y&1][1] = x;
			}
		}
	} else {
		fprintf(stderr, "[paring] not implemented yet!\n");
		exit(1);
	}
	// set pairing
	//fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
	if (o_score != (uint64_t)-1) {
		int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
		//fprintf(stderr, "%d, %d\n", o_n, subo_n);
		if (o_n == 1) {
			if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
			else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
			else {
				int n = subo_n > 255? 255 : subo_n;
				// the score difference is non-negative and fits in an int; narrow it
				// before subtracting so that the result can go below zero portably
				mapQ_p = (int)(((subo_score>>32) - (o_score>>32)) / 2) - g_log_n[n];
				if (mapQ_p < 0) mapQ_p = 0;
			}
		}
		if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved
			if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
				int mapQ = p[0]->mapQ + p[1]->mapQ;
				if (mapQ > 60) mapQ = 60;
				p[0]->mapQ = p[1]->mapQ = mapQ;
			} else {
				if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
				if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
			}
		} else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved
			p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
			if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
		} else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved
			p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
			if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
		} else { // both ends moved
			p[0]->seQ = p[1]->seQ = 0;
			mapQ_p -= 20;
			if (mapQ_p < 0) mapQ_p = 0;
			p[0]->mapQ = p[1]->mapQ = mapQ_p;
		}
		__pairing_aux2(p[0], o_pos[0]);
		__pairing_aux2(p[1], o_pos[1]);
	}
	return cnt_chg;
}
|
||||
|
||||
/* Per-read copy of the alignment records read from the .sai stream; bwa_cal_pac_pos_pe()
 * fills one per read in its first pass and restores from it in the second. */
typedef struct {
|
||||
kvec_t(bwt_aln1_t) aln; /* saved alignment records of one read */
|
||||
} aln_buf_t;
|
||||
|
||||
int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
|
||||
const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
|
||||
{
|
||||
int i, j, cnt_chg = 0;
|
||||
char str[1024];
|
||||
bwt_t *bwt;
|
||||
pe_data_t *d;
|
||||
aln_buf_t *buf[2];
|
||||
|
||||
d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
|
||||
buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
||||
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
||||
|
||||
if (_bwt == 0) { // load forward SA
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
} else bwt = _bwt;
|
||||
|
||||
// SE
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *p[2];
|
||||
for (j = 0; j < 2; ++j) {
|
||||
int n_aln;
|
||||
p[j] = seqs[j] + i;
|
||||
p[j]->n_multi = 0;
|
||||
p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
|
||||
err_fread_noeof(&n_aln, 4, 1, fp_sa[j]);
|
||||
if (n_aln > kv_max(d->aln[j]))
|
||||
kv_resize(bwt_aln1_t, d->aln[j], n_aln);
|
||||
d->aln[j].n = n_aln;
|
||||
err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
|
||||
kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
|
||||
// generate SE alignment and mapping quality
|
||||
bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
|
||||
if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
|
||||
int strand;
|
||||
int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
|
||||
p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
|
||||
p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand);
|
||||
p[j]->strand = strand;
|
||||
if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// infer isize
|
||||
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2);
|
||||
if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
|
||||
if (opt->force_isize) {
|
||||
fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
|
||||
ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
|
||||
}
|
||||
|
||||
// PE
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *p[2];
|
||||
for (j = 0; j < 2; ++j) {
|
||||
p[j] = seqs[j] + i;
|
||||
kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
|
||||
}
|
||||
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
|
||||
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
|
||||
{ // only when both ends mapped
|
||||
pair64_t x;
|
||||
int j, k;
|
||||
long long n_occ[2];
|
||||
for (j = 0; j < 2; ++j) {
|
||||
n_occ[j] = 0;
|
||||
for (k = 0; k < d->aln[j].n; ++k)
|
||||
n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
|
||||
}
|
||||
if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
|
||||
d->arr.n = 0;
|
||||
for (j = 0; j < 2; ++j) {
|
||||
for (k = 0; k < d->aln[j].n; ++k) {
|
||||
bwt_aln1_t *r = d->aln[j].a + k;
|
||||
bwtint_t l;
|
||||
if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
|
||||
pair64_t key;
|
||||
int ret;
|
||||
key.x = r->k; key.y = r->l;
|
||||
khint_t iter = kh_put(b128, g_hash, key, &ret);
|
||||
if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
|
||||
poslist_t *z = &kh_val(g_hash, iter);
|
||||
z->n = r->l - r->k + 1;
|
||||
z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
|
||||
for (l = r->k; l <= r->l; ++l) {
|
||||
int strand;
|
||||
z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1;
|
||||
z->a[l - r->k] |= strand;
|
||||
}
|
||||
}
|
||||
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
|
||||
x.x = kh_val(g_hash, iter).a[l]>>1;
|
||||
x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
|
||||
kv_push(pair64_t, d->arr, x);
|
||||
}
|
||||
} else { // then calculate on the fly
|
||||
for (l = r->k; l <= r->l; ++l) {
|
||||
int strand;
|
||||
x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand);
|
||||
x.y = k<<2 | strand<<1 | j;
|
||||
kv_push(pair64_t, d->arr, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
|
||||
}
|
||||
|
||||
if (opt->N_multi || opt->n_multi) {
|
||||
for (j = 0; j < 2; ++j) {
|
||||
if (p[j]->type != BWA_TYPE_NO_MATCH) {
|
||||
int k, n_multi;
|
||||
if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
|
||||
bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
|
||||
} else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
|
||||
for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) {
|
||||
int strand;
|
||||
bwt_multi1_t *q = p[j]->multi + k;
|
||||
q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand);
|
||||
q->strand = strand;
|
||||
if (q->pos != p[j]->pos && q->pos != (bwtint_t)-1)
|
||||
p[j]->multi[n_multi++] = *q;
|
||||
}
|
||||
p[j]->n_multi = n_multi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
kv_destroy(buf[0][i].aln);
|
||||
kv_destroy(buf[1][i].aln);
|
||||
}
|
||||
free(buf[0]); free(buf[1]);
|
||||
if (_bwt == 0) bwt_destroy(bwt);
|
||||
kv_destroy(d->arr);
|
||||
kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
|
||||
kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
|
||||
free(d);
|
||||
return cnt_chg;
|
||||
}
|
||||
|
||||
#define SW_MIN_MATCH_LEN 20
|
||||
#define SW_MIN_MAPQ 17
|
||||
|
||||
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
|
||||
/*
 * Smith-Waterman alignment of a read against the reference window
 * [*beg, *beg + reglen).
 *
 *   l_pac, pacseq  length of the packed reference and the 2-bit packed bases
 *   len, seq       the read (codes >= 4 are treated as ambiguous bases)
 *   beg            in: start of the window; out: start of the alignment (on success)
 *   reglen         window length
 *   n_cigar        out: number of CIGAR operations (set to 0 when a computed alignment is rejected)
 *   _cnt           out: n_mm<<16 | n_gapo<<8 | n_gape (written on success only)
 *
 * Returns a newly allocated CIGAR, soft-clipped (op 3) at the unaligned read
 * ends, which the caller must free; returns 0 when the window or read is
 * unsuitable or the hit is weak, tandem-like or inconsistent.
 */
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
{
	int8_t scmat[25];
	kswr_t hit;
	uint32_t *raw_cigar = 0;
	bwa_cigar_t *cigar = 0;
	ubyte_t *ref;
	bwtint_t i, j, n_ambig, ref_len, ref_span, qry_span, ref_off, qry_off;
	int flags, global_score, clip_head, qry_end;
	int n_mm = 0, n_gapo = 0, n_gape = 0;

	bwa_fill_scmat(1, 3, scmat);

	/* the window must be long enough and the read must fit before the end of the reference */
	if (reglen < SW_MIN_MATCH_LEN) return 0;
	if ((int64_t)l_pac - *beg < len) return 0;

	/* give up on reads with too many ambiguous bases */
	n_ambig = 0;
	for (i = 0; i < len; ++i)
		n_ambig += (seq[i] >= 4);
	if ((float)n_ambig/len >= 0.25 || len - n_ambig < SW_MIN_MATCH_LEN) return 0;

	/* unpack the reference window, stopping at the end of the reference */
	ref = (ubyte_t*)calloc(reglen, 1);
	ref_len = 0;
	for (i = *beg; ref_len < reglen && i < l_pac; ++i, ++ref_len)
		ref[ref_len] = pacseq[i>>2] >> ((~i&3)<<1) & 3;

	/* local alignment to locate the hit, then a global alignment of the hit to get its CIGAR */
	flags = KSW_XSUBO | KSW_XSTART;
	if (len < 250) flags |= KSW_XBYTE;
	hit = ksw_align(len, (uint8_t*)seq, ref_len, ref, 5, scmat, 5, 1, flags, 0);
	global_score = ksw_global(hit.qe - hit.qb + 1, &seq[hit.qb], hit.te - hit.tb + 1, &ref[hit.tb], 5, scmat, 5, 1, 50, n_cigar, &raw_cigar);
	cigar = (bwa_cigar_t*)raw_cigar;
	/* re-encode the operations in place in this file's CIGAR format */
	for (i = 0; i < *n_cigar; ++i)
		cigar[i] = __cigar_create((raw_cigar[i]&0xf), (raw_cigar[i]>>4));

	/* poor hit or tandem hits or weird alignment */
	if (hit.score < SW_MIN_MATCH_LEN || hit.score2 == hit.score || global_score != hit.score)
		goto reject;

	/* both the reference span and the read span must be long enough */
	ref_span = qry_span = 0;
	for (i = 0; i < *n_cigar; ++i) {
		bwa_cigar_t op = cigar[i];
		if (__cigar_op(op) == FROM_M) ref_span += __cigar_len(op), qry_span += __cigar_len(op);
		else if (__cigar_op(op) == FROM_D) ref_span += __cigar_len(op);
		else qry_span += __cigar_len(op);
	}
	if (ref_span < SW_MIN_MATCH_LEN || qry_span < SW_MIN_MATCH_LEN)
		goto reject;

	/* report the alignment start and add clips for the unaligned read ends */
	clip_head = hit.qb;
	qry_end = hit.qe + 1;
	*beg += hit.tb;
	cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
	if (clip_head) {
		memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
		cigar[0] = __cigar_create(3, clip_head);
		++(*n_cigar);
	}
	if (qry_end < len) {
		cigar[*n_cigar] = __cigar_create(3, (len - qry_end));
		++(*n_cigar);
	}

	/* count mismatches, gap opens and gap extensions along the alignment */
	ref_off = hit.tb;
	qry_off = hit.qb;
	for (i = 0; i < *n_cigar; ++i) {
		bwa_cigar_t op = cigar[i];
		if (__cigar_op(op) == FROM_M) {
			for (j = 0; j < (__cigar_len(op)); ++j)
				if (ref[ref_off+j] < 4 && seq[qry_off+j] < 4 && ref[ref_off+j] != seq[qry_off+j]) ++n_mm;
			ref_off += __cigar_len(op), qry_off += __cigar_len(op);
		} else if (__cigar_op(op) == FROM_D) {
			ref_off += __cigar_len(op), ++n_gapo, n_gape += (__cigar_len(op)) - 1;
		} else if (__cigar_op(op) == FROM_I) {
			qry_off += __cigar_len(op), ++n_gapo, n_gape += (__cigar_len(op)) - 1;
		}
	}
	*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;

	free(ref);
	return cigar;

reject:
	free(cigar); free(ref);
	*n_cigar = 0;
	return 0;
}
|
||||
|
||||
/*
 * Mate rescue: for every pair that is not yet flagged as properly paired and
 * has at least one end with mapQ >= SW_MIN_MAPQ, try to place each end by
 * Smith-Waterman in the window implied by the other end and the insert-size
 * estimate, and accept the new placement when it scores better.
 *
 *   bns      reference metadata
 *   _pacseq  preloaded packed reference, or NULL to load it from bns->fp_pac
 *   n_seqs   number of pairs in seqs[0] / seqs[1]
 *   seqs     the reads of both ends; updated in place
 *   popt     paired-end options (nothing is aligned unless popt->is_sw is set)
 *   ii       insert-size estimate (nothing is aligned when ii->avg < 0)
 *
 * Returns the packed reference: _pacseq itself when it was supplied, otherwise
 * a buffer allocated here that the caller must free.
 */
ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
{
	ubyte_t *pacseq;
	int i;
	uint64_t n_tot[2], n_mapped[2];

	/* In the following, _pref points to the reference read
	 * which must be aligned; _pmate points to its mate which is
	 * considered to be modified. */

	// search window to the right of a forward-strand _pref
#define __set_rght_coor(_a, _b, _pref, _pmate) do {						\
		(_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
		(_b) = (_a) + 6 * ii->std + 2 * _pmate->len;					\
		if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
		if ((_b) > bns->l_pac) (_b) = bns->l_pac;						\
	} while (0)

	// search window to the left of a reverse-strand _pref
#define __set_left_coor(_a, _b, _pref, _pmate) do {						\
		(_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
		(_b) = (_a) + 6 * ii->std + 2 * _pmate->len;					\
		if ((_a) < 0) (_a) = 0;											\
		if ((_b) > _pref->pos) (_b) = _pref->pos;						\
	} while (0)

	// record the rescued placement of _pmate and flag both ends as properly paired
#define __set_fixed(_pref, _pmate, _beg, _cnt) do {						\
		_pmate->type = BWA_TYPE_MATESW;									\
		_pmate->pos = _beg;												\
		_pmate->seQ = _pref->seQ;										\
		_pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
		_pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
		_pmate->extra_flag |= SAM_FPP;									\
		_pref->extra_flag |= SAM_FPP;									\
	} while (0)

	// load reference sequence
	if (_pacseq == 0) {
		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
		err_rewind(bns->fp_pac);
		err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
	} else pacseq = (ubyte_t*)_pacseq;
	if (!popt->is_sw || ii->avg < 0.0) return pacseq;

	// perform mate alignment
	n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p[2];
		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
		if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
			int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
			int64_t beg[2], end[2];
			bwa_cigar_t *cigar[2];
			uint32_t cnt[2];

			mq_adjust[0] = mq_adjust[1] = 255; // not effective
			is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;

			++n_tot[is_singleton];
			cigar[0] = cigar[1] = 0;
			n_cigar[0] = n_cigar[1] = 0;
			if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
			for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
				ubyte_t *seq;
				if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
				{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
					if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
						__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
						seq = p[k]->rseq;
					} else { // then the mate is on forward stand and has smaller coordinate
						__set_left_coor(beg[k], end[k], p[1-k], p[k]);
						seq = p[k]->seq;
						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
					}
				}
				// perform SW alignment
				cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
				if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
					int s_old, clip = 0, s_new;
					if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
					if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
					s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
					s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
					s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
					// Phred-scaled tail probability at 1.5 sigma; the rounding term belongs
					// outside log(), exactly as in the insert-size penalty used by pairing()
					s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5)) + .499); // assume the mapped isize is 1.5\sigma
					if (s_old < s_new) { // reject SW alignment
						mq_adjust[k] = s_new - s_old;
						free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
					} else mq_adjust[k] = s_old - s_new;
				}
				// now reverse the sequence back such that p[*]->seq looks untouched; only the
				// BWA_PET_STD case can reach this point, where ->seq was reversed for strand 1
				if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
			}
			k = -1; // no read to be changed
			if (cigar[0] && cigar[1]) {
				k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
				mapQ = abs(p[1]->mapQ - p[0]->mapQ);
			} else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
			else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
			if (k >= 0 && p[k]->pos != beg[k]) {
				++n_mapped[is_singleton];
				{ // recalculate mapping quality
					int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
					if (tmp <= 0) tmp = 1;
					if (mapQ > tmp) mapQ = tmp;
					p[k]->mapQ = p[1-k]->mapQ = mapQ;
					p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
					if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
					if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
				}
				// update CIGAR
				free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
				p[k]->n_cigar = n_cigar[k];
				// update the rest of information
				__set_fixed(p[1-k], p[k], beg[k], cnt[k]);
			}
			free(cigar[0]); free(cigar[1]);
		}
	}
	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
			(long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
			(long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
	return pacseq;
}
|
||||
|
||||
/**
 * Convert two .sai files and their read files into paired-end SAM on stdout.
 *
 * The gap options stored in each .sai header decide how the matching read
 * file is opened and parsed. Reads are handled in batches of up to 0x40000
 * per end: pairing, mate alignment, gapped refinement, then printing.
 * Progress and timing messages go to stderr.
 *
 * @param prefix   index prefix handed to bns_restore(); ".bwt" and ".sa" are
 *                 appended to it when popt->is_preload is set
 * @param fn_sa    the two .sai files, one per read end
 * @param fn_fa    the two read files, in the same order as fn_sa
 * @param popt     paired-end options; is_preload is read here, the whole
 *                 structure is forwarded to the pairing routines
 * @param rg_line  read-group line forwarded to bwa_print_sam_hdr()
 *
 * A wrong SAI magic calls exit(1). An over-long prefix, read files with
 * different read counts, and differently named mates are reported through
 * err_fatal().
 */
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
{
	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
	int i, j, n_seqs;
	long long tot_seqs = 0;
	bwa_seq_t *seqs[2];
	bwa_seqio_t *ks[2];
	clock_t t;
	bntseq_t *bns;
	FILE *fp_sa[2];
	gap_opt_t opt, opt0;
	khint_t iter;
	isize_info_t last_ii; // this is for the last batch of reads
	char str[1024], magic[2][4];
	bwt_t *bwt;
	uint8_t *pac;

	// initialization
	bwase_initialize(); // fills g_log_n[] in bwase.c; no need to repeat the loop here
	pac = 0; bwt = 0;
	bns = bns_restore(prefix);
	srand48(bns->seed);
	fp_sa[0] = xopen(fn_sa[0], "r");
	fp_sa[1] = xopen(fn_sa[1], "r");
	g_hash = kh_init(b128);
	last_ii.avg = -1.0;

	err_fread_noeof(magic[0], 1, 4, fp_sa[0]);
	err_fread_noeof(magic[1], 1, 4, fp_sa[1]);
	if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) {
		fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
		exit(1);
	}
	err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
	ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
	opt0 = opt; // keep the first file's options: opt is overwritten just below
	err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
	ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
	if (popt->is_preload) { // for Illumina alignment only
		// ".bwt" is the longer suffix, so ".sa" fits whenever ".bwt" does
		int n = snprintf(str, sizeof(str), "%s.bwt", prefix);
		if (n < 0 || (size_t)n >= sizeof(str))
			err_fatal(__func__, "index prefix is too long: \"%s\"\n", prefix);
		bwt = bwt_restore_bwt(str);
		snprintf(str, sizeof(str), "%s.sa", prefix);
		bwt_restore_sa(str, bwt);
		pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
		err_rewind(bns->fp_pac);
		err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
	}

	// core loop
	bwa_print_sam_hdr(bns, rg_line);
	while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
		int cnt_chg, n_seqs2 = 0;
		isize_info_t ii;
		ubyte_t *pacseq;

		seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs2, opt.mode, opt.trim_qual);
		// Everything below indexes both arrays with one counter, so the
		// batches must have identical sizes.
		if (seqs[1] == 0 || n_seqs2 != n_seqs)
			err_fatal(__func__, "the two read files contain different numbers of reads\n");
		tot_seqs += n_seqs;
		t = clock();

		fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
		cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
		fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
		fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);

		fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
		pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
		fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

		fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
		for (j = 0; j < 2; ++j)
			bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
		if (pac == 0) free(pacseq); // without a preloaded pac the buffer belongs to this batch

		fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *p[2];
			p[0] = seqs[0] + i; p[1] = seqs[1] + i;
			if (p[0]->bc[0] || p[1]->bc[0]) { // both ends carry the concatenated barcode
				strcat(p[0]->bc, p[1]->bc);
				strcpy(p[1]->bc, p[0]->bc);
			}
			bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
			bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
			if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name);
		}
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

		for (j = 0; j < 2; ++j)
			bwa_free_read_seq(n_seqs, seqs[j]);
		fprintf(stderr, "[bwa_sai2sam_pe_core] %lld sequences have been processed.\n", tot_seqs);
		last_ii = ii;
	}

	// destroy
	bns_destroy(bns);
	for (i = 0; i < 2; ++i) {
		bwa_seq_close(ks[i]);
		err_fclose(fp_sa[i]);
	}
	for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
		if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
	kh_destroy(b128, g_hash);
	if (pac) {
		free(pac); bwt_destroy(bwt);
	}
}
|
||||
|
||||
int bwa_sai2sam_pe(int argc, char *argv[])
|
||||
{
|
||||
int c;
|
||||
pe_opt_t *popt;
|
||||
char *prefix, *rg_line = 0;
|
||||
|
||||
popt = bwa_init_pe_opt();
|
||||
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'r':
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
|
||||
break;
|
||||
case 'a': popt->max_isize = atoi(optarg); break;
|
||||
case 'o': popt->max_occ = atoi(optarg); break;
|
||||
case 's': popt->is_sw = 0; break;
|
||||
case 'P': popt->is_preload = 1; break;
|
||||
case 'n': popt->n_multi = atoi(optarg); break;
|
||||
case 'N': popt->N_multi = atoi(optarg); break;
|
||||
case 'c': popt->ap_prior = atof(optarg); break;
|
||||
case 'f': xreopen(optarg, "w", stdout); break;
|
||||
case 'A': popt->force_isize = 1; break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (optind + 5 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
|
||||
fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
|
||||
fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
|
||||
fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
|
||||
fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
|
||||
fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
|
||||
fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
|
||||
fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
|
||||
fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
|
||||
fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
|
||||
fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
|
||||
fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
|
||||
fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
|
||||
free(prefix); free(popt);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,606 @@
|
|||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
#include "bwase.h"
|
||||
#include "bwtaln.h"
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int g_log_n[256];
|
||||
|
||||
/**
 * Summarise the SA intervals found for one read into the read record.
 *
 * @param n_aln     number of entries in aln; 0 marks the read as
 *                  BWA_TYPE_NO_MATCH with c1 = c2 = 0
 * @param aln       SA intervals [k, l] with their scores and edit counts
 * @param s         read record to fill
 * @param set_main  when non-zero, draw the primary hit at random among the
 *                  intervals scoring no worse than aln[0], and set
 *                  c1 / c2 / type
 * @param n_multi   when non-zero, also store the individual occurrences in
 *                  s->multi, unless there are more than n_multi + 1 of them
 */
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
	if (n_aln == 0) {
		s->type = BWA_TYPE_NO_MATCH;
		s->c1 = s->c2 = 0;
		return;
	}

	if (set_main) {
		int idx = 0, seen = 0;
		const int top_score = aln[0].score;
		while (idx < n_aln) {
			const bwt_aln1_t *hit = aln + idx;
			if (hit->score > top_score) break;
			// accept this interval with probability proportional to its share
			// of the occurrences seen so far, then pick one row inside it
			if (drand48() * (hit->l - hit->k + 1 + seen) > (double)seen) {
				s->n_mm = hit->n_mm; s->n_gapo = hit->n_gapo; s->n_gape = hit->n_gape;
				s->ref_shift = (int)hit->n_del - (int)hit->n_ins;
				s->score = hit->score;
				s->sa = hit->k + (bwtint_t)((hit->l - hit->k + 1) * drand48());
			}
			seen += hit->l - hit->k + 1;
			++idx;
		}
		s->c1 = seen; // occurrences among the top-scoring intervals
		while (idx < n_aln) {
			seen += aln[idx].l - aln[idx].k + 1;
			++idx;
		}
		s->c2 = seen - s->c1; // occurrences among the remaining intervals
		s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
	}

	if (n_multi) {
		int k, rest, n_occ = 0, z = 0;
		for (k = 0; k < n_aln; ++k)
			n_occ += aln[k].l - aln[k].k + 1;
		free(s->multi); // free(NULL) is a no-op
		if (n_occ > n_multi + 1) { // too many hits: report none of them
			s->multi = 0; s->n_multi = 0;
			return;
		}
		/* More general than needed: given the test above every hit fits,
		 * but the code below can also sample "rest" hits at random. */
		rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // one extra slot for ->sa
		s->multi = calloc(rest, sizeof(bwt_multi1_t));
		for (k = 0; k < n_aln; ++k) {
			const bwt_aln1_t *q = aln + k;
			if (q->l - q->k + 1 <= rest) {
				bwtint_t row;
				for (row = q->k; row <= q->l; ++row) {
					s->multi[z].pos = row;
					s->multi[z].gap = q->n_gapo + q->n_gape;
					s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins;
					s->multi[z++].mm = q->n_mm;
				}
				rest -= q->l - q->k + 1;
			} else { // random sampling (http://code.activestate.com/recipes/272884/); not reached in practice
				int want, left;
				for (want = rest, left = q->l - q->k + 1; want > 0; --want) {
					double p = 1.0, x = drand48();
					while (x < p) p -= p * want / (left--);
					s->multi[z].pos = q->l - left;
					s->multi[z].gap = q->n_gapo + q->n_gape;
					s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins;
					s->multi[z++].mm = q->n_mm;
				}
				rest = 0;
				break;
			}
		}
		s->n_multi = z;
	}
}
|
||||
|
||||
/**
 * Convenience wrapper around bwa_aln2seq_core(): choose the primary hit for
 * read s and do not collect any alternative hits.
 */
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
	const int set_main = 1; // always select the primary hit
	const int n_multi = 0;  // never fill s->multi
	bwa_aln2seq_core(n_aln, aln, s, set_main, n_multi);
}
|
||||
|
||||
int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
||||
{
|
||||
int n;
|
||||
if (p->c1 == 0) return 23;
|
||||
if (p->c1 > 1) return 0;
|
||||
if (p->n_mm == mm) return 25;
|
||||
if (p->c2 == 0) return 37;
|
||||
n = (p->c2 >= 255)? 255 : p->c2;
|
||||
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
|
||||
}
|
||||
|
||||
/**
 * Translate a suffix-array index into a forward-strand reference position.
 *
 * @param bns      reference annotation; l_pac is used as the boundary
 * @param bwt      index with the suffix array loaded
 * @param sapos    suffix-array index of the hit
 * @param ref_len  number of reference bases the hit spans
 * @param strand   output; set to 0 first, then to !is_rev as reported by
 *                 bns_depos() unless the hit is rejected
 * @return position of the first base, or (bwtint_t)-1 when the hit starts
 *         before l_pac and ends after it
 */
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand)
{
	int is_rev;
	bwtint_t pos_f;

	*strand = 0; // defined even on the early return below
	pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate
	if (pos_f < bns->l_pac) {
		if (bns->l_pac < pos_f + ref_len) return (bwtint_t)-1;
	}
	pos_f = bns_depos(bns, pos_f, &is_rev); // forward-strand position: first or last base
	*strand = !is_rev;
	if (is_rev) { // move from the last base back to the first, clamping at 0
		if (pos_f + 1 < ref_len) pos_f = 0;
		else pos_f = pos_f - ref_len + 1;
	}
	return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset
}
|
||||
|
||||
/**
|
||||
* Derive the actual position in the read from the given suffix array
|
||||
* coordinates. Note that the position will be approximate based on
|
||||
* whether indels appear in the read and whether calculations are
|
||||
* performed from the start or end of the read.
|
||||
*/
|
||||
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
|
||||
{
|
||||
int max_diff, strand;
|
||||
if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
|
||||
max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
//fprintf(stderr, "%d\n", seq->ref_shift);
|
||||
seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand);
|
||||
seq->strand = strand;
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH;
|
||||
}
|
||||
|
||||
/**
 * Convert the SA coordinates of a batch of reads into reference positions.
 *
 * Loads "<prefix>.bwt" and "<prefix>.sa", converts the primary hit of every
 * read with bwa_cal_pac_pos_core() and every entry of its multi[] list with
 * bwa_sa2pos(), then releases the index. Alternative hits that are rejected
 * ((bwtint_t)-1) or that land on the primary position are dropped from
 * multi[].
 *
 * @param prefix  index prefix; reported through err_fatal() when the
 *                resulting file name does not fit the local buffer
 * @param max_mm  forwarded to bwa_cal_pac_pos_core()
 * @param fnr     forwarded to bwa_cal_pac_pos_core()
 */
void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
{
	int i, j, strand, n_multi, n_written;
	char str[1024];
	bwt_t *bwt;
	// load forward SA; ".bwt" is the longer suffix, so ".sa" fits whenever ".bwt" does
	n_written = snprintf(str, sizeof(str), "%s.bwt", prefix);
	if (n_written < 0 || (size_t)n_written >= sizeof(str))
		err_fatal(__func__, "index prefix is too long: \"%s\"\n", prefix);
	bwt = bwt_restore_bwt(str);
	snprintf(str, sizeof(str), "%s.sa", prefix);
	bwt_restore_sa(str, bwt);
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = &seqs[i];
		bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr);
		for (j = n_multi = 0; j < p->n_multi; ++j) {
			bwt_multi1_t *q = p->multi + j;
			q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand);
			q->strand = strand;
			// compact in place: keep valid hits that differ from the primary one
			if (q->pos != p->pos && q->pos != (bwtint_t)-1)
				p->multi[n_multi++] = *q;
		}
		p->n_multi = n_multi;
	}
	bwt_destroy(bwt);
}
|
||||
|
||||
#define SW_BW 50
|
||||
|
||||
/**
 * Re-align a read against its reference window and return the CIGAR.
 *
 * The window [*_rb, *_rb + len + ref_shift) is fetched with bns_get_seq()
 * and aligned with ksw_global() (scoring from bwa_fill_scmat(1, 3, mat),
 * gap open 5, gap extend 1, band of at least SW_BW). A leading or trailing
 * insertion becomes a soft clip (op 3); a leading or trailing deletion is
 * removed, and a leading one also advances *_rb by its length.
 *
 * @param _rb      in: start of the reference window; out: possibly advanced
 * @param n_cigar  out: number of CIGAR operations returned
 * @return the CIGAR, stored in the buffer ksw_global() handed back and
 *         converted in place to bwa_cigar_t
 */
bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar)
{
	int8_t mat[25];
	uint32_t *cigar32 = 0;
	bwa_cigar_t *cigar = 0;
	ubyte_t *rseq;
	int64_t rb, re, rlen, idx;
	int band, last;

	bwa_fill_scmat(1, 3, mat);
	rb = *_rb;
	re = rb + len + ref_shift;
	assert(re <= l_pac);
	rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
	assert(re - rb == rlen);

	band = abs((int)rlen - len) * 1.5;
	if (SW_BW > band) band = SW_BW;
	ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, band, n_cigar, &cigar32);
	assert(*n_cigar > 0);

	last = *n_cigar - 1;
	if ((cigar32[last] & 0xf) == 1) // trailing insertion -> soft clip
		cigar32[last] = (cigar32[last]>>4<<4) | 3;
	if ((cigar32[0] & 0xf) == 1) // leading insertion -> soft clip
		cigar32[0] = (cigar32[0]>>4<<4) | 3;
	if ((cigar32[last] & 0xf) == 2) // drop a trailing deletion
		--*n_cigar;
	if ((cigar32[0] & 0xf) == 2) { // drop a leading deletion and shift the start
		*_rb += cigar32[0]>>4;
		--*n_cigar;
		memmove(cigar32, cigar32+1, (*n_cigar) * 4);
	}

	// repack op/len into bwa_cigar_t form, reusing the same buffer
	cigar = (bwa_cigar_t*)cigar32;
	for (idx = 0; idx < *n_cigar; ++idx)
		cigar[idx] = __cigar_create((cigar32[idx]&0xf), (cigar32[idx]>>4));
	free(rseq);
	return cigar;
}
|
||||
|
||||
/**
 * Build the MD string and the edit count of one alignment.
 *
 * Walks the alignment (the CIGAR when given, otherwise len ungapped bases)
 * and compares the read with the 2-bit packed reference, never reading at
 * or beyond l_pac.
 *
 * @param str  scratch buffer, reset on entry; the MD text is built in it
 * @param _nm  out: mismatches plus inserted and deleted bases
 * @return a strdup() copy of the MD text
 */
char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
		bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
{
	bwtint_t ref = pos, qry = 0;
	int off, base, run = 0, n_diff = 0;

	str->l = 0; // reset
	if (cigar == 0) { // no gaps
		for (off = 0; off < (bwtint_t)len && ref+off < l_pac; ++off) {
			base = pacseq[(ref+off)>>2] >> ((~(ref+off)&3)<<1) & 3;
			if (base > 3 || seq[qry+off] > 3 || base != seq[qry+off]) {
				// flush the run of matches, then name the reference base
				ksprintf(str, "%d", run);
				kputc("ACGTN"[base], str);
				++n_diff;
				run = 0;
			} else ++run;
		}
	} else {
		int k;
		for (k = 0; k < n_cigar; ++k) {
			const int op = __cigar_op(cigar[k]);
			const int op_len = __cigar_len(cigar[k]);
			if (op == FROM_M) {
				for (off = 0; off < op_len && ref+off < l_pac; ++off) {
					base = pacseq[(ref+off)>>2] >> ((~(ref+off)&3)<<1) & 3;
					if (base > 3 || seq[qry+off] > 3 || base != seq[qry+off]) {
						ksprintf(str, "%d", run);
						kputc("ACGTN"[base], str);
						++n_diff;
						run = 0;
					} else ++run;
				}
				ref += op_len; qry += op_len;
			} else if (op == FROM_I || op == FROM_S) {
				// read-only bases: skipped in the read; insertions count as edits
				qry += op_len;
				if (op == FROM_I) n_diff += op_len;
			} else if (op == FROM_D) {
				// deleted reference bases are spelled out after '^'
				ksprintf(str, "%d", run);
				kputc('^', str);
				for (off = 0; off < op_len && ref+off < l_pac; ++off)
					kputc("ACGT"[pacseq[(ref+off)>>2] >> ((~(ref+off)&3)<<1) & 3], str);
				run = 0;
				ref += op_len; n_diff += op_len;
			}
		}
	}
	ksprintf(str, "%d", run); // trailing run of matches
	*_nm = n_diff;
	return strdup(str->s);
}
|
||||
|
||||
/**
 * Account for trimmed bases in the CIGAR of a read.
 *
 * When len differs from full_len, the full_len - len missing bases are
 * added as a clip operation (op 3): at the end of the CIGAR when strand is
 * 0, at the front otherwise. An existing clip at that end is lengthened; a
 * read without a CIGAR gets a two-operation one (op 0 of len bases plus the
 * clip). len is set to full_len afterwards.
 */
void bwa_correct_trimmed(bwa_seq_t *s)
{
	const int clip_first = (s->strand != 0); // non-zero strand: the clip goes in front
	int edge;

	if (s->len == s->full_len) return;

	if (s->cigar == 0) { // build "<len>M" plus the clip from scratch
		s->n_cigar = 2;
		s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
		s->cigar[clip_first? 1 : 0] = __cigar_create(0, s->len);
		s->cigar[clip_first? 0 : 1] = __cigar_create(3, (s->full_len - s->len));
	} else {
		edge = clip_first? 0 : s->n_cigar - 1;
		if (__cigar_op(s->cigar[edge]) == FROM_S) { // extend the existing clip
			s->cigar[edge] += s->full_len - s->len;
		} else { // grow the array by one and place the clip at the edge
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			if (clip_first)
				memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
			edge = clip_first? 0 : s->n_cigar - 1;
			s->cigar[edge] = __cigar_create(3, (s->full_len - s->len));
		}
	}
	s->len = s->full_len;
}
|
||||
|
||||
/**
 * Produce CIGAR, MD and NM for a batch of reads.
 *
 * For every read: s->seq is reversed in place, each gapped alternative hit
 * gets a CIGAR (hits for which none is produced are dropped from multi[]),
 * and a gapped primary hit gets its own CIGAR. Afterwards MD/NM are
 * computed for every read that is not BWA_TYPE_NO_MATCH, and the CIGARs are
 * corrected for trimmed bases.
 *
 * @param _pacseq  packed reference; when NULL it is read from bns->fp_pac
 *                 for the duration of the call and freed before returning
 */
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
{
	ubyte_t *pacseq = _pacseq;
	kstring_t md_buf;
	int i;

	if (pacseq == 0) {
		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
		err_rewind(bns->fp_pac);
		err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
	}

	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = seqs + i;
		int j, kept = 0;
		seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
		for (j = 0; j < s->n_multi; ++j) {
			bwt_multi1_t *q = s->multi + j;
			if (q->gap) { // gapped alignment
				int n_cigar;
				q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar);
				q->n_cigar = n_cigar;
				if (q->cigar == 0) continue; // CIGAR generation failed: squeeze the hit out
			}
			s->multi[kept++] = *q;
		}
		s->n_multi = kept;

		if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
		s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar);
		if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH;
	}

	// generate MD tag; one zeroed scratch string is shared by all reads
	memset(&md_buf, 0, sizeof(md_buf));
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = seqs + i;
		int nm;
		if (s->type == BWA_TYPE_NO_MATCH) continue;
		s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, &md_buf, &nm);
		s->nm = nm;
	}
	free(md_buf.s);

	// correct for trimmed reads
	for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);

	if (!_pacseq) free(pacseq);
}
|
||||
|
||||
int64_t pos_end(const bwa_seq_t *p)
|
||||
{
|
||||
if (p->cigar) {
|
||||
int j;
|
||||
int64_t x = p->pos;
|
||||
for (j = 0; j != p->n_cigar; ++j) {
|
||||
int op = __cigar_op(p->cigar[j]);
|
||||
if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
||||
}
|
||||
return x;
|
||||
} else return p->pos + p->len;
|
||||
}
|
||||
|
||||
int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
|
||||
{
|
||||
if (p->cigar) {
|
||||
int j;
|
||||
int64_t x = p->pos;
|
||||
for (j = 0; j != p->n_cigar; ++j) {
|
||||
int op = __cigar_op(p->cigar[j]);
|
||||
if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
||||
}
|
||||
return x;
|
||||
} else return p->pos + len;
|
||||
}
|
||||
|
||||
static int64_t pos_5(const bwa_seq_t *p)
|
||||
{
|
||||
if (p->type != BWA_TYPE_NO_MATCH)
|
||||
return p->strand? pos_end(p) : p->pos;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
 * Write the full_len bases of a read to stream as letters.
 *
 * With strand == 0 the bases are written in stored order using "ACGTN";
 * otherwise they are written from last to first using "TGCAN". Output is
 * assembled in a 4096-byte buffer and flushed one chunk at a time.
 */
void bwa_print_seq(FILE *stream, bwa_seq_t *seq) {
	char buffer[4096];
	const int bsz = sizeof(buffer);
	int start, fill, chunk;

	if (seq->strand != 0) { // walk backwards, complementing
		start = seq->full_len - 1;
		while (start >= 0) {
			chunk = start + 1 > bsz ? bsz : start + 1;
			for (fill = 0; fill < chunk; fill++)
				buffer[fill] = "TGCAN"[seq->seq[start - fill]];
			err_fwrite(buffer, 1, chunk, stream);
			start -= bsz;
		}
		return;
	}

	start = 0;
	while (start < seq->full_len) {
		chunk = seq->full_len - start > bsz ? bsz : seq->full_len - start;
		for (fill = 0; fill < chunk; fill++)
			buffer[fill] = "ACGTN"[seq->seq[start + fill]];
		err_fwrite(buffer, 1, chunk, stream);
		start += bsz;
	}
}
|
||||
|
||||
/*
 * Print the part shared by every SAM record from the SEQ column on: the
 * bases, a tab, the qualities (reversed in place first when strand is set,
 * or "*" when absent), then the optional RG, BC and XC tags.
 */
static void print_sam1_seq_qual_tags(bwa_seq_t *p)
{
	bwa_print_seq(stdout, p);
	err_putchar('\t');
	if (p->qual) {
		if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
		err_printf("%s", p->qual);
	} else err_printf("*");
	if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
	if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
	if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
}

/**
 * Print one SAM record for read p.
 *
 * @param bns       reference annotation, used for names and offsets
 * @param p         read to print; when it is unmapped but its mate is
 *                  mapped, p->pos and p->strand are overwritten with the
 *                  mate's values
 * @param mate      the other end of the pair, or NULL for single-end output
 * @param mode      BWA_MODE_COMPREAD selects the "NM" tag name over "CM"
 * @param max_top2  X1 is printed only when p->c1 <= max_top2
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
	const int mate_mapped = (mate && mate->type != BWA_TYPE_NO_MATCH);
	int seqid, nn, ref_span, k, am = 0, flag;

	if (p->type == BWA_TYPE_NO_MATCH && !mate_mapped) { // this read has no match
		flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		print_sam1_seq_qual_tags(p);
		err_putchar('\n');
		return;
	}

	flag = p->extra_flag;
	if (p->type == BWA_TYPE_NO_MATCH) { // unmapped read placed at its mate's coordinate
		p->pos = mate->pos;
		p->strand = mate->strand;
		flag |= SAM_FSU;
		ref_span = 1;
	} else ref_span = pos_end(p) - p->pos; // length of the reference in the alignment

	// get seqid
	nn = bns_cnt_ambi(bns, p->pos, ref_span, &seqid);
	if (p->type != BWA_TYPE_NO_MATCH && p->pos + ref_span - bns->anns[seqid].offset > bns->anns[seqid].len)
		flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

	// update flag and print it
	if (p->strand) flag |= SAM_FSR;
	if (mate_mapped) {
		if (mate->strand) flag |= SAM_FMR;
	} else if (mate) flag |= SAM_FMU;
	err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
	err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

	// print CIGAR
	if (p->cigar) {
		for (k = 0; k != p->n_cigar; ++k)
			err_printf("%d%c", __cigar_len(p->cigar[k]), "MIDS"[__cigar_op(p->cigar[k])]);
	} else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*");
	else err_printf("%dM", p->len);

	// print mate coordinate
	if (mate_mapped) {
		int m_seqid;
		long long isize = 0;
		am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
		// redundant calculation here, but should not matter too much
		bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
		err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
		// an insert size is reported only for two mapped ends on the same sequence
		if (seqid == m_seqid && p->type != BWA_TYPE_NO_MATCH)
			isize = pos_5(mate) - pos_5(p);
		err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
	} else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
	else err_printf("\t*\t0\t0\t");

	// print sequence, quality and the tags common to all records
	print_sam1_seq_qual_tags(p);

	if (p->type != BWA_TYPE_NO_MATCH) {
		// calculate XT tag
		char XT = "NURM"[p->type];
		if (nn > 10) XT = 'N';
		// print tags
		err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
		if (nn) err_printf("\tXN:i:%d", nn);
		if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
		if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
			err_printf("\tX0:i:%d", p->c1);
			if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2);
		}
		err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
		if (p->md) err_printf("\tMD:Z:%s", p->md);
		// print multiple hits
		if (p->n_multi) {
			int i;
			err_printf("\tXA:Z:");
			for (i = 0; i < p->n_multi; ++i) {
				bwt_multi1_t *q = p->multi + i;
				ref_span = pos_end_multi(q, p->len) - q->pos;
				nn = bns_cnt_ambi(bns, q->pos, ref_span, &seqid);
				err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
						(int)(q->pos - bns->anns[seqid].offset + 1));
				if (q->cigar) {
					for (k = 0; k < q->n_cigar; ++k)
						err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
				} else err_printf("%dM", p->len);
				err_printf(",%d;", q->gap + q->mm);
			}
		}
	}
	err_putchar('\n');
}
|
||||
|
||||
void bwase_initialize()
|
||||
{
|
||||
int i;
|
||||
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
||||
}
|
||||
|
||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
|
||||
{
|
||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||
int i, n_seqs, m_aln;
|
||||
long long tot_seqs = 0;
|
||||
bwt_aln1_t *aln = 0;
|
||||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bntseq_t *bns;
|
||||
FILE *fp_sa;
|
||||
gap_opt_t opt;
|
||||
char magic[4];
|
||||
|
||||
// initialization
|
||||
bwase_initialize();
|
||||
bns = bns_restore(prefix);
|
||||
srand48(bns->seed);
|
||||
fp_sa = xopen(fn_sa, "r");
|
||||
|
||||
m_aln = 0;
|
||||
err_fread_noeof(magic, 1, 4, fp_sa);
|
||||
if (strncmp(magic, SAI_MAGIC, 4) != 0) {
|
||||
fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
||||
bwa_print_sam_hdr(bns, rg_line);
|
||||
// set ks
|
||||
ks = bwa_open_reads(opt.mode, fn_fa);
|
||||
// core loop
|
||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
|
||||
tot_seqs += n_seqs;
|
||||
t = clock();
|
||||
|
||||
// read alignment
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
int n_aln;
|
||||
err_fread_noeof(&n_aln, 4, 1, fp_sa);
|
||||
if (n_aln > m_aln) {
|
||||
m_aln = n_aln;
|
||||
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
||||
}
|
||||
err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
||||
bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
|
||||
}
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
|
||||
bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
||||
bwa_refine_gapped(bns, n_seqs, seqs, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
||||
for (i = 0; i < n_seqs; ++i)
|
||||
bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
|
||||
bwa_free_read_seq(n_seqs, seqs);
|
||||
fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs);
|
||||
}
|
||||
|
||||
// destroy
|
||||
bwa_seq_close(ks);
|
||||
bns_destroy(bns);
|
||||
err_fclose(fp_sa);
|
||||
free(aln);
|
||||
}
|
||||
|
||||
int bwa_sai2sam_se(int argc, char *argv[])
|
||||
{
|
||||
int c, n_occ = 3;
|
||||
char *prefix, *rg_line = 0;
|
||||
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'h': break;
|
||||
case 'r':
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
|
||||
break;
|
||||
case 'n': n_occ = atoi(optarg); break;
|
||||
case 'f': xreopen(optarg, "w", stdout); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (optind + 3 > argc) {
|
||||
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
|
||||
return 1;
|
||||
}
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
|
||||
free(prefix);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
#ifndef BWASE_H
#define BWASE_H

#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"

#ifdef __cplusplus
extern "C" {
#endif

	// Initialize the mapping-quality lookup table of the single-end mapper.
	// Call once before any of the functions below is used.
	void bwase_initialize(void);
	// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
	// max_mm is the mismatch bound used when fnr <= 0; otherwise the bound is derived from fnr.
	void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
	// Refine the approximate position of the sequence to an actual placement for the sequence.
	// _pacseq is the packed reference; pass NULL to have it read from bns->fp_pac for the call.
	void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
	// Backfill certain alignment properties mainly centering around number of matches.
	void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
	// Calculate the end position of a read given a certain sequence.
	int64_t pos_end(const bwa_seq_t *p);
	// Translate the suffix-array index sapos of a hit spanning ref_len reference bases
	// into a forward-strand reference position. *strand receives the orientation;
	// (bwtint_t)-1 is returned when the hit is rejected.
	bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand);

#ifdef __cplusplus
}
#endif

#endif // BWASE_H
|
||||
|
|
@ -0,0 +1,235 @@
|
|||
#include <zlib.h>
|
||||
#include <ctype.h>
|
||||
#include "bwtaln.h"
|
||||
#include "utils.h"
|
||||
#include "bamlite.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
|
||||
|
||||
struct __bwa_seqio_t {
|
||||
// for BAM input
|
||||
int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE
|
||||
bamFile fp;
|
||||
// for fastq input
|
||||
kseq_t *ks;
|
||||
};
|
||||
|
||||
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
|
||||
{
|
||||
bwa_seqio_t *bs;
|
||||
bam_header_t *h;
|
||||
bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
|
||||
bs->is_bam = 1;
|
||||
bs->which = which;
|
||||
bs->fp = bam_open(fn, "r");
|
||||
if (0 == bs->fp) err_fatal_simple("Couldn't open bam file");
|
||||
h = bam_header_read(bs->fp);
|
||||
bam_header_destroy(h);
|
||||
return bs;
|
||||
}
|
||||
|
||||
bwa_seqio_t *bwa_seq_open(const char *fn)
|
||||
{
|
||||
gzFile fp;
|
||||
bwa_seqio_t *bs;
|
||||
bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
|
||||
fp = xzopen(fn, "r");
|
||||
bs->ks = kseq_init(fp);
|
||||
return bs;
|
||||
}
|
||||
|
||||
/**
 * Close a read source and free its handle. A NULL handle is ignored.
 * A failing bam_close() is reported through err_fatal_simple().
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (bs == 0) return;
	if (!bs->is_bam) {
		// close the underlying stream first, then the parser built on it
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else if (0 != bam_close(bs->fp)) {
		err_fatal_simple("Error closing bam file");
	}
	free(bs);
}
|
||||
|
||||
/**
 * Reverse a byte sequence in place, optionally complementing it.
 *
 * @param len      number of bytes in seq
 * @param seq      buffer to reverse
 * @param is_comp  when non-zero, every value below 4 is replaced by
 *                 3 - value while being moved; values of 4 and above are
 *                 kept unchanged
 */
void seq_reverse(int len, ubyte_t *seq, int is_comp)
{
	int lo, hi;

	if (!is_comp) { // plain reversal
		for (lo = 0, hi = len - 1; lo < len>>1; ++lo, --hi) {
			ubyte_t tmp = seq[hi];
			seq[hi] = seq[lo];
			seq[lo] = tmp;
		}
		return;
	}

	for (lo = 0, hi = len - 1; lo < len>>1; ++lo, --hi) {
		// Hold the byte as ubyte_t: a signed char would make values >= 128
		// look negative and complement them by mistake.
		ubyte_t tmp = seq[hi];
		if (tmp < 4) tmp = 3 - tmp;
		seq[hi] = (seq[lo] >= 4)? seq[lo] : 3 - seq[lo];
		seq[lo] = tmp;
	}
	// odd length: the middle byte stays in place but is still complemented
	if (len&1) seq[lo] = (seq[lo] >= 4)? seq[lo] : 3 - seq[lo];
}
|
||||
|
||||
/**
 * Quality-trim the end of a read.
 *
 * Scans from the last base towards BWA_MIN_RDLEN, accumulating
 * trim_qual - (qual - 33). The scan stops once the sum turns negative, and
 * the read is cut at the position where the sum peaked. len and clip_len
 * are set to the new length.
 *
 * @return number of bases removed (full_len - len); 0 without touching the
 *         read when trim_qual < 1 or the read has no qualities
 */
int bwa_trim_read(int trim_qual, bwa_seq_t *p)
{
	int score = 0, best = 0, cut, at;

	if (trim_qual < 1 || p->qual == 0) return 0;

	cut = p->len; // no trimming unless a positive peak is found
	at = p->len - 1;
	while (at >= BWA_MIN_RDLEN) {
		score += trim_qual - (p->qual[at] - 33);
		if (score < 0) break;
		if (score > best) {
			best = score;
			cut = at;
		}
		--at;
	}
	p->clip_len = p->len = cut;
	return p->full_len - p->len;
}
|
||||
|
||||
/**
 * Read up to n_needed reads from a BAM source.
 *
 * Records are kept when they match the selection bits in bs->which (bit 1:
 * BAM_FREAD1 set, bit 2: BAM_FREAD2 set, bit 4: neither set). Each kept
 * record is turned into a bwa_seq_t: bases mapped through
 * bam_nt16_nt4_table, qualities shifted by 33 and capped at 126, both
 * restored to original orientation when the record is on the reverse
 * strand, optionally quality-trimmed, with seq stored reversed and rseq
 * holding the reversed (and, if is_comp, complemented) copy.
 *
 * @param n  out: number of reads returned
 * @return the read array, or NULL when no read was obtained; a read error
 *         other than -1 is reported through err_fatal_simple()
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs;
	bam1_t *rec;
	long n_trimmed = 0, n_tot = 0;
	int n_seqs = 0, status, i;

	rec = bam_init1();
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((status = bam_read1(bs->fp, rec)) >= 0) {
		bwa_seq_t *p;
		uint8_t *bases, *quals;
		int wanted = 0;

		if ((bs->which & 1) && (rec->core.flag & BAM_FREAD1)) wanted = 1;
		if ((bs->which & 2) && (rec->core.flag & BAM_FREAD2)) wanted = 1;
		if ((bs->which & 4) && !(rec->core.flag& BAM_FREAD1) && !(rec->core.flag& BAM_FREAD2)) wanted = 1;
		if (!wanted) continue;

		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->full_len = p->clip_len = p->len = rec->core.l_qseq;
		n_tot += p->full_len;
		bases = bam1_seq(rec); quals = bam1_qual(rec);
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(bases, i)];
			p->qual[i] = quals[i] + 33 < 126? quals[i] + 33 : 126;
		}
		if (bam1_strand(rec)) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)bam1_qname(rec));
		if (n_seqs == n_needed) break;
	}
	if (status < 0 && status != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);

	bam_destroy1(rec);
	if (n_seqs == 0) { // nothing read: hand back NULL instead of an empty array
		free(seqs);
		return 0;
	}
	return seqs;
}
|
||||
|
||||
#define BARCODE_LOW_QUAL 13
|
||||
|
||||
/*
 * Read up to n_needed reads from bs and convert them to bwa_seq_t records.
 *
 * mode carries flags (BWA_MODE_COMPREAD, BWA_MODE_IL13, BWA_MODE_CFY) and,
 * in its top 8 bits, the barcode length. BAM input is delegated to
 * bwa_read_bam(). For kseq input:
 *   - with BWA_MODE_CFY, reads whose comment has 'Y' right after the first
 *     ':' are skipped;
 *   - with BWA_MODE_IL13, 31 is subtracted from every quality character;
 *   - a leading barcode of l_bc bases is moved into p->bc (lower case when
 *     its quality is below BARCODE_LOW_QUAL) and removed from the read;
 *   - a trailing "/1" or "/2" is removed from the read name.
 *
 * The number of records produced is stored in *n. Returns the record array,
 * or 0 when no read was produced or the barcode length exceeds
 * BWA_MAX_BCLEN (in which case *n is not written).
 *
 * Characters are converted to unsigned char before being used as a table
 * index or passed to tolower()/toupper(): on targets where char is signed,
 * a byte >= 0x80 would otherwise index before the start of nst_nt4_table.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
    bwa_seq_t *seqs, *p;
    kseq_t *seq = bs->ks;
    int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
    long n_trimmed = 0, n_tot = 0;

    if (l_bc > BWA_MAX_BCLEN) {
        fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
        return 0;
    }
    if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
    n_seqs = 0;
    seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
    while ((l = kseq_read(seq)) >= 0) {
        if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
            // skip reads that are marked to be filtered by Casava
            char *s = strchr(seq->comment.s, ':');
            if (s && *(++s) == 'Y') {
                continue;
            }
        }
        if (is_64 && seq->qual.l)
            for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
        if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
        p = &seqs[n_seqs++];
        if (l_bc) { // then trim barcode
            for (i = 0; i < l_bc; ++i) {
                const unsigned char base = (unsigned char)seq->seq.s[i];
                p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(base) : toupper(base);
            }
            p->bc[i] = 0;
            for (; i < seq->seq.l; ++i)
                seq->seq.s[i - l_bc] = seq->seq.s[i];
            seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
            if (seq->qual.l) {
                for (i = l_bc; i < seq->qual.l; ++i)
                    seq->qual.s[i - l_bc] = seq->qual.s[i];
                seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
            }
            l = seq->seq.l;
        } else p->bc[0] = 0;
        p->tid = -1; // no assigned to a thread
        p->qual = 0;
        p->full_len = p->clip_len = p->len = l;
        n_tot += p->full_len;
        p->seq = (ubyte_t*)calloc(p->full_len, 1);
        for (i = 0; i != p->full_len; ++i)
            p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
        if (seq->qual.l) { // copy quality
            p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
            if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
        }
        p->rseq = (ubyte_t*)calloc(p->full_len, 1);
        memcpy(p->rseq, p->seq, p->len);
        seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
        seq_reverse(p->len, p->rseq, is_comp);
        p->name = strdup((const char*)seq->name.s);
        { // trim /[12]$
            int t = strlen(p->name);
            if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
        }
        if (n_seqs == n_needed) break;
    }
    *n = n_seqs;
    if (n_seqs && trim_qual >= 1)
        fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
    if (n_seqs == 0) {
        free(seqs);
        return 0;
    }
    return seqs;
}
|
||||
|
||||
/*
 * Free an array of n_seqs reads together with every buffer each read
 * owns (name, sequences, quality, alignments, MD, CIGARs, multi hits).
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
    int r;

    for (r = 0; r != n_seqs; ++r) {
        bwa_seq_t *rec = &seqs[r];
        int m;

        /* per-hit CIGARs first, then the hit array itself */
        for (m = 0; m < rec->n_multi; ++m)
            free(rec->multi[m].cigar);
        free(rec->multi);

        free(rec->name);
        free(rec->seq);
        free(rec->rseq);
        free(rec->qual);
        free(rec->aln);
        free(rec->md);
        free(rec->cigar);
    }
    free(seqs);
}
|
||||
|
|
@ -0,0 +1,213 @@
|
|||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include "bwa.h"
|
||||
|
||||
/*
 * Copy the index held by idx into POSIX shared memory and re-point idx at
 * the shared copy.
 *
 * hint   : index path; only the part after the last '/' is used as the name.
 * _tmpfn : optional temporary file. When given, the in-memory image is
 *          written there and freed before the shared segment is created,
 *          then read back into the segment.
 *
 * The control object "/bwactl" (BWA_CTL_SIZE bytes) holds two uint16_t
 * counters (cnt[0] = number of entries, cnt[1] = offset of the next free
 * byte) followed by entries of the form <8-byte l_mem><NUL-terminated
 * name>. The index itself is stored in "/bwaidx-<name>".
 *
 * Returns 0 on success and -1 on failure. Every system call is checked,
 * the control entry is only written once the segment is fully populated,
 * and file descriptors and the control mapping are released before
 * returning.
 *
 * NOTE(review): if a failure happens after the temporary file has replaced
 * idx->mem, the in-memory image is already gone; the caller should treat
 * idx as unusable in that case.
 */
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
{
    const char *name;
    uint8_t *shm, *shm_idx;
    uint16_t *cnt;
    int shmid, to_init = 0, l, n, ret = -1;
    char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn;

    if (hint == 0 || hint[0] == 0) return -1;
    name = strrchr(hint, '/');
    name = name? name + 1 : hint;
    n = snprintf(path, sizeof(path), "/bwaidx-%s", name);
    if (n < 0 || (size_t)n >= sizeof(path)) return -1; // name does not fit
    l = 8 + strlen(name) + 1; // size of the control entry

    if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
        shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
        to_init = 1;
    }
    if (shmid < 0) return -1;
    // Resizing an already-sized object may be refused by some systems, so a
    // failure is only fatal when the object has just been created.
    if (ftruncate(shmid, BWA_CTL_SIZE) < 0 && to_init) {
        close(shmid);
        shm_unlink("/bwactl");
        return -1;
    }
    shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
    close(shmid); // the mapping stays valid after the descriptor is closed
    if (shm == MAP_FAILED) {
        if (to_init) shm_unlink("/bwactl");
        return -1;
    }
    cnt = (uint16_t*)shm;
    if (to_init) {
        memset(shm, 0, BWA_CTL_SIZE);
        cnt[1] = 4; // entries start right after the two counters
    }
    // fail early, before anything has been created or freed
    if (cnt[1] + l > BWA_CTL_SIZE) goto done;

    if (idx->mem == 0) bwa_idx2mem(idx);

    if (tmpfn) {
        FILE *fp;
        if ((fp = fopen(tmpfn, "wb")) != 0) {
            int64_t rest = idx->l_mem;
            while (rest > 0) {
                int64_t chunk = rest < 0x1000000? rest : 0x1000000;
                size_t written = fwrite(&idx->mem[idx->l_mem - rest], 1, chunk, fp);
                if (written == 0) break; // write error: do not spin forever
                rest -= (int64_t)written;
            }
            if (fclose(fp) != 0 || rest > 0) {
                // keep the in-memory image and continue without the temporary file
                fprintf(stderr, "[W::%s] fail to write the temporary file. Option '-f' is ignored.\n", __func__);
                unlink(tmpfn);
                tmpfn = 0;
            } else {
                free(idx->mem); idx->mem = 0;
            }
        } else {
            fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
            tmpfn = 0;
        }
    }

    if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
        perror("shm_open()"); // report before shm_unlink() can overwrite errno
        shm_unlink(path);
        goto done;
    }
    if (ftruncate(shmid, idx->l_mem) < 0) {
        perror("ftruncate()");
        close(shmid);
        shm_unlink(path);
        goto done;
    }
    shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
    close(shmid);
    if (shm_idx == MAP_FAILED) {
        perror("mmap()");
        shm_unlink(path);
        goto done;
    }
    if (tmpfn) {
        FILE *fp = fopen(tmpfn, "rb");
        int64_t rest = idx->l_mem;
        while (fp != 0 && rest > 0) {
            int64_t chunk = rest < 0x1000000? rest : 0x1000000;
            size_t got = fread(&shm_idx[idx->l_mem - rest], 1, chunk, fp);
            if (got == 0) break; // EOF or read error
            rest -= (int64_t)got;
        }
        if (fp != 0) fclose(fp);
        unlink(tmpfn);
        if (rest > 0) {
            fprintf(stderr, "[E::%s] fail to read back the temporary file.\n", __func__);
            munmap(shm_idx, idx->l_mem);
            shm_unlink(path);
            goto done;
        }
    } else {
        memcpy(shm_idx, idx->mem, idx->l_mem);
        free(idx->mem);
    }

    // publish the entry only now that the segment is complete
    memcpy(shm + cnt[1], &idx->l_mem, 8);
    memcpy(shm + cnt[1] + 8, name, l - 8);
    cnt[1] += l; ++cnt[0];

    bwa_mem2idx(idx->l_mem, shm_idx, idx);
    idx->is_shm = 1;
    ret = 0;

done:
    munmap(shm, BWA_CTL_SIZE);
    return ret;
}
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
|
||||
{
|
||||
const char *name;
|
||||
uint8_t *shm, *shm_idx;
|
||||
uint16_t *cnt, i;
|
||||
char *p, path[PATH_MAX + 1];
|
||||
int shmid;
|
||||
int64_t l_mem;
|
||||
bwaidx_t *idx;
|
||||
|
||||
if (hint == 0 || hint[0] == 0) return 0;
|
||||
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
|
||||
++name;
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
if (cnt[0] == 0) return 0;
|
||||
for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
|
||||
memcpy(&l_mem, p, 8); p += 8;
|
||||
if (strcmp(p, name) == 0) break;
|
||||
p += strlen(p) + 1;
|
||||
}
|
||||
if (i == cnt[0]) return 0;
|
||||
|
||||
strcat(strcpy(path, "/bwaidx-"), name);
|
||||
if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0;
|
||||
shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
bwa_mem2idx(l_mem, shm_idx, idx);
|
||||
idx->is_shm = 1;
|
||||
return idx;
|
||||
}
|
||||
|
||||
int bwa_shm_test(const char *hint)
|
||||
{
|
||||
int shmid;
|
||||
uint16_t *cnt, i;
|
||||
char *p, *shm;
|
||||
const char *name;
|
||||
|
||||
if (hint == 0 || hint[0] == 0) return 0;
|
||||
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
|
||||
++name;
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||
if (strcmp(p + 8, name) == 0) return 1;
|
||||
p += strlen(p) + 9;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_shm_list(void)
|
||||
{
|
||||
int shmid;
|
||||
uint16_t *cnt, i;
|
||||
char *p, *shm;
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||
int64_t l_mem;
|
||||
memcpy(&l_mem, p, 8); p += 8;
|
||||
printf("%s\t%ld\n", p, (long)l_mem);
|
||||
p += strlen(p) + 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_shm_destroy(void)
|
||||
{
|
||||
int shmid;
|
||||
uint16_t *cnt, i;
|
||||
char *p, *shm;
|
||||
char path[PATH_MAX + 1];
|
||||
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||
int64_t l_mem;
|
||||
memcpy(&l_mem, p, 8); p += 8;
|
||||
strcat(strcpy(path, "/bwaidx-"), p);
|
||||
shm_unlink(path);
|
||||
p += strlen(p) + 1;
|
||||
}
|
||||
munmap(shm, BWA_CTL_SIZE);
|
||||
shm_unlink("/bwactl");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_shm(int argc, char *argv[])
|
||||
{
|
||||
int c, to_list = 0, to_drop = 0, ret = 0;
|
||||
char *tmpfn = 0;
|
||||
while ((c = getopt(argc, argv, "ldf:")) >= 0) {
|
||||
if (c == 'l') to_list = 1;
|
||||
else if (c == 'd') to_drop = 1;
|
||||
else if (c == 'f') tmpfn = optarg;
|
||||
}
|
||||
if (optind == argc && !to_list && !to_drop) {
|
||||
fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
|
||||
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||||
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
|
||||
return 1;
|
||||
}
|
||||
if (optind < argc && (to_list || to_drop)) {
|
||||
fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (optind < argc) {
|
||||
if (bwa_shm_test(argv[optind]) == 0) {
|
||||
bwaidx_t *idx;
|
||||
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
|
||||
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||
ret = 1;
|
||||
}
|
||||
bwa_idx_destroy(idx);
|
||||
} else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
|
||||
}
|
||||
if (to_list) bwa_shm_list();
|
||||
if (to_drop) bwa_shm_destroy();
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -0,0 +1,477 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008 Genome Research Ltd (GRL).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <limits.h>
|
||||
#include "utils.h"
|
||||
#include "bwt.h"
|
||||
#include "kvec.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
// 计算一个字节构成的A,T,C,G序列,对应的每个碱基的个数,因为最多有4个相同的碱基,所以每次左移3位就行
|
||||
void bwt_gen_cnt_table(bwt_t *bwt)
|
||||
{
|
||||
int i, j;
|
||||
for (i = 0; i != 256; ++i) {
|
||||
uint32_t x = 0;
|
||||
for (j = 0; j != 4; ++j)
|
||||
x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
|
||||
bwt->cnt_table[i] = x;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Inverse compressed-suffix-array step: map suffix-array index k to
 * L2[c] + Occ(c, k), where c is the BWT symbol at k. Returns 0 when k is
 * the primary index.
 */
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
{
    /* '$' is not stored in bwt->bwt, so indices past primary shift down by one */
    const bwtint_t pos = k - (k > bwt->primary);
    /* same lookup as bwt_B0(bwt, pos), spelled out: fetch the 32-bit word, then the 2-bit symbol */
    const uint32_t word = bwt->bwt[(pos >> 7 << 4) + sizeof(bwtint_t) + ((pos & 0x7f) >> 4)];
    const uint32_t sym = word >> ((~pos & 0xf) << 1) & 3;
    const bwtint_t next = bwt->L2[sym] + bwt_occ(bwt, k, sym);

    return k == bwt->primary? 0 : next;
}
|
||||
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||
{
|
||||
bwtint_t isa, sa, i; // S(isa) = sa isa是后缀数组的索引,sa表示位置
|
||||
int intv_round = intv; // 间隔多少来保存一个位置信息
|
||||
|
||||
kv_roundup32(intv_round);
|
||||
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
||||
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
|
||||
|
||||
if (bwt->sa) free(bwt->sa);
|
||||
bwt->sa_intv = intv;
|
||||
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
||||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); // 用64位无符号整数来保存位置信息,其实用33位就够了
|
||||
// calculate SA value
|
||||
isa = 0; sa = bwt->seq_len;
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa; // 第一个位置是$,所以位置就是序列长度
|
||||
--sa; // 从后往前,一个位置一个位置的找对应的后缀数组,isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
|
||||
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
|
||||
}
|
||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
||||
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
||||
}
|
||||
|
||||
/*
 * Return the suffix-array value at index k.
 *
 * Only every sa_intv-th index is stored, so inverse-Psi steps are taken
 * until a sampled index is reached; the number of steps is added to the
 * stored value.
 */
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
{
    const bwtint_t mask = bwt->sa_intv - 1;
    bwtint_t steps;

    for (steps = 0; (k & mask) != 0; ++steps)
        k = bwt_invPsi(bwt, k);
    /* without setting bwt->sa[0] = -1, the following line should be
       changed to (steps + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
    return steps + bwt->sa[k/bwt->sa_intv];
}
|
||||
|
||||
/*
 * Count how many of the 32 two-bit cells of y are equal to c (only the
 * two low bits of c are used).
 */
static inline int __occ_aux(uint64_t y, int c)
{
    /* pick each bit plane as-is or inverted so that a matching cell becomes 11 */
    const uint64_t high_plane = (c & 2)? y : ~y;
    const uint64_t low_plane = (c & 1)? y : ~y;
    /* one marker bit per matching cell */
    uint64_t hits = (high_plane >> 1) & low_plane & 0x5555555555555555ull;

    /* population count of the marker bits */
    hits = (hits & 0x3333333333333333ull) + ((hits >> 2) & 0x3333333333333333ull);
    hits = (hits + (hits >> 4)) & 0x0f0f0f0f0f0f0f0full;
    return (hits * 0x0101010101010101ull) >> 56;
}
|
||||
|
||||
/*
 * Occ(c, k): number of occurrences of symbol c in BWT positions [0, k].
 *
 * k == seq_len yields the total count of c; k == (bwtint_t)-1 yields 0.
 */
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
{
    bwtint_t count;
    uint32_t *cell, *stop;

    if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
    if (k == (bwtint_t)(-1)) return 0;
    k -= (k >= bwt->primary); // because $ is not in bwt

    // start from the checkpoint stored at the head of the k/OCC_INTERVAL block
    cell = bwt_occ_intv(bwt, k);
    count = ((bwtint_t*)cell)[c];
    cell += sizeof(bwtint_t); // jump to the start of the first BWT cell

    // add the full 32-symbol groups that precede the group containing k
    stop = cell + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
    while (cell < stop) {
        count += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
        cell += 2;
    }

    // last group: clear the symbols after k before counting
    count += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
    if (c == 0) count -= ~k&31; // corrected for the masked bits

    return count;
}
|
||||
|
||||
// an analogy to bwt_occ() but more efficient, requiring k <= l
|
||||
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
|
||||
{
|
||||
bwtint_t _k, _l;
|
||||
_k = (k >= bwt->primary)? k-1 : k;
|
||||
_l = (l >= bwt->primary)? l-1 : l;
|
||||
if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
|
||||
*ok = bwt_occ(bwt, k, c);
|
||||
*ol = bwt_occ(bwt, l, c);
|
||||
} else {
|
||||
bwtint_t m, n, i, j;
|
||||
uint32_t *p;
|
||||
if (k >= bwt->primary) --k;
|
||||
if (l >= bwt->primary) --l;
|
||||
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
|
||||
p += sizeof(bwtint_t);
|
||||
// calculate *ok
|
||||
j = k >> 5 << 5;
|
||||
for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
|
||||
n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
||||
m = n;
|
||||
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
|
||||
if (c == 0) n -= ~k&31; // corrected for the masked bits
|
||||
*ok = n;
|
||||
// calculate *ol
|
||||
j = l >> 5 << 5;
|
||||
for (; i < j; i += 32, p += 2)
|
||||
m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
||||
m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
|
||||
if (c == 0) m -= ~l&31; // corrected for the masked bits
|
||||
*ol = m;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Sum the cnt_table entries of the four bytes of the 32-bit word b.
 * Both arguments are expanded more than once, so they must be free of
 * side effects.
 */
#define __occ_aux4(bwt, b) \
    ((bwt)->cnt_table[(b)&0xff] \
     + (bwt)->cnt_table[(b)>>8&0xff] \
     + (bwt)->cnt_table[(b)>>16&0xff] \
     + (bwt)->cnt_table[(b)>>24])
|
||||
|
||||
/*
 * Compute Occ(c, k) for all four symbols at once; cnt[c] receives the
 * count of symbol c. k == (bwtint_t)-1 yields all zeros.
 */
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
    bwtint_t lanes = 0; // four 8-bit per-symbol counters
    uint32_t *word, *stop, last;

    if (k == (bwtint_t)(-1)) {
        memset(cnt, 0, 4 * sizeof(bwtint_t));
        return;
    }
    k -= (k >= bwt->primary); // because $ is not in bwt

    // checkpoint counts at the head of the block
    word = bwt_occ_intv(bwt, k);
    memcpy(cnt, word, 4 * sizeof(bwtint_t));
    word += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))

    // full 16-symbol words before the word containing k
    stop = word + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
    while (word < stop) {
        lanes += __occ_aux4(bwt, *word);
        ++word;
    }
    // last word: symbols after k are cleared, then removed from the count of symbol 0
    last = *word & ~((1U<<((~k&15)<<1)) - 1);
    lanes += __occ_aux4(bwt, last) - (~k&15);

    cnt[0] += lanes&0xff;
    cnt[1] += lanes>>8&0xff;
    cnt[2] += lanes>>16&0xff;
    cnt[3] += lanes>>24;
}
|
||||
|
||||
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
||||
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
|
||||
{
|
||||
bwtint_t _k, _l;
|
||||
_k = k - (k >= bwt->primary);
|
||||
_l = l - (l >= bwt->primary);
|
||||
if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
|
||||
bwt_occ4(bwt, k, cntk);
|
||||
bwt_occ4(bwt, l, cntl);
|
||||
} else {
|
||||
bwtint_t x, y;
|
||||
uint32_t *p, tmp, *endk, *endl;
|
||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||
l -= (l >= bwt->primary);
|
||||
p = bwt_occ_intv(bwt, k);
|
||||
memcpy(cntk, p, 4 * sizeof(bwtint_t));
|
||||
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
||||
// prepare cntk[]
|
||||
endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
|
||||
endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
|
||||
for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
|
||||
y = x;
|
||||
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
|
||||
x += __occ_aux4(bwt, tmp) - (~k&15);
|
||||
// calculate cntl[] and finalize cntk[]
|
||||
for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
|
||||
tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
|
||||
y += __occ_aux4(bwt, tmp) - (~l&15);
|
||||
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
|
||||
cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
|
||||
cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Backward search for an exact match of str[0..len) over the whole index.
 *
 * Returns the size of the matching suffix-array interval, or 0 when there
 * is no match or str contains a code above 3. On a match the interval
 * bounds are stored in *sa_begin and *sa_end when those pointers are
 * non-null.
 */
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
{
    bwtint_t lo = 0, hi = bwt->seq_len, occ_lo, occ_hi;
    int pos = len;

    while (--pos >= 0) {
        const ubyte_t c = str[pos];
        if (c > 3) return 0; // no match
        bwt_2occ(bwt, lo - 1, hi, c, &occ_lo, &occ_hi);
        lo = bwt->L2[c] + occ_lo + 1;
        hi = bwt->L2[c] + occ_hi;
        if (lo > hi) return 0; // no match
    }
    if (sa_begin) *sa_begin = lo;
    if (sa_end) *sa_end = hi;
    return hi - lo + 1;
}
|
||||
|
||||
/*
 * Backward search for str[0..len) starting from the interval [*k0, *l0].
 *
 * Returns the size of the refined interval and writes it back to *k0/*l0.
 * Returns 0 and leaves *k0/*l0 untouched when there is no match or str
 * contains a code above 3.
 */
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
{
    bwtint_t lo = *k0, hi = *l0, occ_lo, occ_hi;
    int pos = len;

    while (--pos >= 0) {
        const ubyte_t c = str[pos];
        if (c > 3) return 0; // there is an N here. no match
        bwt_2occ(bwt, lo - 1, hi, c, &occ_lo, &occ_hi);
        lo = bwt->L2[c] + occ_lo + 1;
        hi = bwt->L2[c] + occ_hi;
        if (lo > hi) return 0; // no match
    }
    *k0 = lo;
    *l0 = hi;
    return hi - lo + 1;
}
|
||||
|
||||
/*********************
|
||||
* Bidirectional BWT *
|
||||
*********************/
|
||||
|
||||
/*
 * Extend the bi-interval *ik by one symbol and store the four possible
 * results (one per symbol) in ok[0..3].
 *
 * x[!is_back] of each result comes from the occurrence counts, x[2] is the
 * new interval size, and x[is_back] is derived cumulatively from symbol 3
 * down to symbol 0.
 */
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
{
    bwtint_t occ_lo[4], occ_hi[4];
    int c;

    bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], occ_lo, occ_hi);
    for (c = 0; c < 4; ++c) {
        ok[c].x[!is_back] = bwt->L2[c] + 1 + occ_lo[c];
        ok[c].x[2] = occ_hi[c] - occ_lo[c];
    }
    /* one extra slot is skipped when the input interval contains the primary index */
    ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
    for (c = 2; c >= 0; --c)
        ok[c].x[is_back] = ok[c+1].x[is_back] + ok[c+1].x[2];
}
|
||||
|
||||
/* Reverse the order of the intervals stored in p, in place. */
static void bwt_reverse_intvs(bwtintv_v *p)
{
    size_t lo, hi;

    if (p->n < 2) return;
    for (lo = 0, hi = p->n - 1; lo < hi; ++lo, --hi) {
        bwtintv_t held = p->a[hi];
        p->a[hi] = p->a[lo];
        p->a[lo] = held;
    }
}
|
||||
// NOTE: $max_intv is not currently used in BWA-MEM
|
||||
// Find the SMEMs (seeds) that cover position x of the query.
|
||||
/*
 * bwt      : the index
 * len, q   : query length and query; codes above 3 are treated as ambiguous
 * x        : query position the matches must cover
 * min_intv : minimum interval size (values below 1 are raised to 1)
 * max_intv : extension is stopped once the interval size drops below this
 * mem      : output vector; for each match, info = (start<<32 | end)
 * tmpvec   : optional scratch vectors; local ones are used (and freed)
 *            for every slot that is not provided
 *
 * Returns x + 1 when q[x] is ambiguous; otherwise the info value of the
 * last interval collected by the forward pass (set to i + 1 at each
 * forward step).
 */
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
int i, j, c, ret;
|
||||
bwtintv_t ik, ok[4];
|
||||
bwtintv_v a[2], *prev, *curr, *swap;
|
||||
|
||||
mem->n = 0;
|
||||
if (q[x] > 3) return x + 1;
|
||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
||||
kv_init(a[0]); kv_init(a[1]);
|
||||
prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
|
||||
curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
|
||||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||
ik.info = x + 1;
|
||||
|
||||
// Forward pass: extend to the right of x, recording the interval each time its size changes.
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
|
||||
if (ik.x[2] < max_intv) { // an interval small enough
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
break;
|
||||
} else if (q[i] < 4) { // an A/C/G/T base
|
||||
c = 3 - q[i]; // complement of q[i]
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
||||
}
|
||||
ik = ok[c]; ik.info = i + 1;
|
||||
} else { // an ambiguous base
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
|
||||
}
|
||||
}
|
||||
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
||||
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||
ret = curr->a[0].info; // this will be the returned value
|
||||
swap = curr; curr = prev; prev = swap;
|
||||
|
||||
// Backward pass: extend every recorded interval to the left, one query position per iteration.
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
|
||||
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
||||
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
||||
bwtintv_t *p = &prev->a[j];
|
||||
// ok[] is only refreshed when this condition holds; the test below short-circuits before reading a stale ok[c]
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
|
||||
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
||||
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
||||
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
||||
kv_push(bwtintv_t, *mem, ik);
|
||||
}
|
||||
} // otherwise the match is contained in another longer match
|
||||
} else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
|
||||
ok[c].info = p->info;
|
||||
kv_push(bwtintv_t, *curr, ok[c]);
|
||||
}
|
||||
}
|
||||
if (curr->n == 0) break;
|
||||
swap = curr; curr = prev; prev = swap;
|
||||
}
|
||||
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||
|
||||
// free only the scratch vectors that were allocated locally
if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
|
||||
if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Convenience wrapper around bwt_smem1a() that passes 0 as max_intv.
 * Arguments and return value are those of bwt_smem1a().
 */
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
    const uint64_t no_max_intv = 0;

    return bwt_smem1a(bwt, len, q, x, min_intv, no_max_intv, mem, tmpvec);
}
|
||||
|
||||
/*
 * Forward-extend from query position x until the interval size drops
 * below max_intv after at least min_len extension steps.
 *
 * On success *mem receives the interval with info = (x<<32 | end) and the
 * end position is returned. Otherwise *mem is left zeroed and the return
 * value is the position after the ambiguous base that stopped the search,
 * or len when the end of the query is reached.
 */
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
    bwtintv_t ik, ok[4];
    int pos, comp;

    memset(mem, 0, sizeof(bwtintv_t));
    if (q[x] > 3) return x + 1;
    bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
    for (pos = x + 1; pos < len; ++pos) { // forward search
        if (q[pos] >= 4) return pos + 1; // stop at an ambiguous base
        comp = 3 - q[pos]; // complement of q[pos]
        bwt_extend(bwt, &ik, ok, 0);
        if (ok[comp].x[2] < max_intv && pos - x >= min_len) {
            *mem = ok[comp];
            mem->info = (uint64_t)x<<32 | (pos + 1);
            return pos + 1;
        }
        ik = ok[comp];
    }
    return len;
}
|
||||
|
||||
/*************************
|
||||
* Read/write BWT and SA *
|
||||
*************************/
|
||||
|
||||
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
/*
 * Read size bytes from fp into a, at most 16 MB per call.
 * Mac/Darwin has a bug when reading data longer than 2GB; reading in small
 * chunks works around it.
 *
 * Returns the number of bytes read. The destination is addressed through
 * a char pointer because arithmetic on void * is a compiler extension,
 * not standard C.
 */
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{
    const int bufsize = 0x1000000; // 16M block
    char *dst = (char*)a;
    bwtint_t offset = 0;

    while (size) {
        int x = bufsize < size? bufsize : size;
        if ((x = err_fread_noeof(dst + offset, 1, x, fp)) == 0) break;
        size -= x; offset += x;
    }
    return offset;
}
|
||||
|
||||
/*
 * Load the sampled suffix array written by bwt_dump_sa() from file fn into
 * bwt. The primary index and the sequence length stored in the file must
 * match those already held by bwt.
 *
 * The sample interval occupies sizeof(bwtint_t) bytes in the file but is
 * an int in bwt_t, so it is read into a local and then narrowed; reading
 * straight into &bwt->sa_intv would write past the int. A non-positive
 * interval is rejected before it is used as a divisor.
 */
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
    bwtint_t skipped[4];
    FILE *fp;
    bwtint_t primary, sa_intv;

    fp = xopen(fn, "rb");
    err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
    xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
    err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
    err_fread_noeof(&sa_intv, sizeof(bwtint_t), 1, fp);
    bwt->sa_intv = (int)(uint32_t)sa_intv; // only the low 32 bits carry the interval
    xassert(bwt->sa_intv > 0, "SA file is corrupted: invalid sample interval.");
    err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
    xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");

    bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
    bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
    bwt->sa[0] = (bwtint_t)-1; // sa[0] is not stored in the file

    fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
    err_fclose(fp);
}
|
||||
|
||||
bwt_t *bwt_restore_bwt(const char *fn)
|
||||
{
|
||||
bwt_t *bwt;
|
||||
FILE *fp;
|
||||
|
||||
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, 0, SEEK_END);
|
||||
bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
|
||||
bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
|
||||
err_fseek(fp, 0, SEEK_SET);
|
||||
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
fread_fix(fp, bwt->bwt_size<<2, bwt->bwt);
|
||||
bwt->seq_len = bwt->L2[4];
|
||||
err_fclose(fp);
|
||||
bwt_gen_cnt_table(bwt);
|
||||
|
||||
return bwt;
|
||||
}
|
||||
|
||||
/* Free bwt together with its BWT and suffix-array buffers; null is ignored. */
void bwt_destroy(bwt_t *bwt)
{
    if (bwt != 0) {
        free(bwt->bwt);
        free(bwt->sa);
        free(bwt);
    }
}
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Contact: Heng Li <hli@jimmy.harvard.edu> */
|
||||
|
||||
#ifndef BWA_BWT_H
|
||||
#define BWA_BWT_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some parts of the code assume OCC_INTERVAL==0x80
|
||||
#define OCC_INTV_SHIFT 7
|
||||
#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
|
||||
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
|
||||
|
||||
#ifndef BWA_UBYTE
|
||||
#define BWA_UBYTE
|
||||
typedef unsigned char ubyte_t;
|
||||
#endif
|
||||
|
||||
typedef uint64_t bwtint_t;
|
||||
|
||||
typedef struct {
|
||||
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
|
||||
bwtint_t L2[5]; // C(), cumulative count
|
||||
bwtint_t seq_len; // sequence length
|
||||
bwtint_t bwt_size; // size of bwt, about seq_len/4
|
||||
uint32_t *bwt; // BWT
|
||||
// occurance array, separated to two parts
|
||||
uint32_t cnt_table[256];
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
bwtint_t *sa;
|
||||
} bwt_t;
|
||||
|
||||
typedef struct {
|
||||
bwtint_t x[3], info;
|
||||
} bwtintv_t;
|
||||
|
||||
/* Growable array of bwtintv_t.
 * NOTE(review): n/m look like the usual count/capacity pair -- confirm against users. */
typedef struct {
	size_t n, m;
	bwtintv_t *a;
} bwtintv_v;
|
||||
|
||||
/* For general OCC_INTERVAL, the following is correct:
|
||||
#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
|
||||
#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4))
|
||||
*/
|
||||
|
||||
// The following two lines are ONLY correct when OCC_INTERVAL==0x80
|
||||
#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)])
|
||||
#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4))
|
||||
|
||||
/* retrieve a character from the $-removed BWT string. Note that
|
||||
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
|
||||
* called bwt_B0 instead of bwt_B */
|
||||
#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
|
||||
void bwt_dump_sa(const char *fn, const bwt_t *bwt);
|
||||
|
||||
bwt_t *bwt_restore_bwt(const char *fn);
|
||||
void bwt_restore_sa(const char *fn, bwt_t *bwt);
|
||||
|
||||
void bwt_destroy(bwt_t *bwt);
|
||||
|
||||
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
|
||||
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv);
|
||||
|
||||
void bwt_bwtupdate_core(bwt_t *bwt);
|
||||
|
||||
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
|
||||
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
|
||||
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
|
||||
|
||||
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
|
||||
void bwt_gen_cnt_table(bwt_t *bwt);
|
||||
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
|
||||
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
|
||||
|
||||
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
|
||||
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
|
||||
|
||||
/**
|
||||
* Extend bi-SA-interval _ik_
|
||||
*/
|
||||
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
|
||||
|
||||
/**
|
||||
* Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
|
||||
* Return the end of the longest exact match starting from _x_.
|
||||
*/
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,98 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "bwt_lite.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
int is_sa(const uint8_t *T, int *SA, int n);
|
||||
int is_bwt(uint8_t *T, int n);
|
||||
|
||||
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
|
||||
{
|
||||
bwtl_t *b;
|
||||
int i;
|
||||
b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
|
||||
b->seq_len = len;
|
||||
|
||||
{ // calculate b->bwt
|
||||
uint8_t *s;
|
||||
b->sa = (uint32_t*)calloc(len + 1, 4);
|
||||
is_sa(seq, (int*)b->sa, len);
|
||||
s = (uint8_t*)calloc(len + 1, 1);
|
||||
for (i = 0; i <= len; ++i) {
|
||||
if (b->sa[i] == 0) b->primary = i;
|
||||
else s[i] = seq[b->sa[i] - 1];
|
||||
}
|
||||
for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
|
||||
b->bwt_size = (len + 15) / 16;
|
||||
b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
|
||||
for (i = 0; i < len; ++i)
|
||||
b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1);
|
||||
free(s);
|
||||
}
|
||||
{ // calculate b->occ
|
||||
uint32_t c[4];
|
||||
b->n_occ = (len + 15) / 16 * 4;
|
||||
b->occ = (uint32_t*)calloc(b->n_occ, 4);
|
||||
memset(c, 0, 16);
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (i % 16 == 0)
|
||||
memcpy(b->occ + (i/16) * 4, c, 16);
|
||||
++c[bwtl_B0(b, i)];
|
||||
}
|
||||
memcpy(b->L2+1, c, 16);
|
||||
for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
|
||||
}
|
||||
{ // generate cnt_table
|
||||
for (i = 0; i != 256; ++i) {
|
||||
uint32_t j, x = 0;
|
||||
for (j = 0; j != 4; ++j)
|
||||
x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
|
||||
b->cnt_table[i] = x;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
|
||||
{
|
||||
uint32_t n, b;
|
||||
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
|
||||
if (k == (uint32_t)(-1)) return 0;
|
||||
if (k >= bwt->primary) --k; // because $ is not in bwt
|
||||
n = bwt->occ[k/16<<2|c];
|
||||
b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1);
|
||||
n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
|
||||
+ bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff;
|
||||
if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
|
||||
return n;
|
||||
}
|
||||
void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
|
||||
{
|
||||
uint32_t x, b;
|
||||
if (k == (uint32_t)(-1)) {
|
||||
memset(cnt, 0, 16);
|
||||
return;
|
||||
}
|
||||
if (k >= bwt->primary) --k; // because $ is not in bwt
|
||||
memcpy(cnt, bwt->occ + (k>>4<<2), 16);
|
||||
b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1);
|
||||
x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
|
||||
+ bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24];
|
||||
x -= 15 - (k&15);
|
||||
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
|
||||
}
|
||||
/* Convenience wrapper: run bwtl_occ4() for row k into cntk, then for row l into cntl. */
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
{
	bwtl_occ4(bwt, k, cntk);
	bwtl_occ4(bwt, l, cntl);
}
|
||||
/* Free a bwtl_t and the occ/bwt/sa arrays it owns; a null pointer is ignored. */
void bwtl_destroy(bwtl_t *bwt)
{
	if (!bwt) return;
	free(bwt->occ);
	free(bwt->bwt);
	free(bwt->sa);
	free(bwt);
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
#ifndef BWT_LITE_H_
|
||||
#define BWT_LITE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Light-weight BWT index for a short sequence; built by bwtl_seq2bwtl(). */
typedef struct {
	uint32_t seq_len;        // number of symbols in the indexed sequence
	uint32_t bwt_size;       // number of 32-bit words in bwt[]
	uint32_t n_occ;          // number of entries in occ[]
	uint32_t primary;        // suffix-array row whose suffix starts at position 0
	uint32_t *bwt;           // BWT without the sentinel, 2 bits per symbol
	uint32_t *occ;           // symbol counts sampled every 16 symbols, 4 counters per sample
	uint32_t *sa;            // suffix array
	uint32_t L2[5];          // cumulative symbol counts
	uint32_t cnt_table[256]; // per-byte symbol counts, one count per output byte
} bwtl_t;
|
||||
|
||||
#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
|
||||
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
|
||||
void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
|
||||
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
|
||||
void bwtl_destroy(bwtl_t *bwt);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,321 @@
|
|||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <stdint.h>
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#include "bwtaln.h"
|
||||
#include "bwtgap.h"
|
||||
#include "utils.h"
|
||||
#include "bwa.h"
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
gap_opt_t *gap_init_opt()
|
||||
{
|
||||
gap_opt_t *o;
|
||||
o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
|
||||
/* IMPORTANT: s_mm*10 should be about the average base error
|
||||
rate. Voilating this requirement will break pairing! */
|
||||
o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
|
||||
o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
|
||||
o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
|
||||
o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
|
||||
o->seed_len = 32; o->max_seed_diff = 2;
|
||||
o->fnr = 0.04;
|
||||
o->n_threads = 1;
|
||||
o->max_top2 = 30;
|
||||
o->trim_qual = 0;
|
||||
return o;
|
||||
}
|
||||
|
||||
int bwa_cal_maxdiff(int l, double err, double thres)
|
||||
{
|
||||
double elambda = exp(-l * err);
|
||||
double sum, y = 1.0;
|
||||
int k, x = 1;
|
||||
for (k = 1, sum = elambda; k < 1000; ++k) {
|
||||
y *= l * err;
|
||||
x *= k;
|
||||
sum += elambda * y / x;
|
||||
if (1.0 - sum < thres) return k;
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
|
||||
// width must be filled as zero
|
||||
/**
 * Fill width[0..len] with lower-bound information for str.
 *
 * Walks str from index 0, extending an exact-match SA interval one base at a
 * time. Whenever the interval becomes empty or an ambiguous base (> 3) is
 * met, the interval is reset to the whole index and the restart counter is
 * bumped. For each position the interval size and the restart count so far
 * are stored in width[i].w and width[i].bid.
 *
 * @return the final restart count plus one, also stored in width[len].bid
 */
int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width)
{
	bwtint_t lo = 0, hi = bwt->seq_len, occ_lo, occ_hi;
	int pos, n_restarts = 0;

	for (pos = 0; pos < len; ++pos) {
		const ubyte_t base = str[pos];
		int restart = 1; // ambiguous bases always force a restart
		if (base < 4) {
			bwt_2occ(bwt, lo - 1, hi, base, &occ_lo, &occ_hi);
			lo = bwt->L2[base] + occ_lo + 1;
			hi = bwt->L2[base] + occ_hi;
			restart = lo > hi; // empty interval
		}
		if (restart) {
			lo = 0;
			hi = bwt->seq_len;
			++n_restarts;
		}
		width[pos].w = hi - lo + 1;
		width[pos].bid = n_restarts;
	}
	width[len].w = 0;
	width[len].bid = ++n_restarts;
	return n_restarts;
}
|
||||
|
||||
/**
 * Compute gapped alignments in SA coordinates for a batch of reads.
 *
 * @param tid     index of the calling worker; with HAVE_PTHREAD and
 *                opt->n_threads > 1 only reads with i % n_threads == tid
 *                are processed
 * @param bwt     BWT index
 * @param n_seqs  number of reads in seqs
 * @param seqs    reads; for each processed read, aln/n_aln are filled in and
 *                name/seq/rseq/qual are freed and set to null
 * @param opt     alignment options (copied locally; never modified)
 *
 * For each read: compute width bounds for the whole read and, when the read
 * is longer than opt->seed_len, for its last seed_len bases; complement the
 * read in place; then call bwt_match_gap().
 */
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
	int i, j, max_l = 0, max_len;
	gap_stack_t *stack;
	bwt_width_t *w, *seed_w;
	gap_opt_t local_opt = *opt;

	// initiate priority stack, sized for the longest read in the batch
	for (i = max_len = 0; i != n_seqs; ++i)
		if (seqs[i].len > max_len) max_len = seqs[i].len;
	if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
	if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
	stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);

	seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
	w = 0;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
#ifdef HAVE_PTHREAD
		// Only split the batch when several workers exist. This avoids a
		// division by zero for n_threads == 0 and silently skipped reads
		// for negative values; for n_threads == 1 the result is unchanged.
		if (opt->n_threads > 1 && i % opt->n_threads != tid) continue;
#endif
		p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
		if (max_l < p->len) { // grow the width buffer to fit this read
			max_l = p->len;
			w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t));
			memset(w, 0, (max_l + 1) * sizeof(bwt_width_t));
		}
		bwt_cal_width(bwt, p->len, p->seq, w);
		if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
		// reads not longer than the seed are aligned without seeding
		local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
		if (p->len > opt->seed_len)
			bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w);
		// core function
		for (j = 0; j < p->len; ++j) // we need to complement
			p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j];
		p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
		// clean up the unused data in the record
		free(p->name); free(p->seq); free(p->rseq); free(p->qual);
		p->name = 0; p->seq = p->rseq = p->qual = 0;
	}
	free(seed_w); free(w);
	gap_destroy_stack(stack);
}
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
typedef struct {
|
||||
int tid;
|
||||
bwt_t *bwt;
|
||||
int n_seqs;
|
||||
bwa_seq_t *seqs;
|
||||
const gap_opt_t *opt;
|
||||
} thread_aux_t;
|
||||
|
||||
static void *worker(void *data)
|
||||
{
|
||||
thread_aux_t *d = (thread_aux_t*)data;
|
||||
bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
|
||||
{
|
||||
bwa_seqio_t *ks;
|
||||
if (mode & BWA_MODE_BAM) { // open BAM
|
||||
int which = 0;
|
||||
if (mode & BWA_MODE_BAM_SE) which |= 4;
|
||||
if (mode & BWA_MODE_BAM_READ1) which |= 1;
|
||||
if (mode & BWA_MODE_BAM_READ2) which |= 2;
|
||||
if (which == 0) which = 7; // then read all reads
|
||||
ks = bwa_bam_open(fn_fa, which);
|
||||
} else ks = bwa_seq_open(fn_fa);
|
||||
return ks;
|
||||
}
|
||||
|
||||
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
||||
{
|
||||
int i, n_seqs;
|
||||
long long tot_seqs = 0;
|
||||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bwt_t *bwt;
|
||||
|
||||
// initialization
|
||||
ks = bwa_open_reads(opt->mode, fn_fa);
|
||||
|
||||
{ // load BWT
|
||||
char *str = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
free(str);
|
||||
}
|
||||
|
||||
// core loop
|
||||
err_fwrite(SAI_MAGIC, 1, 4, stdout);
|
||||
err_fwrite(opt, sizeof(gap_opt_t), 1, stdout);
|
||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
|
||||
tot_seqs += n_seqs;
|
||||
t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
if (opt->n_threads <= 1) { // no multi-threading at all
|
||||
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
||||
} else {
|
||||
pthread_t *tid;
|
||||
pthread_attr_t attr;
|
||||
thread_aux_t *data;
|
||||
int j;
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
||||
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
|
||||
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
||||
for (j = 0; j < opt->n_threads; ++j) {
|
||||
data[j].tid = j; data[j].bwt = bwt;
|
||||
data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
|
||||
pthread_create(&tid[j], &attr, worker, data + j);
|
||||
}
|
||||
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
|
||||
free(data); free(tid);
|
||||
}
|
||||
#else
|
||||
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_aln_core] write to the disk... ");
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
err_fwrite(&p->n_aln, 4, 1, stdout);
|
||||
if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
|
||||
}
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
|
||||
bwa_free_read_seq(n_seqs, seqs);
|
||||
fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs);
|
||||
}
|
||||
|
||||
// destroy
|
||||
bwt_destroy(bwt);
|
||||
bwa_seq_close(ks);
|
||||
}
|
||||
|
||||
int bwa_aln(int argc, char *argv[])
|
||||
{
|
||||
int c, opte = -1;
|
||||
gap_opt_t *opt;
|
||||
char *prefix;
|
||||
|
||||
opt = gap_init_opt();
|
||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
||||
else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
|
||||
break;
|
||||
case 'o': opt->max_gapo = atoi(optarg); break;
|
||||
case 'e': opte = atoi(optarg); break;
|
||||
case 'M': opt->s_mm = atoi(optarg); break;
|
||||
case 'O': opt->s_gapo = atoi(optarg); break;
|
||||
case 'E': opt->s_gape = atoi(optarg); break;
|
||||
case 'd': opt->max_del_occ = atoi(optarg); break;
|
||||
case 'i': opt->indel_end_skip = atoi(optarg); break;
|
||||
case 'l': opt->seed_len = atoi(optarg); break;
|
||||
case 'k': opt->max_seed_diff = atoi(optarg); break;
|
||||
case 'm': opt->max_entries = atoi(optarg); break;
|
||||
case 't': opt->n_threads = atoi(optarg); break;
|
||||
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
|
||||
case 'R': opt->max_top2 = atoi(optarg); break;
|
||||
case 'q': opt->trim_qual = atoi(optarg); break;
|
||||
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
|
||||
case 'f': xreopen(optarg, "wb", stdout); break;
|
||||
case 'b': opt->mode |= BWA_MODE_BAM; break;
|
||||
case '0': opt->mode |= BWA_MODE_BAM_SE; break;
|
||||
case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
|
||||
case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
|
||||
case 'I': opt->mode |= BWA_MODE_IL13; break;
|
||||
case 'Y': opt->mode |= BWA_MODE_CFY; break;
|
||||
case 'B': opt->mode |= atoi(optarg) << 24; break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (opte > 0) {
|
||||
opt->max_gape = opte;
|
||||
opt->mode &= ~BWA_MODE_GAPE;
|
||||
}
|
||||
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
|
||||
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
|
||||
BWA_AVG_ERR, opt->fnr);
|
||||
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
|
||||
fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
|
||||
fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
|
||||
fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
|
||||
fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
|
||||
fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
|
||||
fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
|
||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
|
||||
fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
|
||||
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
|
||||
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
||||
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
||||
fprintf(stderr, " -B INT length of barcode\n");
|
||||
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
||||
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
||||
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
|
||||
fprintf(stderr, " -b the input read file is in the BAM format\n");
|
||||
fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
|
||||
fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
|
||||
fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
|
||||
fprintf(stderr, " -Y filter Casava-filtered sequences\n");
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
if (opt->fnr > 0.0) {
|
||||
int i, k;
|
||||
for (i = 17, k = 0; i <= 250; ++i) {
|
||||
int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
|
||||
if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
|
||||
k = l;
|
||||
}
|
||||
}
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
bwa_aln_core(prefix, argv[optind+1], opt);
|
||||
free(opt); free(prefix);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
#ifndef BWTALN_H
|
||||
#define BWTALN_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "bwt.h"
|
||||
|
||||
#define BWA_TYPE_NO_MATCH 0
|
||||
#define BWA_TYPE_UNIQUE 1
|
||||
#define BWA_TYPE_REPEAT 2
|
||||
#define BWA_TYPE_MATESW 3
|
||||
|
||||
#define SAM_FPD 1 // paired
|
||||
#define SAM_FPP 2 // properly paired
|
||||
#define SAM_FSU 4 // self-unmapped
|
||||
#define SAM_FMU 8 // mate-unmapped
|
||||
#define SAM_FSR 16 // self on the reverse strand
|
||||
#define SAM_FMR 32 // mate on the reverse strand
|
||||
#define SAM_FR1 64 // this is read one
|
||||
#define SAM_FR2 128 // this is read two
|
||||
#define SAM_FSC 256 // secondary alignment
|
||||
|
||||
#define BWA_AVG_ERR 0.02
|
||||
#define BWA_MIN_RDLEN 35 // for read trimming
|
||||
|
||||
#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum
|
||||
|
||||
#ifndef bns_pac
|
||||
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
|
||||
#endif
|
||||
|
||||
#define FROM_M 0
|
||||
#define FROM_I 1
|
||||
#define FROM_D 2
|
||||
#define FROM_S 3
|
||||
|
||||
#define SAI_MAGIC "SAI\1"
|
||||
|
||||
typedef struct {
|
||||
bwtint_t w;
|
||||
int bid;
|
||||
} bwt_width_t;
|
||||
|
||||
typedef struct {
|
||||
uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10;
|
||||
bwtint_t k, l;
|
||||
} bwt_aln1_t;
|
||||
|
||||
typedef uint16_t bwa_cigar_t;
|
||||
/* rgoya: If changing order of bytes, beware of operations like:
|
||||
* s->cigar[0] += s->full_len - s->len;
|
||||
*/
|
||||
#define CIGAR_OP_SHIFT 14
|
||||
#define CIGAR_LN_MASK 0x3fff
|
||||
|
||||
#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT)
|
||||
#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK)
|
||||
#define __cigar_create(__op, __len) ((__op)<<CIGAR_OP_SHIFT | (__len))
|
||||
|
||||
typedef struct {
|
||||
uint32_t n_cigar:15, gap:8, mm:8, strand:1;
|
||||
int ref_shift;
|
||||
bwtint_t pos;
|
||||
bwa_cigar_t *cigar;
|
||||
} bwt_multi1_t;
|
||||
|
||||
typedef struct {
|
||||
char *name;
|
||||
ubyte_t *seq, *rseq, *qual;
|
||||
uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
|
||||
uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
|
||||
int score;
|
||||
int clip_len;
|
||||
// alignments in SA coordinates
|
||||
int n_aln;
|
||||
bwt_aln1_t *aln;
|
||||
// multiple hits
|
||||
int n_multi;
|
||||
bwt_multi1_t *multi;
|
||||
// alignment information
|
||||
bwtint_t sa, pos;
|
||||
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
|
||||
int ref_shift;
|
||||
int n_cigar;
|
||||
bwa_cigar_t *cigar;
|
||||
// for multi-threading only
|
||||
int tid;
|
||||
// barcode
|
||||
char bc[BWA_MAX_BCLEN+1]; // null terminated; up to BWA_MAX_BCLEN bases
|
||||
// NM and MD tags
|
||||
uint32_t full_len:20, nm:12;
|
||||
char *md;
|
||||
} bwa_seq_t;
|
||||
|
||||
#define BWA_MODE_GAPE 0x01
|
||||
#define BWA_MODE_COMPREAD 0x02
|
||||
#define BWA_MODE_LOGGAP 0x04
|
||||
#define BWA_MODE_CFY 0x08
|
||||
#define BWA_MODE_NONSTOP 0x10
|
||||
#define BWA_MODE_BAM 0x20
|
||||
#define BWA_MODE_BAM_SE 0x40
|
||||
#define BWA_MODE_BAM_READ1 0x80
|
||||
#define BWA_MODE_BAM_READ2 0x100
|
||||
#define BWA_MODE_IL13 0x200
|
||||
|
||||
/* Options of the `aln` command. bwa_aln_core() writes this struct verbatim
 * into the output header, so field order and types must not change. */
typedef struct {
	int s_mm;           // mismatch penalty
	int s_gapo;         // gap open penalty
	int s_gape;         // gap extension penalty
	int mode;           // BWA_MODE_* flags; bit 24-31 are the barcode length
	int indel_end_skip; // do not put an indel within this many bp of the ends
	int max_del_occ;    // maximum occurrences for extending a long deletion
	int max_entries;    // maximum entries in the queue
	float fnr;          // missing probability used to derive max_diff when > 0
	int max_diff;       // maximum number of differences
	int max_gapo;       // maximum number of gap opens
	int max_gape;       // maximum number of gap extensions
	int max_seed_diff;  // maximum differences in the seed
	int seed_len;       // seed length
	int n_threads;      // number of threads
	int max_top2;       // stop searching when there are more equally best hits than this
	int trim_qual;      // quality threshold for read trimming
} gap_opt_t;
|
||||
|
||||
#define BWA_PET_STD 1
|
||||
|
||||
/* Paired-end options (not used by the code in this file). */
typedef struct {
	int max_isize;
	int force_isize;
	int max_occ;
	int n_multi;
	int N_multi;
	int type;
	int is_sw;
	int is_preload;
	double ap_prior;
} pe_opt_t;
|
||||
|
||||
struct __bwa_seqio_t;
|
||||
typedef struct __bwa_seqio_t bwa_seqio_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
gap_opt_t *gap_init_opt();
|
||||
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);
|
||||
|
||||
bwa_seqio_t *bwa_seq_open(const char *fn);
|
||||
bwa_seqio_t *bwa_bam_open(const char *fn, int which);
|
||||
void bwa_seq_close(bwa_seqio_t *bs);
|
||||
void seq_reverse(int len, ubyte_t *seq, int is_comp);
|
||||
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual);
|
||||
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
|
||||
|
||||
int bwa_cal_maxdiff(int l, double err, double thres);
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
|
||||
|
||||
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,264 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "bwtgap.h"
|
||||
#include "bwtaln.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
#define STATE_M 0
|
||||
#define STATE_I 1
|
||||
#define STATE_D 2
|
||||
|
||||
#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape)
|
||||
|
||||
gap_stack_t *gap_init_stack2(int max_score)
|
||||
{
|
||||
gap_stack_t *stack;
|
||||
stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t));
|
||||
stack->n_stacks = max_score;
|
||||
stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t));
|
||||
return stack;
|
||||
}
|
||||
|
||||
/* Allocate a priority stack whose bucket count is the score of
 * (max_mm+1, max_gapo+1, max_gape+1) under the penalties in opt. */
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
{
	const int n_buckets = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
	return gap_init_stack2(n_buckets);
}
|
||||
|
||||
/* Free every bucket's entry array, then the bucket table and the stack itself. */
void gap_destroy_stack(gap_stack_t *stack)
{
	int b;
	for (b = 0; b != stack->n_stacks; ++b) {
		gap_stack1_t *bucket = &stack->stacks[b];
		free(bucket->stack);
	}
	free(stack->stacks);
	free(stack);
}
|
||||
|
||||
/* Empty all buckets without releasing their storage. `best` is set to
 * n_stacks, the value used throughout for "no entry". */
static void gap_reset_stack(gap_stack_t *stack)
{
	int b;
	stack->n_entries = 0;
	stack->best = stack->n_stacks;
	for (b = 0; b != stack->n_stacks; ++b)
		stack->stacks[b].n_entries = 0;
}
|
||||
|
||||
/**
 * Push a partial alignment onto the bucket matching its score.
 *
 * The score is derived from n_mm/n_gapo/n_gape under the penalties in opt
 * and used directly as the bucket index; the bucket grows by doubling
 * (starting at 4 entries). `info` packs the score above bit 21 and the
 * position i in the low bits. `best` is lowered if this score beats it.
 */
static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del,
							int state, int is_diff, const gap_opt_t *opt)
{
	const int score = aln_score(n_mm, n_gapo, n_gape, opt);
	gap_stack1_t *bucket = stack->stacks + score;
	gap_entry_t *slot;

	if (bucket->n_entries == bucket->m_entries) { // bucket full: grow it
		if (bucket->m_entries) bucket->m_entries = bucket->m_entries<<1;
		else bucket->m_entries = 4;
		bucket->stack = (gap_entry_t*)realloc(bucket->stack, sizeof(gap_entry_t) * bucket->m_entries);
	}

	slot = bucket->stack + bucket->n_entries;
	slot->info = (uint32_t)score<<21 | i;
	slot->k = k;
	slot->l = l;
	slot->n_mm = n_mm;
	slot->n_gapo = n_gapo;
	slot->n_gape = n_gape;
	slot->n_ins = n_ins;
	slot->n_del = n_del;
	slot->state = state;
	if (is_diff) slot->last_diff_pos = i;
	else slot->last_diff_pos = 0;

	++(bucket->n_entries);
	++(stack->n_entries);
	if (stack->best > score) stack->best = score;
}
|
||||
|
||||
/**
 * Pop the most recently pushed entry of the best (lowest-score) bucket into *e.
 *
 * Afterwards `best` is n_stacks if the whole stack is empty, or the next
 * non-empty bucket if the current one has just been drained. The caller
 * must ensure the stack is not empty.
 */
static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e)
{
	gap_stack1_t *bucket = stack->stacks + stack->best;

	--(bucket->n_entries);
	*e = bucket->stack[bucket->n_entries];
	--(stack->n_entries);

	if (stack->n_entries == 0) {
		stack->best = stack->n_stacks;
	} else if (bucket->n_entries == 0) { // reset best
		int next = stack->best + 1;
		while (next < stack->n_stacks && stack->stacks[next].n_entries == 0)
			++next;
		stack->best = next;
	}
}
|
||||
|
||||
/**
 * Adjust the width bounds of positions before last_diff_pos after a hit
 * covering x SA rows has been recorded.
 *
 * Entries wider than x are shrunk by x. Entries exactly x wide get bid = 1
 * and a width of max-1, max-2, ... in order of appearance. Narrower entries
 * are left untouched (should not happen). The len parameter is unused.
 */
static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w)
{
	int pos, n_exact = 0;
	for (pos = 0; pos < last_diff_pos; ++pos) {
		bwt_width_t *cur = &w[pos];
		if (cur->w > x) {
			cur->w -= x;
		} else if (cur->w == x) {
			cur->bid = 1;
			++n_exact;
			cur->w = max - n_exact;
		}
	}
}
|
||||
|
||||
/* Return floor(log2(v)), i.e. the index of the highest set bit of v; returns 0 for v == 0. */
static inline int int_log2(uint32_t v)
{
	int bit = 0;
	while (v >>= 1) ++bit;
	return bit;
}
|
||||
|
||||
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width,
|
||||
bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
|
||||
{ // $seq is the reverse complement of the input read
|
||||
int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
|
||||
int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
|
||||
int best_cnt = 0;
|
||||
int max_entries = 0, j, _j, n_aln, m_aln;
|
||||
bwt_aln1_t *aln;
|
||||
|
||||
m_aln = 4; n_aln = 0;
|
||||
aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t));
|
||||
|
||||
// check whether there are too many N
|
||||
for (j = _j = 0; j < len; ++j)
|
||||
if (seq[j] > 3) ++_j;
|
||||
if (_j > max_diff) {
|
||||
*_n_aln = n_aln;
|
||||
return aln;
|
||||
}
|
||||
|
||||
//for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w);
|
||||
gap_reset_stack(stack); // reset stack
|
||||
gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt);
|
||||
|
||||
while (stack->n_entries) {
|
||||
gap_entry_t e;
|
||||
int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
|
||||
bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
|
||||
|
||||
if (max_entries < stack->n_entries) max_entries = stack->n_entries;
|
||||
if (stack->n_entries > opt->max_entries) break;
|
||||
gap_pop(stack, &e); // get the best entry
|
||||
k = e.k; l = e.l; // SA interval
|
||||
i = e.info&0xffff; // length
|
||||
if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed
|
||||
|
||||
m = max_diff - (e.n_mm + e.n_gapo);
|
||||
if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
|
||||
if (m < 0) continue;
|
||||
if (seed_width) { // apply seeding
|
||||
m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
|
||||
if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
|
||||
}
|
||||
//printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos);
|
||||
if (i > 0 && m < width[i-1].bid) continue;
|
||||
|
||||
// check whether a hit is found
|
||||
hit_found = 0;
|
||||
if (i == 0) hit_found = 1;
|
||||
else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
|
||||
if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1;
|
||||
else continue; // no hit, skip
|
||||
}
|
||||
|
||||
if (hit_found) { // action for found hits
|
||||
int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt);
|
||||
int do_add = 1;
|
||||
//printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l);
|
||||
if (n_aln == 0) {
|
||||
best_score = score;
|
||||
best_diff = e.n_mm + e.n_gapo;
|
||||
if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape;
|
||||
if (!(opt->mode & BWA_MODE_NONSTOP))
|
||||
max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour
|
||||
}
|
||||
if (score == best_score) best_cnt += l - k + 1;
|
||||
else if (best_cnt > opt->max_top2) break; // top2b behaviour
|
||||
if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat
|
||||
for (j = 0; j != n_aln; ++j)
|
||||
if (aln[j].k == k && aln[j].l == l) break;
|
||||
if (j < n_aln) do_add = 0;
|
||||
}
|
||||
if (do_add) { // append
|
||||
bwt_aln1_t *p;
|
||||
gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width);
|
||||
if (n_aln == m_aln) {
|
||||
m_aln <<= 1;
|
||||
aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t));
|
||||
memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t));
|
||||
}
|
||||
p = aln + n_aln;
|
||||
p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape;
|
||||
p->n_ins = e.n_ins; p->n_del = e.n_del;
|
||||
p->k = k; p->l = l;
|
||||
p->score = score;
|
||||
//fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del);
|
||||
++n_aln;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
--i;
|
||||
bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values
|
||||
occ = l - k + 1;
|
||||
// test whether diff is allowed
|
||||
allow_diff = allow_M = 1;
|
||||
if (i > 0) {
|
||||
int ii = i - (len - opt->seed_len);
|
||||
if (width[i-1].bid > m-1) allow_diff = 0;
|
||||
else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
|
||||
if (seed_width && ii > 0) {
|
||||
if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
|
||||
else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
|
||||
&& seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
|
||||
}
|
||||
}
|
||||
// indels
|
||||
tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape;
|
||||
if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) {
|
||||
if (e.state == STATE_M) { // gap open
|
||||
if (e.n_gapo < opt->max_gapo) { // gap open is allowed
|
||||
// insertion
|
||||
gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt);
|
||||
// deletion
|
||||
for (j = 0; j != 4; ++j) {
|
||||
k = bwt->L2[j] + cnt_k[j] + 1;
|
||||
l = bwt->L2[j] + cnt_l[j];
|
||||
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt);
|
||||
}
|
||||
}
|
||||
} else if (e.state == STATE_I) { // extention of an insertion
|
||||
if (e.n_gape < opt->max_gape) // gap extention is allowed
|
||||
gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt);
|
||||
} else if (e.state == STATE_D) { // extention of a deletion
|
||||
if (e.n_gape < opt->max_gape) { // gap extention is allowed
|
||||
if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
|
||||
for (j = 0; j != 4; ++j) {
|
||||
k = bwt->L2[j] + cnt_k[j] + 1;
|
||||
l = bwt->L2[j] + cnt_l[j];
|
||||
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// mismatches
|
||||
if (allow_diff && allow_M) { // mismatch is allowed
|
||||
for (j = 1; j <= 4; ++j) {
|
||||
int c = (seq[i] + j) & 3;
|
||||
int is_mm = (j != 4 || seq[i] > 3);
|
||||
k = bwt->L2[c] + cnt_k[c] + 1;
|
||||
l = bwt->L2[c] + cnt_l[c];
|
||||
if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt);
|
||||
}
|
||||
} else if (seq[i] < 4) { // try exact match only
|
||||
int c = seq[i] & 3;
|
||||
k = bwt->L2[c] + cnt_k[c] + 1;
|
||||
l = bwt->L2[c] + cnt_l[c];
|
||||
if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt);
|
||||
}
|
||||
}
|
||||
|
||||
*_n_aln = n_aln;
|
||||
//fprintf(stderr, "max_entries = %d\n", max_entries);
|
||||
return aln;
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
#ifndef BWTGAP_H_
|
||||
#define BWTGAP_H_
|
||||
|
||||
#include "bwt.h"
|
||||
#include "bwtaln.h"
|
||||
|
||||
typedef struct { // recursion stack
|
||||
uint32_t info; // score<<21 | i
|
||||
uint32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
|
||||
uint32_t n_ins:16, n_del:16;
|
||||
int last_diff_pos;
|
||||
bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
|
||||
} gap_entry_t;
|
||||
|
||||
typedef struct {
|
||||
int n_entries, m_entries;
|
||||
gap_entry_t *stack;
|
||||
} gap_stack1_t;
|
||||
|
||||
typedef struct {
|
||||
int n_stacks, best, n_entries;
|
||||
gap_stack1_t *stacks;
|
||||
} gap_stack_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
gap_stack_t *gap_init_stack2(int max_score);
|
||||
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
|
||||
void gap_destroy_stack(gap_stack_t *stack);
|
||||
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w,
|
||||
bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,323 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <zlib.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "bwt.h"
|
||||
#include "utils.h"
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
|
||||
#ifdef _DIVBWT
|
||||
#include "divsufsort.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
|
||||
int is_bwt(ubyte_t *T, int n);
|
||||
|
||||
int64_t bwa_seq_len(const char *fn_pac)
|
||||
{
|
||||
FILE *fp;
|
||||
int64_t pac_len;
|
||||
ubyte_t c;
|
||||
fp = xopen(fn_pac, "rb");
|
||||
err_fseek(fp, -1, SEEK_END);
|
||||
pac_len = err_ftell(fp);
|
||||
err_fread_noeof(&c, 1, 1, fp);
|
||||
err_fclose(fp);
|
||||
return (pac_len - 1) * 4 + (int)c;
|
||||
}
|
||||
|
||||
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
||||
{
|
||||
bwt_t *bwt;
|
||||
ubyte_t *buf, *buf2;
|
||||
int64_t i, pac_size;
|
||||
FILE *fp;
|
||||
|
||||
// initialization
|
||||
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
||||
bwt->seq_len = bwa_seq_len(fn_pac);
|
||||
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
||||
fp = xopen(fn_pac, "rb");
|
||||
|
||||
// prepare sequence
|
||||
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
||||
buf2 = (ubyte_t*)calloc(pac_size, 1);
|
||||
err_fread_noeof(buf2, 1, pac_size, fp);
|
||||
err_fclose(fp);
|
||||
memset(bwt->L2, 0, 5 * 4);
|
||||
buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
||||
++bwt->L2[1+buf[i]];
|
||||
}
|
||||
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
||||
free(buf2);
|
||||
|
||||
// Burrows-Wheeler Transform
|
||||
if (use_is) {
|
||||
bwt->primary = is_bwt(buf, bwt->seq_len);
|
||||
} else {
|
||||
rope_t *r;
|
||||
int64_t x;
|
||||
rpitr_t itr;
|
||||
const uint8_t *blk;
|
||||
|
||||
r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
|
||||
for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
|
||||
int c = buf[i] + 1;
|
||||
x = rope_insert_run(r, x, c, 1, 0) + 1;
|
||||
while (--c >= 0) x += r->c[c];
|
||||
}
|
||||
bwt->primary = x;
|
||||
rope_itr_first(r, &itr);
|
||||
x = 0;
|
||||
while ((blk = rope_itr_next_block(&itr)) != 0) {
|
||||
const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
|
||||
while (q < end) {
|
||||
int c = 0;
|
||||
int64_t l;
|
||||
rle_dec1(q, c, l);
|
||||
for (i = 0; i < l; ++i)
|
||||
buf[x++] = c - 1;
|
||||
}
|
||||
}
|
||||
rope_destroy(r);
|
||||
}
|
||||
bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
|
||||
for (i = 0; i < bwt->seq_len; ++i)
|
||||
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
||||
free(buf);
|
||||
return bwt;
|
||||
}
|
||||
|
||||
int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, use_is = 1;
|
||||
while ((c = getopt(argc, argv, "d")) >= 0) {
|
||||
switch (c) {
|
||||
case 'd': use_is = 0; break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_pac2bwt(argv[optind], use_is);
|
||||
bwt_dump_bwt(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
void bwt_bwtupdate_core(bwt_t *bwt)
|
||||
{
|
||||
bwtint_t i, k, c[4], n_occ;
|
||||
uint32_t *buf;
|
||||
|
||||
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
||||
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
|
||||
buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
|
||||
c[0] = c[1] = c[2] = c[3] = 0;
|
||||
for (i = k = 0; i < bwt->seq_len; ++i) {
|
||||
if (i % OCC_INTERVAL == 0) {
|
||||
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
||||
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
|
||||
}
|
||||
if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
|
||||
++c[bwt_B00(bwt, i)];
|
||||
}
|
||||
// the last element
|
||||
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
||||
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
|
||||
// update bwt
|
||||
free(bwt->bwt); bwt->bwt = buf;
|
||||
}
|
||||
|
||||
int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[1]);
|
||||
bwt_bwtupdate_core(bwt);
|
||||
bwt_dump_bwt(argv[1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, sa_intv = 32;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i': sa_intv = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
bwt_cal_sa(bwt, sa_intv);
|
||||
bwt_dump_sa(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_index(int argc, char *argv[]) // the "index" command
|
||||
{
|
||||
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
|
||||
char *prefix = 0, *str;
|
||||
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'a': // if -a is not set, algo_type will be determined later
|
||||
if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2;
|
||||
else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW;
|
||||
else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS;
|
||||
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
||||
break;
|
||||
case 'p': prefix = strdup(optarg); break;
|
||||
case '6': is_64 = 1; break;
|
||||
case 'b':
|
||||
block_size = strtol(optarg, &str, 10);
|
||||
if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
|
||||
else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
|
||||
else if (*str == 'K' || *str == 'k') block_size *= 1024;
|
||||
break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (optind + 1 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
|
||||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
|
||||
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
||||
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
|
||||
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
|
||||
fprintf(stderr, " `-a div' do not work not for long genomes.\n\n");
|
||||
return 1;
|
||||
}
|
||||
if (prefix == 0) {
|
||||
prefix = malloc(strlen(argv[optind]) + 4);
|
||||
strcpy(prefix, argv[optind]);
|
||||
if (is_64) strcat(prefix, ".64");
|
||||
}
|
||||
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
|
||||
free(prefix);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size)
|
||||
{
|
||||
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
||||
|
||||
char *str, *str2, *str3;
|
||||
clock_t t;
|
||||
int64_t l_pac;
|
||||
|
||||
str = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
str2 = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
str3 = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
|
||||
{ // nucleotide indexing
|
||||
gzFile fp = xzopen(fa, "r");
|
||||
t = clock();
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
err_gzclose(fp);
|
||||
}
|
||||
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
|
||||
{
|
||||
strcpy(str, prefix); strcat(str, ".pac");
|
||||
strcpy(str2, prefix); strcat(str2, ".bwt");
|
||||
t = clock();
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
|
||||
if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
|
||||
else if (algo_type == 1 || algo_type == 3) {
|
||||
bwt_t *bwt;
|
||||
bwt = bwt_pac2bwt(str, algo_type == 3);
|
||||
bwt_dump_bwt(str2, bwt);
|
||||
bwt_destroy(bwt);
|
||||
}
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
}
|
||||
{
|
||||
bwt_t *bwt;
|
||||
strcpy(str, prefix); strcat(str, ".bwt");
|
||||
t = clock();
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... ");
|
||||
bwt = bwt_restore_bwt(str);
|
||||
bwt_bwtupdate_core(bwt);
|
||||
bwt_dump_bwt(str, bwt);
|
||||
bwt_destroy(bwt);
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
}
|
||||
{
|
||||
gzFile fp = xzopen(fa, "r");
|
||||
t = clock();
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, prefix, 1);
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
err_gzclose(fp);
|
||||
}
|
||||
{
|
||||
bwt_t *bwt;
|
||||
strcpy(str, prefix); strcat(str, ".bwt");
|
||||
strcpy(str3, prefix); strcat(str3, ".sa");
|
||||
t = clock();
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
|
||||
bwt = bwt_restore_bwt(str);
|
||||
bwt_cal_sa(bwt, 32);
|
||||
bwt_dump_sa(str3, bwt);
|
||||
bwt_destroy(bwt);
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
}
|
||||
free(str3); free(str2); free(str);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
#ifndef LH3_BWTSW2_H
|
||||
#define LH3_BWTSW2_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwt_lite.h"
|
||||
#include "bwt.h"
|
||||
|
||||
#define BSW2_FLAG_MATESW 0x100
|
||||
#define BSW2_FLAG_TANDEM 0x200
|
||||
#define BSW2_FLAG_MOVED 0x400
|
||||
#define BSW2_FLAG_RESCUED 0x800
|
||||
|
||||
/* Options of the BWT-SW2 aligner; bsw2_init_opt() supplies the defaults. */
typedef struct {
	/* three flags packed into one int */
	int skip_sw:8;
	int cpy_cmt:8;
	int hard_clip:16;
	/* scoring: a and b feed bwa_fill_scmat(); q and r are passed to ksw_extend()
	 * next to bw; bsw2_init_opt() sets qr to q + r */
	int a;
	int b;
	int q;
	int r;
	int t;
	int qr;
	int bw;
	int max_ins;
	int max_chain_gap;
	int z;
	int is;
	int t_seeds;
	int multi_2nd;
	float mask_level; // handed to bsw2_resolve_query_overlaps()
	float coef;
	int n_threads;
	int chunk_size;
} bsw2opt_t;
|
||||
|
||||
typedef struct {
|
||||
bwtint_t k, l;
|
||||
uint32_t flag:18, n_seeds:13, is_rev:1;
|
||||
int len, G, G2;
|
||||
int beg, end;
|
||||
} bsw2hit_t;
|
||||
|
||||
/* Per-hit auxiliary data; gen_cigar() fills cigar, n_cigar and nm. */
typedef struct {
	int flag;
	int nn;
	int n_cigar;     // number of entries in cigar[]
	int chr;
	int pos;
	int qual;
	int mchr;
	int mpos;
	int pqual;
	int isize;
	int nm;
	uint32_t *cigar; // heap array of length<<4|op entries; released by bsw2_destroy()
} bsw2aux_t;
|
||||
|
||||
typedef struct {
|
||||
int n, max;
|
||||
bsw2hit_t *hits;
|
||||
bsw2aux_t *aux;
|
||||
} bwtsw2_t;
|
||||
|
||||
/* Scratch memory handed to the aligner as a pool. */
typedef struct {
	void *stack;
	int max_l;
	uint8_t *aln_mem; // forwarded to the left/right extension routines as their last argument
} bsw2global_t;
|
||||
|
||||
/* One read with its textual fields. */
typedef struct {
	int l;         // NOTE(review): presumably the sequence length -- confirm
	int tid;
	char *name;
	char *seq;
	char *qual;
	char *sam;
	char *comment;
} bsw2seq1_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bsw2opt_t *bsw2_init_opt();
|
||||
bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool);
|
||||
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2);
|
||||
void bsw2_destroy(bwtsw2_t *b);
|
||||
|
||||
bsw2global_t *bsw2_global_init();
|
||||
void bsw2_global_destroy(bsw2global_t *_pool);
|
||||
|
||||
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,776 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
#ifdef HAVE_PTHREAD
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#include "bntseq.h"
|
||||
#include "bwt_lite.h"
|
||||
#include "utils.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
#include "ksort.h"
|
||||
#define __left_lt(a, b) ((a).end > (b).end)
|
||||
KSORT_INIT(hit, bsw2hit_t, __left_lt)
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
|
||||
/*
 * Complement lookup indexed by character code. Letters with an entry map to
 * their complement in the same case (A<->T, C<->G, R<->Y, K<->M, B<->V, D<->H;
 * S, W and X map to themselves); every other code maps to 'N', or to 'n' in
 * the 0x60-0x7a range.
 */
unsigned char nt_comp_table[256] = {
	/* 0x00 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x10 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x20 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x30 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x40: @ A B C  D E F G  H I J K  L M N O */
	           'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
	/* 0x50: P Q R S  T U V W  X Y Z ... */
	           'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
	/* 0x60: ` a b c  d e f g  h i j k  l m n o */
	           'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n',
	/* 0x70: p q r s  t u v w  x y z ... */
	           'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N',
	/* 0x80 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x90 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xa0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xb0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xc0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xd0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xe0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xf0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
};
|
||||
|
||||
extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
|
||||
extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level);
|
||||
|
||||
bsw2opt_t *bsw2_init_opt()
|
||||
{
|
||||
bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t));
|
||||
o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30;
|
||||
o->bw = 50;
|
||||
o->max_ins = 20000;
|
||||
o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0;
|
||||
o->mask_level = 0.50f; o->coef = 5.5f;
|
||||
o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
|
||||
o->max_chain_gap = 10000;
|
||||
o->cpy_cmt = 0;
|
||||
return o;
|
||||
}
|
||||
|
||||
/*
 * Release a hit list: every aux[i].cigar (when aux is present), then the aux
 * array, the hits array and the list itself. A NULL list is accepted.
 */
void bsw2_destroy(bwtsw2_t *b)
{
	if (b != 0) {
		if (b->aux != 0) {
			int idx = 0;
			while (idx < b->n) {
				free(b->aux[idx].cigar);
				++idx;
			}
		}
		free(b->aux);
		free(b->hits);
		free(b);
	}
}
|
||||
|
||||
bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
|
||||
{
|
||||
bwtsw2_t *p;
|
||||
p = calloc(1, sizeof(bwtsw2_t));
|
||||
p->max = p->n = b->n;
|
||||
if (b->n) {
|
||||
kroundup32(p->max);
|
||||
p->hits = calloc(p->max, sizeof(bsw2hit_t));
|
||||
memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t));
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
 * Fill an alignment-parameter object `par` from the options pointed to by `opt`:
 * a 5x5 scoring matrix with (opt)->a on the first four diagonal cells and
 * -(opt)->b everywhere else, gap open/extension/end penalties, row = 5 and
 * the band width.
 *
 * Every use of `opt` is parenthesised so that any pointer expression may be
 * passed, and the loop counter has a name that is unlikely to capture an
 * identifier used in the arguments.
 */
#define __gen_ap(par, opt) do {														\
		int gen_ap_idx_;															\
		for (gen_ap_idx_ = 0; gen_ap_idx_ < 25; ++gen_ap_idx_)						\
			(par).matrix[gen_ap_idx_] = -(opt)->b;									\
		for (gen_ap_idx_ = 0; gen_ap_idx_ < 4; ++gen_ap_idx_)						\
			(par).matrix[gen_ap_idx_*5+gen_ap_idx_] = (opt)->a;						\
		(par).gap_open = (opt)->q; (par).gap_ext = (opt)->r;						\
		(par).gap_end = (opt)->r;													\
		(par).row = 5; (par).band_width = (opt)->bw;								\
	} while (0)
|
||||
|
||||
/*
 * Try to extend every hit of `b` to the left, i.e. towards smaller query and
 * reference coordinates.
 *
 * The hits are first sorted with the "hit" ordering (descending query end).
 * Each hit gets n_seeds = 1. A hit is skipped when its l is non-zero, when its
 * k is zero, or when an earlier hit in the sorted order contains it; each such
 * containing hit has its n_seeds increased (up to (1<<13)-2). For the rest,
 * the reversed query prefix is aligned with ksw_extend() against the reference
 * bases preceding k, and G, k, len and beg are updated when the score improves.
 *
 * _query/lq: query bases and their number; pac: 2-bit packed reference.
 * l_pac and _mem are not used by this function.
 */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
	int8_t scmat[25];
	uint8_t *rev_query, *ref_buf = 0;
	int hi;

	bwa_fill_scmat(opt->a, opt->b, scmat);
	rev_query = calloc(lq, 1);
	// sort according to the descending order of query end
	ks_introsort(hit, b->n, b->hits);
	ref_buf = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
	// reverse _query
	for (hi = 0; hi < lq; ++hi) rev_query[lq - 1 - hi] = _query[hi];
	// core loop
	for (hi = 0; hi < b->n; ++hi) {
		bsw2hit_t *cur = &b->hits[hi];
		int max_ref = ((cur->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
		int n_cover = 0, n_ref = 0, prev, ext_score, qle, tle;
		bwtint_t pos;

		cur->n_seeds = 1;
		if (cur->l != 0 || cur->k == 0) continue;
		// every earlier hit that contains this one is credited with one more seed
		for (prev = 0; prev < hi; ++prev) {
			bsw2hit_t *other = &b->hits[prev];
			if (other->beg > cur->beg || other->k > cur->k) continue;
			if (other->k + other->len < cur->k + cur->len) continue;
			if (other->n_seeds < (1<<13) - 2) ++other->n_seeds;
			++n_cover;
		}
		if (n_cover != 0) continue; // contained in another hit: no extension
		if (max_ref > cur->k) max_ref = cur->k;
		// collect the reference bases to the left of k, nearest first
		for (pos = cur->k - 1; pos > 0 && n_ref < max_ref; --pos) // FIXME: k=0 not considered!
			ref_buf[n_ref++] = (pac[pos>>2] >> ((~pos & 3) * 2)) & 0x3;
		ext_score = ksw_extend(cur->beg, &rev_query[lq - cur->beg], n_ref, ref_buf, 5, scmat, opt->q, opt->r, opt->bw, 0, -1, cur->G, &qle, &tle, 0, 0, 0);
		if (ext_score <= cur->G) continue; // not extensible
		cur->G = ext_score;
		cur->k -= tle;
		cur->len += tle;
		cur->beg -= qle;
	}
	free(rev_query);
	free(ref_buf);
}
|
||||
|
||||
/*
 * Re-extend every hit of `b` with a zero l to the right, starting from its
 * (beg, k) anchor.
 *
 * The query suffix starting at beg is aligned with ksw_extend() (initial score 1,
 * subtracted again afterwards) against the reference bases from k onwards,
 * never reading at or past l_pac. When the result is at least G, the hit's G,
 * len and end are replaced.
 *
 * _mem is not used by this function.
 */
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
	int8_t scmat[25];
	uint8_t *ref_buf;
	int hi;

	bwa_fill_scmat(opt->a, opt->b, scmat);
	ref_buf = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
	for (hi = 0; hi < b->n; ++hi) {
		bsw2hit_t *cur = &b->hits[hi];
		int max_ref = ((lq - cur->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
		int n_ref = 0, ext_score, qle, tle;
		bwtint_t pos;

		if (cur->l != 0) continue;
		// collect the reference bases from k onwards
		for (pos = cur->k; pos < cur->k + max_ref && pos < l_pac; ++pos)
			ref_buf[n_ref++] = (pac[pos>>2] >> ((~pos & 3) * 2)) & 0x3;
		ext_score = ksw_extend(lq - cur->beg, &query[cur->beg], n_ref, ref_buf, 5, scmat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1;
		if (ext_score < cur->G) continue; // keep the hit as it is
		cur->G = ext_score;
		cur->len = tle;
		cur->end = cur->beg + qle;
	}
	free(ref_buf);
}
|
||||
|
||||
/* generate the CIGAR array of each hit and store it in b->aux[i].cigar */
|
||||
/*
 * Compute the CIGAR of every hit with a zero l and store it, together with
 * n_cigar and nm, in the matching b->aux[] record.
 *
 * lq:   query length.
 * seq:  seq[0] is used for hits without flag 0x10, seq[1] for hits with it;
 *       for the latter the query interval is mirrored (lq - end, lq - beg).
 * name: only referenced by the disabled debugging code.
 *
 * When the aligned interval does not cover the whole query, soft-clip entries
 * (length<<4 | 4) are added in front and/or at the end of the CIGAR.
 */
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name)
{
	int8_t scmat[25];
	int hi;

	bwa_fill_scmat(opt->a, opt->b, scmat);
	for (hi = 0; hi < b->n; ++hi) {
		bsw2hit_t *hit = &b->hits[hi];
		bsw2aux_t *aux = &b->aux[hi];
		uint8_t *query;
		int beg, end, score;

		if (hit->l != 0) continue;
		if (hit->flag & 0x10) {
			beg = lq - hit->end;
			end = lq - hit->beg;
			query = seq[1] + beg;
		} else {
			beg = hit->beg;
			end = hit->end;
			query = seq[0] + beg;
		}
		aux->cigar = bwa_gen_cigar(scmat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, hit->k, hit->k + hit->len, &score, &aux->n_cigar, &aux->nm);
#if 0
		if (name && score != hit->G) { // debugging only
			int j, glen = 0;
			for (j = 0; j < aux->n_cigar; ++j)
				if ((aux->cigar[j]&0xf) == 1 || (aux->cigar[j]&0xf) == 2)
					glen += aux->cigar[j]>>4;
			fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n",
					__func__, name, score, hit->G, lq, end - beg, hit->len, glen, opt->bw);
		}
#endif
		if (aux->cigar == 0) continue;
		if (beg == 0 && end >= lq) continue; // the whole query is aligned: nothing to clip
		// write soft clipping; two spare slots cover a clip on either side
		aux->cigar = realloc(aux->cigar, 4 * (aux->n_cigar + 2));
		if (beg != 0) {
			memmove(aux->cigar + 1, aux->cigar, aux->n_cigar * 4);
			aux->cigar[0] = beg<<4 | 4;
			++aux->n_cigar;
		}
		if (end < lq) {
			aux->cigar[aux->n_cigar] = (lq - end)<<4 | 4;
			++aux->n_cigar;
		}
	}
}
|
||||
|
||||
/* for debugging purposes only */
|
||||
void bsw2_debug_hits(const bwtsw2_t *b)
|
||||
{
|
||||
int i;
|
||||
printf("# raw hits: %d\n", b->n);
|
||||
for (i = 0; i < b->n; ++i) {
|
||||
bsw2hit_t *p = b->hits + i;
|
||||
if (p->G > 0)
|
||||
printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Append the hits of b[1] to b[0], then destroy b[1] and set b[1] to NULL.
 *
 * l:          query length, used to mirror intervals.
 * is_reverse: when non-zero, each appended hit has its query interval
 *             mirrored ([beg,end) -> [l-end, l-beg)) and flag 0x10 set.
 */
static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse)
{
	bwtsw2_t *dst = b[0], *src = b[1];
	int idx;

	if (dst->n + src->n > dst->max) { // grow to exactly the size needed
		dst->max = dst->n + src->n;
		dst->hits = realloc(dst->hits, dst->max * sizeof(bsw2hit_t));
	}
	for (idx = 0; idx < src->n; ++idx) {
		bsw2hit_t *slot = &dst->hits[dst->n + idx];
		*slot = src->hits[idx];
		if (!is_reverse) continue;
		{
			int old_beg = slot->beg;
			slot->beg = l - slot->end;
			slot->end = l - old_beg;
			slot->flag |= 0x10;
		}
	}
	dst->n += src->n;
	bsw2_destroy(src);
	b[1] = 0;
}
|
||||
/* seq[0] is the forward sequence and seq[1] is the reverse complement. */
|
||||
/*
 * Align one query and return its merged list of hits.
 *
 * seq[0] is the forward sequence and seq[1] is the reverse complement; `l` is
 * their length. The function
 *   1. runs bsw2_core() on a BWT built from seq[0];
 *   2. splits the two returned lists by the is_rev bit of each hit, mirroring
 *      the query interval of reverse hits;
 *   3. chain-filters the second ("narrow SA") list of each strand;
 *   4. per strand: extends the narrow list to the left, merges it into the
 *      other list, then resolves duplicates, extends to the right and
 *      resolves duplicates again;
 *   5. merges the reverse strand into the forward one (flagging those hits
 *      with 0x10) and resolves overlaps on the query.
 *
 * The caller owns the returned list.
 */
static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
								int l, uint8_t *seq[2], bsw2global_t *pool)
{
	extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
	bwtsw2_t *b[2], **bb[2], **raw;
	bwtl_t *query;
	int strand, cls, h;

	query = bwtl_seq2bwtl(l, seq[0]);
	raw = bsw2_core(bns, opt, query, target, pool);
	bwtl_destroy(query);
	// bb[strand][cls]: one empty list per strand and per list returned by bsw2_core()
	for (strand = 0; strand < 2; ++strand) {
		bb[strand] = calloc(2, sizeof(void*));
		for (cls = 0; cls < 2; ++cls)
			bb[strand][cls] = calloc(1, sizeof(bwtsw2_t));
	}
	// separate raw[] into bb[2] based on the strand
	for (cls = 0; cls < 2; ++cls) {
		for (h = 0; h < raw[cls]->n; ++h) {
			const bsw2hit_t *from = &raw[cls]->hits[h];
			bwtsw2_t *dst = bb[from->is_rev][cls];
			bsw2hit_t *to;

			if (dst->n == dst->max) { // full: start at 8 slots, then double
				dst->max = dst->max? dst->max<<1 : 8;
				dst->hits = realloc(dst->hits, dst->max * sizeof(bsw2hit_t));
			}
			to = &dst->hits[dst->n++];
			*to = *from;
			if (from->is_rev) { // mirror the query interval
				int old_beg = to->beg;
				to->beg = l - to->end;
				to->end = l - old_beg;
			}
		}
	}
	// bb[*][1] are "narrow SA hits"
	b[0] = bb[0][1];
	b[1] = bb[1][1];
	bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained
	for (strand = 0; strand < 2; ++strand) {
		bwtsw2_t **lists = bb[strand];
		bsw2_extend_left(opt, lists[1], seq[strand], l, pac, bns->l_pac, pool->aln_mem);
		merge_hits(lists, l, 0); // lists[1] is merged to lists[0] here
		bsw2_resolve_duphits(0, 0, lists[0], 0);
		bsw2_extend_rght(opt, lists[0], seq[strand], l, pac, bns->l_pac, pool->aln_mem);
		bsw2_resolve_duphits(0, 0, lists[0], 0);
		b[strand] = lists[0];
		free(lists);
	}
	merge_hits(b, l, 1); // again, b[1] is merged to b[0]
	bsw2_resolve_query_overlaps(b[0], opt->mask_level);
	bsw2_destroy(raw[0]);
	bsw2_destroy(raw[1]);
	free(raw);
	return b[0];
}
|
||||
|
||||
/* set ->flag to record the origin of the hit (forward bwt or reverse bwt) */
|
||||
/*
 * Tag the hits with the list they come from: 0x10000 for hits of b[0],
 * 0x20000 for hits of b[1]. A hit of b[0] that has an identical counterpart
 * in b[1] (same beg, end, k, len and G) gets 0x30000, and so does the first
 * such counterpart.
 */
static void flag_fr(bwtsw2_t *b[2])
{
	bwtsw2_t *fwd = b[0], *rev = b[1];
	int fi, ri;

	for (ri = 0; ri < rev->n; ++ri)
		rev->hits[ri].flag |= 0x20000;
	for (fi = 0; fi < fwd->n; ++fi) {
		bsw2hit_t *f = &fwd->hits[fi];
		f->flag |= 0x10000;
		for (ri = 0; ri < rev->n; ++ri) {
			bsw2hit_t *r = &rev->hits[ri];
			if (r->beg != f->beg || r->end != f->end) continue;
			if (r->k != f->k || r->len != f->len || r->G != f->G) continue;
			r->flag |= 0x30000;
			f->flag |= 0x30000;
			break; // only the first identical hit is paired
		}
	}
}
|
||||
|
||||
typedef struct {
|
||||
int n, max;
|
||||
bsw2seq1_t *seq;
|
||||
} bsw2seq_t;
|
||||
|
||||
/*
 * Trim an alignment that runs past the end of its reference sequence.
 *
 * The CIGAR (length<<4 | op entries; ops 0 = M, 1 = I, 2 = D, 4/5 = clipping)
 * is walked from the hit's position within the sequence that
 * bns_cnt_ambi() reports for it. If the alignment ends beyond that sequence,
 * it is cut at the boundary into two pieces, each padded with a soft clip for
 * the part of the query it loses. The piece with more matched query bases is
 * kept: cigar[] is overwritten and p->len (and, for the second piece, p->k)
 * is updated.
 *
 * Returns the new number of CIGAR entries, or n_cigar unchanged when nothing
 * had to be fixed.
 *
 * FIXME: this routine does not work if the query bridge three reference sequences
 */
static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
{
	int32_t coor, refl, lq;
	int ref_pos, qry_pos, ci, seqid;
	int n_out, split, n_match[2], ref_len[2];
	uint32_t *out;
	bwtint_t second_k = 0;

	bns_cnt_ambi(bns, p->k, p->len, &seqid);
	coor = p->k - bns->anns[seqid].offset; // start within the reference sequence
	refl = bns->anns[seqid].len;
	// test if the alignment goes beyond the boundary
	ref_pos = coor;
	qry_pos = 0;
	for (ci = 0; ci < n_cigar; ++ci) {
		int op = cigar[ci]&0xf, ln = cigar[ci]>>4;
		switch (op) {
		case 1: case 4: case 5: qry_pos += ln; break; // consumes the query only
		case 2: ref_pos += ln; break;                 // consumes the reference only
		default: ref_pos += ln; qry_pos += ln; break; // consumes both
		}
	}
	lq = qry_pos; // length of the query sequence
	if (ref_pos <= refl) return n_cigar; // stays within the sequence: nothing to fix

	split = n_match[0] = n_match[1] = ref_len[0] = ref_len[1] = 0;
	// the cut adds at most three entries to the original CIGAR
	out = calloc(n_cigar + 3, 4);
	ref_pos = coor;
	qry_pos = 0;
	n_out = 0;
	for (ci = 0; ci < n_cigar; ++ci) {
		int op = cigar[ci]&0xf, ln = cigar[ci]>>4;
		switch (op) {
		case 1: case 4: case 5: // ins or clipping
			qry_pos += ln;
			out[n_out++] = cigar[ci];
			break;
		case 2: // del
			if (ref_pos + ln >= refl && split == 0) {
				// the deletion spans the boundary: drop it and clip both pieces
				out[n_out++] = (uint32_t)(lq - qry_pos)<<4 | 4;
				split = n_out;
				out[n_out++] = (uint32_t)qry_pos<<4 | 4;
				second_k = p->k + (ref_pos + ln - refl);
				ref_len[0] = ref_pos - coor;
				ref_len[1] = p->len - ref_len[0] - ln;
			} else out[n_out++] = cigar[ci];
			ref_pos += ln;
			break;
		case 0: // match
			if (ref_pos + ln >= refl && split == 0) {
				// FIXME: not consider a special case where a split right between M and I
				out[n_out++] = (uint32_t)(refl - ref_pos)<<4 | 0; // write M
				out[n_out++] = (uint32_t)(lq - qry_pos - (refl - ref_pos))<<4 | 4; // write S
				split = n_out;
				n_match[0] += refl - ref_pos;
				out[n_out++] = (uint32_t)(qry_pos + (refl - ref_pos))<<4 | 4;
				if (ref_pos + ln - refl) out[n_out++] = (uint32_t)(ref_pos + ln - refl)<<4 | 0;
				n_match[1] += ref_pos + ln - refl;
				second_k = bns->anns[seqid].offset + refl;
				ref_len[0] = refl - coor;
				ref_len[1] = p->len - ref_len[0];
			} else {
				out[n_out++] = cigar[ci];
				n_match[split? 1 : 0] += ln;
			}
			ref_pos += ln;
			qry_pos += ln;
			break;
		default: // any other operation is dropped
			break;
		}
	}
	if (n_match[0] > n_match[1]) { // then take the first alignment
		n_cigar = split;
		memcpy(cigar, out, 4 * split);
		p->len = ref_len[0];
	} else { // otherwise the second one, which starts at second_k
		p->k = second_k;
		p->len = ref_len[1];
		n_cigar = n_out - split;
		memcpy(cigar, out + split, 4 * (n_out - split));
	}
	free(out);
	return n_cigar;
}
|
||||
|
||||
/* Fill b->aux for every hit in b: generate the CIGARs, clip the ones that
 * run past a reference boundary, compute the mapping quality and translate
 * p->k into a (chr, pos) pair. Hits with a non-zero ->l get quality 0, no
 * CIGAR and chr = pos = -1. */
static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
{
	int idx;
	// give memory back when less than half of b->hits is in use
	if (b->n<<1 < b->max) {
		b->max = b->n;
		kroundup32(b->max);
		b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t));
	}
	b->aux = calloc(b->n, sizeof(bsw2aux_t));
	gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name);
	for (idx = 0; idx < b->n; ++idx) {
		bsw2hit_t *hit = b->hits + idx;
		bsw2aux_t *aux = b->aux + idx;
		float scale = 1.0;
		int subo;
		aux->flag = hit->flag & 0xfe;
		aux->isize = 0;
		if (hit->l != 0) { // not a unique hit: nothing more to report
			aux->qual = 0;
			aux->n_cigar = 0;
			aux->chr = aux->pos = -1;
			aux->nn = 0;
			continue;
		}
		// fix out-of-boundary CIGAR
		aux->n_cigar = fix_cigar(bns, hit, aux->n_cigar, aux->cigar);
		// mapQ: scaled difference between the best score and the sub-optimal score
		subo = hit->G2 > opt->t? hit->G2 : opt->t;
		if (hit->flag>>16 == 1 || hit->flag>>16 == 2) scale *= .5; // found by one of the two searches only
		if (hit->n_seeds < 2) scale *= .2;
		aux->qual = (int)(scale * (hit->G - subo) * (250.0 / hit->G + 0.03 / opt->a) + .499);
		if (aux->qual > 250) aux->qual = 250;
		if (aux->qual < 0) aux->qual = 0;
		if (hit->flag&1) aux->qual = 0; // this is a random hit
		aux->pqual = aux->qual; // the paired quality starts out as the single-end quality
		// chromosomal position
		aux->nn = bns_cnt_ambi(bns, hit->k, hit->len, &aux->chr);
		aux->pos = hit->k - bns->anns[aux->chr].offset;
	}
}
|
||||
|
||||
/* Propagate mate information from m into the aux records of b.
 *
 * Every record of b is flagged as paired (0x1). If the mate has no hit the
 * record is flagged "mate unmapped" (0x8); if the mate has exactly one hit,
 * its chr/pos/strand are copied and the insert size is computed when both
 * ends are on the same chr. With any other number of mate hits, mchr and
 * mpos are set to -1. Finally, when both ends have exactly one hit, the
 * paired mapping quality of b is adjusted. Does nothing if m is NULL. */
static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m)
{
	int i;
	if (m == 0) return;
	// update flag, mchr and mpos
	for (i = 0; i < b->n; ++i) {
		bsw2aux_t *q = &b->aux[i];
		q->flag |= 1; // paired
		if (m->n == 0) q->flag |= 8; // mate unmapped
		if (m->n == 1) {
			q->mchr = m->aux[0].chr;
			q->mpos = m->aux[0].pos;
			if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand
			if (q->chr == q->mchr) { // set insert size
				if (q->mpos + m->hits[0].len > q->pos)
					q->isize = q->mpos + m->hits[0].len - q->pos;
				// use the length of THIS hit; the original always read b->hits[0].len,
				// which is wrong for every record but the first when b has several hits
				else q->isize = q->mpos - q->pos - b->hits[i].len;
			} else q->isize = 0;
		} else q->mchr = q->mpos = -1;
	}
	// update mapping quality
	if (b->n == 1 && m->n == 1) {
		bsw2hit_t *p = &b->hits[0];
		if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman
			if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20)
				b->aux[0].pqual = 20;
			// never report more confidence than the mate that anchored the rescue
			if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
		} else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired
			if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual]
				b->aux[0].pqual += 20;
				if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
				if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual;
			}
		}
	}
}
|
||||
|
||||
/* generate SAM lines for a sequence in ks with alignment stored in
|
||||
* b. ks->name and ks->seq will be freed and set to NULL in the end. */
|
||||
static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate)
|
||||
{
|
||||
int i, k;
|
||||
kstring_t str;
|
||||
memset(&str, 0, sizeof(kstring_t));
|
||||
if (b == 0 || b->n == 0) { // no hits
|
||||
ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name);
|
||||
for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str);
|
||||
if (ks->qual) {
|
||||
kputc('\t', &str);
|
||||
for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str);
|
||||
} else kputs("\t*", &str);
|
||||
kputc('\n', &str);
|
||||
}
|
||||
for (i = 0; b && i < b->n; ++i) {
|
||||
bsw2hit_t *p = b->hits + i;
|
||||
bsw2aux_t *q = b->aux + i;
|
||||
int j, beg, end, type = 0;
|
||||
// print mandatory fields before SEQ
|
||||
if (q->cigar == 0) q->flag |= 0x4;
|
||||
ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0));
|
||||
ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1);
|
||||
if (p->l == 0 && q->cigar) { // not a repetitive hit
|
||||
ksprintf(&str, "\t%d\t", q->pqual);
|
||||
for (k = 0; k < q->n_cigar; ++k)
|
||||
ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]);
|
||||
} else ksprintf(&str, "\t0\t*");
|
||||
if (!is_pe) kputs("\t*\t0\t0\t", &str);
|
||||
else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize);
|
||||
// get the sequence begin and end
|
||||
beg = 0; end = ks->l;
|
||||
if (opt->hard_clip && q->cigar) {
|
||||
if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4;
|
||||
if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4;
|
||||
}
|
||||
for (j = beg; j < end; ++j) {
|
||||
if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
|
||||
else kputc(ks->seq[j], &str);
|
||||
}
|
||||
// print base quality if present
|
||||
if (ks->qual) {
|
||||
kputc('\t', &str);
|
||||
for (j = beg; j < end; ++j) {
|
||||
if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
|
||||
else kputc(ks->qual[j], &str);
|
||||
}
|
||||
} else kputs("\t*", &str);
|
||||
// print optional tags
|
||||
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm);
|
||||
if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn);
|
||||
if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
|
||||
if (p->flag&BSW2_FLAG_MATESW) type |= 1;
|
||||
if (p->flag&BSW2_FLAG_TANDEM) type |= 2;
|
||||
if (type) ksprintf(&str, "\tXT:i:%d", type);
|
||||
if (opt->cpy_cmt && ks->comment) {
|
||||
int l = strlen(ks->comment);
|
||||
if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') {
|
||||
kputc('\t', &str); kputs(ks->comment, &str);
|
||||
}
|
||||
}
|
||||
kputc('\n', &str);
|
||||
}
|
||||
ks->sam = str.s;
|
||||
free(ks->seq); ks->seq = 0;
|
||||
free(ks->qual); ks->qual = 0;
|
||||
free(ks->name); ks->name = 0;
|
||||
}
|
||||
|
||||
/* Copy *src to *dst and adapt the copy to a query of length qlen: the score
 * threshold t is raised to log(qlen)*coef when that is larger, and the band
 * width is capped by two bounds derived from the query length (never below 1
 * and never above src->bw). */
static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen)
{
	int cap, alt_cap;
	double scaled_log;
	*dst = *src;
	scaled_log = log(qlen) * dst->coef;
	if (dst->t < scaled_log)
		dst->t = (int)(scaled_log + .499);
	// the query length sets a boundary on the maximum band width
	cap = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a);
	alt_cap = (qlen * dst->a - dst->a - dst->t) / dst->r;
	if (alt_cap < cap) cap = alt_cap;
	if (cap < 1) cap = 1; // a band width of 0 is avoided on purpose
	dst->bw = src->bw < cap? src->bw : cap;
}
|
||||
|
||||
/* Core routine to align reads in _seq. It is separated from
|
||||
* process_seqs() to realize multi-threading */
|
||||
static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
|
||||
{
|
||||
int x;
|
||||
bsw2opt_t opt;
|
||||
bsw2global_t *pool = bsw2_global_init();
|
||||
bwtsw2_t **buf;
|
||||
buf = calloc(_seq->n, sizeof(void*));
|
||||
for (x = 0; x < _seq->n; ++x) {
|
||||
bsw2seq1_t *p = _seq->seq + x;
|
||||
uint8_t *seq[2], *rseq[2];
|
||||
int i, l, k;
|
||||
bwtsw2_t *b[2];
|
||||
l = p->l;
|
||||
update_opt(&opt, _opt, p->l);
|
||||
if (pool->max_l < l) { // then enlarge working space for aln_extend_core()
|
||||
int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l;
|
||||
pool->max_l = l;
|
||||
pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24);
|
||||
}
|
||||
// set seq[2] and rseq[2]
|
||||
seq[0] = calloc(l * 4, 1);
|
||||
seq[1] = seq[0] + l;
|
||||
rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l;
|
||||
// convert sequences to 2-bit representation
|
||||
for (i = k = 0; i < l; ++i) {
|
||||
int c = nst_nt4_table[(int)p->seq[i]];
|
||||
if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled
|
||||
seq[0][i] = c;
|
||||
seq[1][l-1-i] = 3 - c;
|
||||
rseq[0][l-1-i] = 3 - c;
|
||||
rseq[1][i] = c;
|
||||
}
|
||||
if (l - k < opt.t) { // too few unambiguous bases
|
||||
buf[x] = calloc(1, sizeof(bwtsw2_t));
|
||||
free(seq[0]); continue;
|
||||
}
|
||||
// alignment
|
||||
b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool);
|
||||
for (k = 0; k < b[0]->n; ++k)
|
||||
if (b[0]->hits[k].n_seeds < opt.t_seeds) break;
|
||||
if (k < b[0]->n) {
|
||||
b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool);
|
||||
for (i = 0; i < b[1]->n; ++i) {
|
||||
bsw2hit_t *p = &b[1]->hits[i];
|
||||
int x = p->beg;
|
||||
p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand
|
||||
p->beg = l - p->end;
|
||||
p->end = l - x;
|
||||
}
|
||||
flag_fr(b);
|
||||
merge_hits(b, l, 0);
|
||||
bsw2_resolve_duphits(0, 0, b[0], 0);
|
||||
bsw2_resolve_query_overlaps(b[0], opt.mask_level);
|
||||
} else b[1] = 0;
|
||||
// generate CIGAR and print SAM
|
||||
buf[x] = bsw2_dup_no_cigar(b[0]);
|
||||
// free
|
||||
free(seq[0]);
|
||||
bsw2_destroy(b[0]);
|
||||
}
|
||||
if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf);
|
||||
for (x = 0; x < _seq->n; ++x) {
|
||||
bsw2seq1_t *p = _seq->seq + x;
|
||||
uint8_t *seq[2];
|
||||
int i;
|
||||
seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l;
|
||||
for (i = 0; i < p->l; ++i) {
|
||||
int c = nst_nt4_table[(int)p->seq[i]];
|
||||
if (c >= 4) c = (int)(drand48() * 4);
|
||||
seq[0][i] = c;
|
||||
seq[1][p->l-1-i] = 3 - c;
|
||||
}
|
||||
update_opt(&opt, _opt, p->l);
|
||||
write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name);
|
||||
free(seq[0]);
|
||||
}
|
||||
for (x = 0; x < _seq->n; ++x) {
|
||||
if (is_pe) update_mate_aux(buf[x], buf[x^1]);
|
||||
print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]);
|
||||
}
|
||||
for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]);
|
||||
free(buf);
|
||||
bsw2_global_destroy(pool);
|
||||
}
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
typedef struct {
|
||||
int tid, is_pe;
|
||||
bsw2seq_t *_seq;
|
||||
const bsw2opt_t *_opt;
|
||||
const bntseq_t *bns;
|
||||
uint8_t *pac;
|
||||
const bwt_t *target;
|
||||
} thread_aux_t;
|
||||
|
||||
/* another interface to bsw2_aln_core() to facilitate pthread_create() */
|
||||
static void *worker(void *data)
|
||||
{
|
||||
thread_aux_t *p = (thread_aux_t*)data;
|
||||
bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* process sequences stored in _seq, generate SAM lines for these
|
||||
* sequences and reset _seq afterwards. */
|
||||
static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
|
||||
{
|
||||
int i;
|
||||
is_pe = is_pe? 1 : 0;
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
if (opt->n_threads <= 1) {
|
||||
bsw2_aln_core(_seq, opt, bns, pac, target, is_pe);
|
||||
} else {
|
||||
pthread_t *tid;
|
||||
pthread_attr_t attr;
|
||||
thread_aux_t *data;
|
||||
int j;
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
||||
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
|
||||
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
||||
for (j = 0; j < opt->n_threads; ++j) {
|
||||
thread_aux_t *p = data + j;
|
||||
p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe;
|
||||
p->pac = pac; p->target = target;
|
||||
p->_seq = calloc(1, sizeof(bsw2seq_t));
|
||||
p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1;
|
||||
p->_seq->n = 0;
|
||||
p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t));
|
||||
}
|
||||
for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread
|
||||
bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq;
|
||||
p->seq[p->n++] = _seq->seq[i];
|
||||
}
|
||||
for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]);
|
||||
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
|
||||
for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0;
|
||||
for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back
|
||||
bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq;
|
||||
_seq->seq[i] = p->seq[p->n++];
|
||||
}
|
||||
for (j = 0; j < opt->n_threads; ++j) {
|
||||
thread_aux_t *p = data + j;
|
||||
free(p->_seq->seq);
|
||||
free(p->_seq);
|
||||
}
|
||||
free(data); free(tid);
|
||||
}
|
||||
#else
|
||||
bsw2_aln_core(_seq, opt, bns, pac, target, is_pe);
|
||||
#endif
|
||||
|
||||
// print and reset
|
||||
for (i = 0; i < _seq->n; ++i) {
|
||||
bsw2seq1_t *p = _seq->seq + i;
|
||||
if (p->sam) err_printf("%s", p->sam);
|
||||
free(p->name); free(p->seq); free(p->qual); free(p->sam);
|
||||
p->tid = -1; p->l = 0;
|
||||
p->name = p->seq = p->qual = p->sam = 0;
|
||||
}
|
||||
err_fflush(stdout);
|
||||
_seq->n = 0;
|
||||
}
|
||||
|
||||
/* Driver: print the @SQ header lines, load the packed reference, then read
 * the input file(s) chunk by chunk and hand each chunk to process_seqs().
 * fn2 may be NULL; when it is given the run is treated as paired-end. */
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
{
	gzFile fp, fp2 = 0;
	kseq_t *ks, *ks2 = 0;
	int sid, idx, n_read, is_pe = 0;
	uint8_t *pac;
	bsw2seq_t *batch;
	bseq1_t *reads;

	pac = calloc(bns->l_pac/4+1, 1);
	for (sid = 0; sid < bns->n_seqs; ++sid)
		err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[sid].name, bns->anns[sid].len);
	err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
	fp = xzopen(fn, "r");
	ks = kseq_init(fp);
	batch = calloc(1, sizeof(bsw2seq_t));
	if (fn2) {
		fp2 = xzopen(fn2, "r");
		ks2 = kseq_init(fp2);
		is_pe = 1;
	}
	for (;;) {
		int total_bp = 0;
		reads = bseq_read(opt->chunk_size * opt->n_threads, &n_read, ks, ks2);
		if (reads == 0) break;
		if (n_read > batch->max) { // grow the batch buffer
			batch->max = n_read;
			kroundup32(batch->max);
			batch->seq = realloc(batch->seq, batch->max * sizeof(bsw2seq1_t));
		}
		batch->n = n_read;
		for (idx = 0; idx < n_read; ++idx) {
			const bseq1_t *src = reads + idx;
			bsw2seq1_t *dst = batch->seq + idx;
			// the strings are moved, not copied: only the reads[] array is freed below
			dst->tid = -1;
			dst->l = src->l_seq;
			dst->name = src->name;
			dst->seq = src->seq;
			dst->qual = src->qual;
			dst->comment = src->comment;
			dst->sam = 0;
			total_bp += dst->l;
		}
		fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n_read, total_bp);
		free(reads);
		process_seqs(batch, opt, bns, pac, target, is_pe);
	}
	// free
	free(pac);
	free(batch->seq);
	free(batch);
	kseq_destroy(ks);
	err_gzclose(fp);
	if (fn2) {
		kseq_destroy(ks2);
		err_gzclose(fp2);
	}
}
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
#include <stdio.h>
|
||||
#include "bwtsw2.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* One seed hit, or one chain of seed hits, in target and query coordinates. */
typedef struct {
	uint32_t tbeg;    // target interval [tbeg, tend)
	uint32_t tend;
	int qbeg;         // query interval [qbeg, qend)
	int qend;
	uint32_t flag:1;  // which of the two hit sets the entry came from
	uint32_t idx:31;  // index of the hit in its set, or the chain id for a chain
	int chain;        // chain id for a hit; also reused as a counter for a chain
} hsaip_t;
|
||||
|
||||
#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg)
|
||||
|
||||
#include "ksort.h"
|
||||
KSORT_INIT(hsaip, hsaip_t, _hsaip_lt)
|
||||
|
||||
/* Greedily group the n seeds in z[] into chains written to chain[].
 *
 * z[] is sorted by query start first. Each seed is then compared with the
 * existing chains, newest first, and joins the first chain whose start is
 * close enough on both query and target (gap below opt->max_chain_gap and
 * diagonal drift within opt->bw); otherwise it opens a new chain. The chain
 * id stored in z[].chain and chain[].idx is offset by shift. Returns the
 * number of chains created. */
static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
{
	int si, ci, n_chain = 0;
	ks_introsort(hsaip, n, z);
	for (si = 0; si < n; ++si) {
		hsaip_t *seed = &z[si];
		int joined = 0;
		for (ci = n_chain - 1; ci >= 0; --ci) {
			hsaip_t *c = &chain[ci];
			int dq = seed->qbeg - c->qbeg; // never negative: seeds are visited in qbeg order
			int dt = seed->tbeg - c->tbeg;
			if (dt > 0 && dq < opt->max_chain_gap && dt < opt->max_chain_gap && dq - dt <= opt->bw && dt - dq <= opt->bw) {
				// extend the chain to cover the seed
				if (seed->qend > c->qend) c->qend = seed->qend;
				if (seed->tend > c->tend) c->tend = seed->tend;
				++c->chain;
				seed->chain = shift + ci;
				joined = 1;
				break;
			}
			// if the chain is strong enough, do not check the previous chains
			if (c->chain > opt->t_seeds * 2) break;
		}
		if (joined) continue;
		// not added to any previous chain: the seed starts a new one
		chain[n_chain] = *seed;
		chain[n_chain].chain = 1;
		seed->chain = shift + n_chain;
		chain[n_chain].idx = seed->chain;
		++n_chain;
	}
	return n_chain;
}
|
||||
|
||||
/* Drop hits that belong to weak chains overshadowed by a strong chain.
 *
 * The hits of b[0] and b[1] are chained separately; the query coordinates of
 * the b[1] chains are mirrored with respect to len so that both sets can be
 * compared. A chain with fewer than 2*opt->t_seeds seeds is dropped when a
 * surviving chain that sorts before it (by query start) ends at or after it
 * on the query and has more than 2*opt->t_seeds times as many seeds. The
 * hits of dropped chains are removed from b[0] and b[1] in place. */
void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
{
	hsaip_t *z[2], *chain[2];
	int n[2], m[2], set, hi, ci, cj, n_chain;
	int thres = opt->t_seeds * 2;
	char *drop;
	// one array for the seeds of both sets and one for the chains
	n[0] = b[0]->n; n[1] = b[1]->n;
	z[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
	z[1] = z[0] + n[0];
	chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
	for (set = 0; set < 2; ++set) {
		for (hi = 0; hi < b[set]->n; ++hi) {
			bsw2hit_t *hit = &b[set]->hits[hi];
			hsaip_t *e = &z[set][hi];
			e->flag = set; e->idx = hi;
			e->chain = -1;
			e->qbeg = hit->beg; e->qend = hit->end;
			e->tbeg = hit->k; e->tend = hit->k + hit->len;
		}
	}
	// chaining; ids of the second set continue after those of the first
	m[0] = chaining(opt, 0, n[0], z[0], chain[0]);
	chain[1] = chain[0] + m[0];
	m[1] = chaining(opt, m[0], n[1], z[1], chain[1]);
	n_chain = m[0] + m[1];
	// change query coordinate on the reverse strand
	for (ci = 0; ci < m[1]; ++ci) {
		hsaip_t *c = &chain[1][ci];
		int old_qbeg = c->qbeg;
		c->qbeg = len - c->qend;
		c->qend = len - old_qbeg;
	}
	// filtering: drop[] is indexed by chain id
	drop = calloc(n_chain, 1);
	ks_introsort(hsaip, n_chain, chain[0]);
	for (ci = 1; ci < n_chain; ++ci) {
		hsaip_t *weak = &chain[0][ci];
		if (weak->chain >= thres) continue; // strong enough to be kept unconditionally
		for (cj = 0; cj < ci; ++cj) {
			hsaip_t *strong = &chain[0][cj];
			if (drop[strong->idx]) continue;
			if (strong->qend >= weak->qend && strong->chain > weak->chain * thres) {
				drop[weak->idx] = 1;
				break;
			}
		}
	}
	// zero the score of every hit whose chain was dropped
	for (hi = 0; hi < n[0] + n[1]; ++hi) {
		hsaip_t *e = &z[0][hi];
		if (drop[e->chain])
			b[e->flag]->hits[e->idx].G = 0;
	}
	free(drop);
	// squeeze out filtered elements in b[2]
	for (set = 0; set < 2; ++set) {
		int kept = 0;
		for (hi = 0; hi < n[set]; ++hi) {
			if (b[set]->hits[hi].G == 0) continue;
			if (kept != hi) b[set]->hits[kept] = b[set]->hits[hi];
			++kept;
		}
		b[set]->n = kept;
	}
	// free
	free(z[0]);
	free(chain[0]);
}
|
||||
|
|
@ -0,0 +1,619 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/resource.h>
|
||||
#include <assert.h>
|
||||
#include "bwt_lite.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "bwt.h"
|
||||
#include "kvec.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
bwtint_t k, l;
|
||||
} qintv_t;
|
||||
|
||||
#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l)
|
||||
#define qintv_hash(a) ((a).k>>7^(a).l<<17)
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq)
|
||||
KHASH_MAP_INIT_INT64(64, uint64_t)
|
||||
|
||||
#define MINUS_INF -0x3fffffff
|
||||
#define MASK_LEVEL 0.90f
|
||||
|
||||
struct __mempool_t;
|
||||
static void mp_destroy(struct __mempool_t*);
|
||||
typedef struct {
|
||||
bwtint_t qk, ql;
|
||||
int I, D, G;
|
||||
uint32_t pj:2, qlen:30;
|
||||
int tlen;
|
||||
int ppos, upos;
|
||||
int cpos[4];
|
||||
} bsw2cell_t;
|
||||
|
||||
#include "ksort.h"
|
||||
KSORT_INIT_GENERIC(int)
|
||||
#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2))
|
||||
KSORT_INIT(hitG, bsw2hit_t, __hitG_lt)
|
||||
|
||||
/* Template for a fresh cell: qk = ql = 0, the scores I, D and G at MINUS_INF,
 * pj = qlen = tlen = 0, and ppos, upos and every cpos[] slot at -1. */
static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} };
|
||||
|
||||
typedef struct {
|
||||
int n, max;
|
||||
uint32_t tk, tl; // this is fine
|
||||
bsw2cell_t *array;
|
||||
} bsw2entry_t, *bsw2entry_p;
|
||||
|
||||
/* --- BEGIN: Stack operations --- */
|
||||
/* Work stack of entries. stack_isempty() treats the stack as empty only when
 * stack0 holds nothing and n_pending is 0. */
typedef struct {
|
||||
// counter checked by stack_isempty() and stack_pop()
int n_pending;
|
||||
// stack0 is the vector stack_push0()/stack_pop() operate on
kvec_t(bsw2entry_p) stack0, pending;
|
||||
// pool released by stack_destroy() through mp_destroy()
struct __mempool_t *pool;
|
||||
} bsw2stack_t;
|
||||
|
||||
#define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0)
|
||||
/* Release the stack: its memory pool, both vectors and the stack itself. */
static void stack_destroy(bsw2stack_t *s)
{
	mp_destroy(s->pool);
	kv_destroy(s->stack0);
	kv_destroy(s->pending);
	free(s);
}
|
||||
/* Push entry e onto s->stack0. */
inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e)
{
	kv_push(bsw2entry_p, s->stack0, e);
}
|
||||
/* Pop the top entry of s->stack0. Asserts that stack0 is not empty while
 * entries are still pending. */
inline static bsw2entry_p stack_pop(bsw2stack_t *s)
{
	bsw2entry_p top;
	assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0));
	top = kv_pop(s->stack0);
	return top;
}
|
||||
/* --- END: Stack operations --- */
|
||||
|
||||
/* --- BEGIN: memory pool --- */
|
||||
/* Free-list of bsw2entry_t objects: mp_alloc() takes an entry from pool (or
 * allocates a new one) and mp_free() puts it back. */
typedef struct __mempool_t {
|
||||
int cnt; // entries handed out and not yet returned; if cnt!=0 at the end, there must be a memory leak
|
||||
// entries currently available for reuse
kvec_t(bsw2entry_p) pool;
|
||||
} mempool_t;
|
||||
/* Hand out an entry: reuse one from the pool when available, otherwise
 * allocate a zero-initialised one. mp->cnt is incremented either way. */
inline static bsw2entry_p mp_alloc(mempool_t *mp)
{
	bsw2entry_p e;
	++mp->cnt;
	if (kv_size(mp->pool) != 0) e = kv_pop(mp->pool);
	else e = (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t));
	return e;
}
|
||||
/* Return entry e to the pool for reuse. The entry is emptied (n = 0) but
 * keeps its allocated array. */
inline static void mp_free(mempool_t *mp, bsw2entry_p e)
{
	e->n = 0;
	--mp->cnt;
	kv_push(bsw2entry_p, mp->pool, e);
}
|
||||
static void mp_destroy(struct __mempool_t *mp)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i != kv_size(mp->pool); ++i) {
|
||||
free(kv_A(mp->pool, i)->array);
|
||||
free(kv_A(mp->pool, i));
|
||||
}
|
||||
kv_destroy(mp->pool);
|
||||
free(mp);
|
||||
}
|
||||
/* --- END: memory pool --- */
|
||||
|
||||
/* --- BEGIN: utilities --- */
|
||||
/* Walk every interval reachable from the initial interval [0, seq_len] by
 * repeated one-symbol extension (bwtl_2occ4) and count how many times each
 * interval is reached. Intervals are keyed as k<<32 | l. The caller owns the
 * returned hash. */
static khash_t(64) *bsw2_connectivity(const bwtl_t *b)
{
	khash_t(64) *counts;
	kvec_t(uint64_t) todo;
	uint32_t lo, hi, occ_lo[4], occ_hi[4]; // this is fine
	uint64_t node;
	khiter_t it;
	int sym, absent;

	kv_init(todo);
	counts = kh_init(64);
	kh_resize(64, counts, b->seq_len * 4);
	node = b->seq_len;
	kv_push(uint64_t, todo, node);
	while (kv_size(todo)) {
		node = kv_pop(todo);
		lo = node>>32;
		hi = (uint32_t)node;
		bwtl_2occ4(b, lo-1, hi, occ_lo, occ_hi);
		for (sym = 0; sym != 4; ++sym) {
			lo = b->L2[sym] + occ_lo[sym] + 1;
			hi = b->L2[sym] + occ_hi[sym];
			if (lo > hi) continue; // empty interval
			node = (uint64_t)lo << 32 | hi;
			it = kh_put(64, counts, node, &absent);
			if (!absent) {
				++kh_value(counts, it);
				continue;
			}
			// first visit: start counting and schedule the interval for expansion
			kh_value(counts, it) = 1;
			kv_push(uint64_t, todo, node);
		}
	}
	kv_destroy(todo);
	return counts;
}
|
||||
// pick up top T matches at a node
|
||||
static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux)
|
||||
{
|
||||
int i, *a, n, x;
|
||||
if (u->n <= T) return;
|
||||
if (aux->max < u->n) {
|
||||
aux->max = u->n;
|
||||
aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t));
|
||||
}
|
||||
a = (int*)aux->array;
|
||||
for (i = n = 0; i != u->n; ++i)
|
||||
if (u->array[i].ql && u->array[i].G > 0)
|
||||
a[n++] = -u->array[i].G;
|
||||
if (n <= T) return;
|
||||
x = -ks_ksmall(int, n, a, T);
|
||||
n = 0;
|
||||
for (i = 0; i < u->n; ++i) {
|
||||
bsw2cell_t *p = u->array + i;
|
||||
if (p->G == x) ++n;
|
||||
if (p->G < x || (p->G == x && n >= T)) {
|
||||
p->qk = p->ql = 0; p->G = 0;
|
||||
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove duplicated cells
|
||||
static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash)
|
||||
{
|
||||
int i, ret, j;
|
||||
khiter_t k;
|
||||
qintv_t key;
|
||||
kh_clear(qintv, hash);
|
||||
for (i = 0; i != u->n; ++i) {
|
||||
bsw2cell_t *p = u->array + i;
|
||||
if (p->ql == 0) continue;
|
||||
key.k = p->qk; key.l = p->ql;
|
||||
k = kh_put(qintv, hash, key, &ret);
|
||||
j = -1;
|
||||
if (ret == 0) {
|
||||
if ((uint32_t)kh_value(hash, k) >= p->G) j = i;
|
||||
else {
|
||||
j = kh_value(hash, k)>>32;
|
||||
kh_value(hash, k) = (uint64_t)i<<32 | p->G;
|
||||
}
|
||||
} else kh_value(hash, k) = (uint64_t)i<<32 | p->G;
|
||||
if (j >= 0) {
|
||||
p = u->array + j;
|
||||
p->qk = p->ql = 0; p->G = 0;
|
||||
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
|
||||
}
|
||||
}
|
||||
}
|
||||
// merge two entries
|
||||
static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b)
|
||||
{
|
||||
int i;
|
||||
if (u->n + v->n >= u->max) {
|
||||
u->max = u->n + v->n;
|
||||
u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t));
|
||||
}
|
||||
for (i = 0; i != v->n; ++i) {
|
||||
bsw2cell_t *p = v->array + i;
|
||||
if (p->ppos >= 0) p->ppos += u->n;
|
||||
if (p->cpos[0] >= 0) p->cpos[0] += u->n;
|
||||
if (p->cpos[1] >= 0) p->cpos[1] += u->n;
|
||||
if (p->cpos[2] >= 0) p->cpos[2] += u->n;
|
||||
if (p->cpos[3] >= 0) p->cpos[3] += u->n;
|
||||
}
|
||||
memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t));
|
||||
u->n += v->n;
|
||||
}
|
||||
|
||||
/* Return a pointer to the slot just past the last used cell of e, growing
 * the array first if it is full (256 cells initially, doubling afterwards).
 * e->n is not changed. */
static inline bsw2cell_t *push_array_p(bsw2entry_t *e)
{
	if (e->n == e->max) {
		if (e->max) e->max <<= 1;
		else e->max = 256;
		e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max);
	}
	return &e->array[e->n];
}
|
||||
|
||||
/* CPU time (user + system) spent between two getrusage() snapshots,
 * in seconds. */
static inline double time_elapse(const struct rusage *curr, const struct rusage *last)
{
	long user_sec = curr->ru_utime.tv_sec - last->ru_utime.tv_sec;
	long sys_sec = curr->ru_stime.tv_sec - last->ru_stime.tv_sec;
	long user_usec = curr->ru_utime.tv_usec - last->ru_utime.tv_usec;
	long sys_usec = curr->ru_stime.tv_usec - last->ru_stime.tv_usec;
	long sec = user_sec + sys_sec;
	long usec = user_usec + sys_usec;
	return (double)sec + usec * 1e-6;
}
|
||||
/* --- END: utilities --- */
|
||||
|
||||
/* --- BEGIN: processing partial hits --- */
|
||||
/* Record the cells of u that score at least thres into hits[], which keeps
 * two slots per position: hits[2*beg] for the best hit starting at beg and
 * hits[2*beg+1] for the second best. beg is taken from bwt->sa[] for every
 * index in [u->tk, u->tl]. */
static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u)
{
	int ci;
	uint32_t t; // this is fine
	for (ci = 0; ci < u->n; ++ci) {
		const bsw2cell_t *c = &u->array[ci];
		if (c->G < thres) continue;
		for (t = u->tk; t <= u->tl; ++t) {
			int beg = bwt->sa[t];
			int end = beg + c->tlen;
			bsw2hit_t *best = &hits[beg*2];
			bsw2hit_t *second = &hits[beg*2+1];
			bsw2hit_t *dst;
			if (c->G > best->G) {
				*second = *best; // demote the previous best
				dst = best;
			} else if (c->G > second->G) {
				dst = second;
			} else continue; // not better than either slot
			dst->k = c->qk; dst->l = c->ql;
			dst->len = c->qlen;
			dst->G = c->G;
			dst->beg = beg; dst->end = end;
			dst->G2 = dst->k == dst->l? 0 : dst->G;
			dst->n_seeds = 0;
			dst->flag = 0;
		}
	}
}
|
||||
/* "narrow hits" are node-to-node hits that have a high score and
|
||||
* are not so repetitive (|SA interval|<=IS). */
|
||||
static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < u->n; ++i) {
|
||||
bsw2hit_t *q;
|
||||
bsw2cell_t *p = u->array + i;
|
||||
if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit
|
||||
if (b1->max == b1->n) {
|
||||
b1->max = b1->max? b1->max<<1 : 4;
|
||||
b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t));
|
||||
}
|
||||
q = &b1->hits[b1->n++];
|
||||
q->k = p->qk; q->l = p->ql;
|
||||
q->len = p->qlen;
|
||||
q->G = p->G; q->G2 = 0;
|
||||
q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen;
|
||||
q->flag = 0;
|
||||
// delete p
|
||||
p->qk = p->ql = 0; p->G = 0;
|
||||
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* after this, "narrow SA hits" will be expanded and the coordinates
|
||||
* will be obtained and stored in b->hits[*].k. */
|
||||
int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS)
|
||||
{
|
||||
int i, j, n, is_rev;
|
||||
if (b->n == 0) return 0;
|
||||
if (bwt && bns) { // convert to chromosomal coordinates if requested
|
||||
int old_n = b->n;
|
||||
bsw2hit_t *old_hits = b->hits;
|
||||
for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated
|
||||
bsw2hit_t *p = old_hits + i;
|
||||
if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1;
|
||||
else if (p->G > 0) ++n;
|
||||
}
|
||||
b->n = b->max = n;
|
||||
b->hits = calloc(b->max, sizeof(bsw2hit_t));
|
||||
for (i = j = 0; i < old_n; ++i) {
|
||||
bsw2hit_t *p = old_hits + i;
|
||||
if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive
|
||||
bwtint_t k;
|
||||
if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue;
|
||||
for (k = p->k; k <= p->l; ++k) {
|
||||
b->hits[j] = *p;
|
||||
b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev);
|
||||
b->hits[j].l = 0;
|
||||
b->hits[j].is_rev = is_rev;
|
||||
if (is_rev) b->hits[j].k -= p->len - 1;
|
||||
++j;
|
||||
}
|
||||
} else if (p->G > 0) {
|
||||
b->hits[j] = *p;
|
||||
b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev);
|
||||
b->hits[j].l = 0;
|
||||
b->hits[j].flag |= 1;
|
||||
b->hits[j].is_rev = is_rev;
|
||||
if (is_rev) b->hits[j].k -= p->len - 1;
|
||||
++j;
|
||||
}
|
||||
}
|
||||
free(old_hits);
|
||||
}
|
||||
for (i = j = 0; i < b->n; ++i) // squeeze out empty elements
|
||||
if (b->hits[i].G) b->hits[j++] = b->hits[i];
|
||||
b->n = j;
|
||||
ks_introsort(hitG, b->n, b->hits);
|
||||
for (i = 1; i < b->n; ++i) {
|
||||
bsw2hit_t *p = b->hits + i;
|
||||
for (j = 0; j < i; ++j) {
|
||||
bsw2hit_t *q = b->hits + j;
|
||||
int compatible = 1;
|
||||
if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates
|
||||
if (p->l == 0 && q->l == 0) {
|
||||
int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap
|
||||
if (qol < 0) qol = 0;
|
||||
if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) {
|
||||
int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
|
||||
- (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap
|
||||
if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL)
|
||||
compatible = 0;
|
||||
}
|
||||
}
|
||||
if (!compatible) {
|
||||
p->G = 0;
|
||||
if (q->G2 < p->G2) q->G2 = p->G2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
n = i;
|
||||
for (i = j = 0; i < n; ++i) {
|
||||
if (b->hits[i].G == 0) continue;
|
||||
if (i != j) b->hits[j++] = b->hits[i];
|
||||
else ++j;
|
||||
}
|
||||
b->n = j;
|
||||
return b->n;
|
||||
}
|
||||
|
||||
/*
 * Remove hits that overlap, on the query, a hit that precedes them in the
 * hitG sort order (the comparator is defined elsewhere; presumably best
 * score first -- confirm).
 *
 * When several leading hits share the G of the first one, one of them is
 * chosen with drand48() and swapped into slot 0. Every later hit is then
 * checked against all earlier surviving hits (G != 0). Two hits are
 * compatible when the query overlap, as a fraction of the shorter query
 * span, is below mask_level, or when they also overlap on the target
 * (both with l == 0) and neither query span is fully covered by the overlap.
 * An incompatible earlier hit has its G2 raised to the later hit's G, and the
 * later hit is dropped.
 *
 * Returns the number of hits kept; b->n is set to the same value.
 */
int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level)
{
	int i, j, n_tied, n_scanned;

	if (b->n == 0) return 0;
	ks_introsort(hitG, b->n, b->hits);

	/* pick one of the hits tied with the first hit at random */
	n_tied = 1;
	while (n_tied < b->n && b->hits[n_tied].G == b->hits[0].G) ++n_tied;
	j = (int)(n_tied * drand48());
	if (j != 0) {
		bsw2hit_t swap = b->hits[0];
		b->hits[0] = b->hits[j];
		b->hits[j] = swap;
	}

	for (i = 1; i < b->n; ++i) {
		bsw2hit_t *cur = b->hits + i;
		int n_conflicts = 0;

		if (cur->G == 0) break; /* stop at the first hit with a zero score */
		for (j = 0; j < i; ++j) {
			bsw2hit_t *prev = b->hits + j;
			int64_t t_ovlp = 0; /* overlap on the target */
			int q_ovlp;         /* overlap on the query */
			float frac;

			if (prev->G == 0) continue; /* already removed */
			q_ovlp = (cur->end < prev->end? cur->end : prev->end) - (cur->beg > prev->beg? cur->beg : prev->beg);
			if (q_ovlp < 0) q_ovlp = 0;
			if (cur->l == 0 && prev->l == 0) {
				t_ovlp = (int64_t)(cur->k + cur->len < prev->k + prev->len? cur->k + cur->len : prev->k + prev->len)
					- (cur->k > prev->k? cur->k : prev->k);
				if (t_ovlp < 0) t_ovlp = 0;
			}
			frac = (float)q_ovlp / (cur->end - cur->beg < prev->end - prev->beg? cur->end - cur->beg : prev->end - prev->beg);
			if (frac < mask_level) continue; /* small query overlap: compatible */
			if (t_ovlp > 0 && q_ovlp < cur->end - cur->beg && q_ovlp < prev->end - prev->beg) continue; /* partial overlap on both query and target */
			/* incompatible: remember the competing score on the earlier hit */
			if (prev->G2 < cur->G) prev->G2 = cur->G;
			++n_conflicts;
		}
		if (n_conflicts != 0) cur->G = 0;
	}
	n_scanned = i;

	/* squeeze out the hits that were zeroed above */
	for (i = j = 0; i < n_scanned; ++i) {
		if (b->hits[i].G != 0) {
			if (i != j) b->hits[j] = b->hits[i];
			++j;
		}
	}
	b->n = j;
	return j;
}
|
||||
/* --- END: processing partial hits --- */
|
||||
|
||||
/* --- BEGIN: global mem pool --- */
|
||||
bsw2global_t *bsw2_global_init()
|
||||
{
|
||||
bsw2global_t *pool;
|
||||
bsw2stack_t *stack;
|
||||
pool = calloc(1, sizeof(bsw2global_t));
|
||||
stack = calloc(1, sizeof(bsw2stack_t));
|
||||
stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t));
|
||||
pool->stack = (void*)stack;
|
||||
return pool;
|
||||
}
|
||||
|
||||
/*
 * Release a pool created by bsw2_global_init(): destroy the stack, then free
 * the alignment scratch buffer (aln_mem) and the pool itself.
 */
void bsw2_global_destroy(bsw2global_t *pool)
{
	bsw2stack_t *st = (bsw2stack_t*)pool->stack;

	stack_destroy(st);
	free(pool->aln_mem);
	free(pool);
}
|
||||
/* --- END: global mem pool --- */
|
||||
|
||||
/*
 * Fill the I, D and G scores of the current cell c[0].
 *
 *   c[1]  source cell for I (may be NULL)
 *   c[2]  source cell for D (may be NULL)
 *   c[3]  source cell for the match/mismatch move (may be NULL)
 *
 * A gap is either extended (cost o->r) or opened (cost o->qr) from its source
 * cell; a missing source yields MINUS_INF. G becomes the best of the three
 * moves and is also the return value.
 */
static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4])
{
	bsw2cell_t *cur = c[0];
	const bsw2cell_t *from_i = c[1], *from_d = c[2], *from_g = c[3];
	int best;

	if (from_g) best = from_g->G + match_score;
	else best = MINUS_INF;

	if (from_i) {
		if (from_i->I > from_i->G - o->q) cur->I = from_i->I - o->r; /* extend */
		else cur->I = from_i->G - o->qr;                              /* open */
		if (cur->I > best) best = cur->I;
	} else {
		cur->I = MINUS_INF;
	}

	if (from_d) {
		if (from_d->D > from_d->G - o->q) cur->D = from_d->D - o->r; /* extend */
		else cur->D = from_d->G - o->qr;                              /* open */
		if (cur->D > best) best = cur->D;
	} else {
		cur->D = MINUS_INF;
	}

	cur->G = best;
	return cur->G;
}
|
||||
|
||||
/*
 * Seed the traversal stack with the root entry: the target interval
 * [0, target->seq_len] holding one cell that is a copy of g_default_cell with
 * G = 0 and the query interval [0, query->seq_len].
 */
static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s)
{
	bsw2entry_t *root = mp_alloc(s->pool);
	bsw2cell_t *cell;

	root->tk = 0;
	root->tl = target->seq_len;

	cell = push_array_p(root);
	*cell = g_default_cell;
	cell->G = 0;
	cell->qk = 0;
	cell->ql = query->seq_len;
	root->n++; /* the slot obtained above is now in use */

	stack_push0(s, root);
}
|
||||
/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) */
|
||||
bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool)
|
||||
{
|
||||
bsw2stack_t *stack = (bsw2stack_t*)pool->stack;
|
||||
bwtsw2_t *b, *b1, **b_ret;
|
||||
int i, j, score_mat[16], *heap, heap_size, n_tot = 0;
|
||||
struct rusage curr, last;
|
||||
khash_t(qintv) *rhash;
|
||||
khash_t(64) *chash;
|
||||
|
||||
// initialize connectivity hash (chash)
|
||||
chash = bsw2_connectivity(target);
|
||||
// calculate score matrix
|
||||
for (i = 0; i != 4; ++i)
|
||||
for (j = 0; j != 4; ++j)
|
||||
score_mat[i<<2|j] = (i == j)? opt->a : -opt->b;
|
||||
// initialize other variables
|
||||
rhash = kh_init(qintv);
|
||||
init_bwtsw2(target, query, stack);
|
||||
heap_size = opt->z;
|
||||
heap = calloc(heap_size, sizeof(int));
|
||||
// initialize the return struct
|
||||
b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
|
||||
b->n = b->max = target->seq_len * 2;
|
||||
b->hits = calloc(b->max, sizeof(bsw2hit_t));
|
||||
b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
|
||||
b_ret = calloc(2, sizeof(void*));
|
||||
b_ret[0] = b; b_ret[1] = b1;
|
||||
// initialize timer
|
||||
getrusage(0, &last);
|
||||
// the main loop: traversal of the DAG
|
||||
while (!stack_isempty(stack)) {
|
||||
int old_n, tj;
|
||||
bsw2entry_t *v;
|
||||
uint32_t tcntk[4], tcntl[4];
|
||||
bwtint_t k, l;
|
||||
|
||||
v = stack_pop(stack); old_n = v->n;
|
||||
n_tot += v->n;
|
||||
|
||||
for (i = 0; i < v->n; ++i) { // test max depth and band width
|
||||
bsw2cell_t *p = v->array + i;
|
||||
if (p->ql == 0) continue;
|
||||
if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) {
|
||||
p->qk = p->ql = 0;
|
||||
if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5;
|
||||
}
|
||||
}
|
||||
|
||||
// get Occ for the DAG
|
||||
bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl);
|
||||
for (tj = 0; tj != 4; ++tj) { // descend to the children
|
||||
bwtint_t qcntk[4], qcntl[4];
|
||||
int qj, *curr_score_mat = score_mat + tj * 4;
|
||||
khiter_t iter;
|
||||
bsw2entry_t *u;
|
||||
|
||||
k = target->L2[tj] + tcntk[tj] + 1;
|
||||
l = target->L2[tj] + tcntl[tj];
|
||||
if (k > l) continue;
|
||||
// update counter
|
||||
iter = kh_get(64, chash, (uint64_t)k<<32 | l);
|
||||
--kh_value(chash, iter);
|
||||
// initialization
|
||||
u = mp_alloc(stack->pool);
|
||||
u->tk = k; u->tl = l;
|
||||
memset(heap, 0, sizeof(int) * opt->z);
|
||||
// loop through all the nodes in v
|
||||
for (i = 0; i < v->n; ++i) {
|
||||
bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G
|
||||
int is_added = 0;
|
||||
if (p->ql == 0) continue; // deleted node
|
||||
c[0] = x = push_array_p(u);
|
||||
x->G = MINUS_INF;
|
||||
p->upos = x->upos = -1;
|
||||
if (p->ppos >= 0) { // parent has been visited
|
||||
c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0;
|
||||
c[3] = v->array + p->ppos; c[2] = p;
|
||||
if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x
|
||||
x->ppos = v->array[p->ppos].upos; // the parent pos in u
|
||||
p->upos = u->n++; // the current pos in u
|
||||
if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u
|
||||
is_added = 1;
|
||||
}
|
||||
} else {
|
||||
x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr;
|
||||
if (x->D > 0) {
|
||||
x->G = x->D;
|
||||
x->I = MINUS_INF; x->ppos = -1;
|
||||
p->upos = u->n++;
|
||||
is_added = 1;
|
||||
}
|
||||
}
|
||||
if (is_added) { // x has been added to u->array. fill the remaining variables
|
||||
x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
|
||||
x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1;
|
||||
if (x->G > -heap[0]) {
|
||||
heap[0] = -x->G;
|
||||
ks_heapadjust(int, 0, heap_size, heap);
|
||||
}
|
||||
}
|
||||
if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v
|
||||
if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) {
|
||||
bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl);
|
||||
for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie
|
||||
if (p->cpos[qj] != -1) continue; // this node will be visited later
|
||||
k = query->L2[qj] + qcntk[qj] + 1;
|
||||
l = query->L2[qj] + qcntl[qj];
|
||||
if (k > l) { p->cpos[qj] = -2; continue; }
|
||||
x = push_array_p(v);
|
||||
p = v->array + i; // p may not point to the correct position after realloc
|
||||
x->G = x->I = x->D = MINUS_INF;
|
||||
x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen;
|
||||
x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
|
||||
p->cpos[qj] = v->n++;
|
||||
} // ~for(qj)
|
||||
} // ~if(p->cpos[])
|
||||
} // ~if
|
||||
} // ~for(i)
|
||||
if (u->n) save_hits(target, opt->t, b->hits, u);
|
||||
{ // push u to the stack (or to the pending array)
|
||||
uint32_t cnt, pos;
|
||||
cnt = (uint32_t)kh_value(chash, iter);
|
||||
pos = kh_value(chash, iter)>>32;
|
||||
if (pos) { // something in the pending array, then merge
|
||||
bsw2entry_t *w = kv_A(stack->pending, pos-1);
|
||||
if (u->n) {
|
||||
if (w->n < u->n) { // swap
|
||||
w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w;
|
||||
}
|
||||
merge_entry(opt, w, u, b);
|
||||
}
|
||||
if (cnt == 0) { // move from pending to stack0
|
||||
remove_duplicate(w, rhash);
|
||||
save_narrow_hits(target, w, b1, opt->t, opt->is);
|
||||
cut_tail(w, opt->z, u);
|
||||
stack_push0(stack, w);
|
||||
kv_A(stack->pending, pos-1) = 0;
|
||||
--stack->n_pending;
|
||||
}
|
||||
mp_free(stack->pool, u);
|
||||
} else if (cnt) { // the first time
|
||||
if (u->n) { // push to the pending queue
|
||||
++stack->n_pending;
|
||||
kv_push(bsw2entry_p, stack->pending, u);
|
||||
kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt;
|
||||
} else mp_free(stack->pool, u);
|
||||
} else { // cnt == 0, then push to the stack
|
||||
bsw2entry_t *w = mp_alloc(stack->pool);
|
||||
save_narrow_hits(target, u, b1, opt->t, opt->is);
|
||||
cut_tail(u, opt->z, w);
|
||||
mp_free(stack->pool, w);
|
||||
stack_push0(stack, u);
|
||||
}
|
||||
}
|
||||
} // ~for(tj)
|
||||
mp_free(stack->pool, v);
|
||||
} // while(top)
|
||||
getrusage(0, &curr);
|
||||
for (i = 0; i < 2; ++i)
|
||||
for (j = 0; j < b_ret[i]->n; ++j)
|
||||
b_ret[i]->hits[j].n_seeds = 0;
|
||||
bsw2_resolve_duphits(bns, query, b, opt->is);
|
||||
bsw2_resolve_duphits(bns, query, b1, opt->is);
|
||||
//fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);
|
||||
// free
|
||||
free(heap);
|
||||
kh_destroy(qintv, rhash);
|
||||
kh_destroy(64, chash);
|
||||
stack->pending.n = stack->stack0.n = 0;
|
||||
return b_ret;
|
||||
}
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "bwt.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "utils.h"
|
||||
#include "bwa.h"
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[])
|
||||
{
|
||||
bsw2opt_t *opt;
|
||||
bwaidx_t *idx;
|
||||
int c;
|
||||
|
||||
opt = bsw2_init_opt();
|
||||
srand48(11);
|
||||
while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) {
|
||||
switch (c) {
|
||||
case 'q': opt->q = atoi(optarg); break;
|
||||
case 'r': opt->r = atoi(optarg); break;
|
||||
case 'a': opt->a = atoi(optarg); break;
|
||||
case 'b': opt->b = atoi(optarg); break;
|
||||
case 'w': opt->bw = atoi(optarg); break;
|
||||
case 'T': opt->t = atoi(optarg); break;
|
||||
case 't': opt->n_threads = atoi(optarg); break;
|
||||
case 'z': opt->z = atoi(optarg); break;
|
||||
case 's': opt->is = atoi(optarg); break;
|
||||
case 'm': opt->mask_level = atof(optarg); break;
|
||||
case 'c': opt->coef = atof(optarg); break;
|
||||
case 'N': opt->t_seeds = atoi(optarg); break;
|
||||
case 'M': opt->multi_2nd = 1; break;
|
||||
case 'H': opt->hard_clip = 1; break;
|
||||
case 'f': xreopen(optarg, "w", stdout); break;
|
||||
case 'I': opt->max_ins = atoi(optarg); break;
|
||||
case 'S': opt->skip_sw = 1; break;
|
||||
case 'C': opt->cpy_cmt = 1; break;
|
||||
case 'G': opt->max_chain_gap = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
opt->qr = opt->q + opt->r;
|
||||
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
|
||||
fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
|
||||
fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
|
||||
fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
|
||||
fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r);
|
||||
fprintf(stderr, " -w INT band width [%d]\n", opt->bw);
|
||||
fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -f FILE file to output results to instead of stdout\n");
|
||||
fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n");
|
||||
fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n");
|
||||
fprintf(stderr, " -M mark multi-part alignments as secondary\n");
|
||||
fprintf(stderr, " -S skip Smith-Waterman read pairing\n");
|
||||
fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t);
|
||||
fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
|
||||
fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
|
||||
fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is);
|
||||
fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds);
|
||||
fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
|
||||
fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n");
|
||||
fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n");
|
||||
fprintf(stderr, " increase '-z' for better sensitivity.\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// adjust opt for opt->a
|
||||
opt->t *= opt->a;
|
||||
opt->coef *= opt->a;
|
||||
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
|
||||
bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
|
||||
bwa_idx_destroy(idx);
|
||||
free(opt);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,274 @@
|
|||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "utils.h"
|
||||
#include "bwt.h"
|
||||
#include "bntseq.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "kstring.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
#define MIN_RATIO 0.8
|
||||
#define OUTLIER_BOUND 2.0
|
||||
#define MAX_STDDEV 4.0
|
||||
#define EXT_STDDEV 4.0
|
||||
|
||||
/* Insert-size statistics produced by bsw2_stat(). */
typedef struct {
	int low;    /* lower boundary of the accepted insert size */
	int high;   /* upper boundary of the accepted insert size */
	int failed; /* set to 1 when the distribution could not be inferred */
	double avg; /* mean insert size of the pairs within the boundaries */
	double std; /* standard deviation of those insert sizes */
} bsw2pestat_t;
|
||||
|
||||
/*
 * Infer the insert-size distribution from confidently mapped read pairs.
 *
 * buf holds n hit lists; entries 2*i and 2*i+1 are the two ends of a pair.
 * A pair is used when both ends have exactly one hit, the sub-optimal score
 * G2 of each end is at most MIN_RATIO of its best score G, and the insert
 * size is below max_ins. Progress and failure messages are appended to msg.
 *
 * Returns the statistics; r.failed is 1 when fewer than 8 pairs qualify, when
 * the outlier boundaries are inconsistent, or when no pair falls inside them.
 */
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
{
	int i, k, x, p25, p50, p75, tmp, max_len = 0;
	uint64_t *isize;
	bsw2pestat_t r;

	memset(&r, 0, sizeof(bsw2pestat_t));
	isize = calloc(n, 8);
	for (i = k = 0; i + 1 < n; i += 2) { /* i + 1 < n: never read past the array when n is odd */
		bsw2hit_t *t[2];
		int l;
		if (buf[i] == 0 || buf[i+1] == 0) continue; // an end without a hit list cannot form a pair
		if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
		t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0];
		if (t[0]->G2 > MIN_RATIO * t[0]->G) continue; // the best hit is not good enough
		if (t[1]->G2 > MIN_RATIO * t[1]->G) continue; // the best hit is not good enough
		l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len;
		if (l >= max_ins) continue; // skip pairs with excessively large insert
		max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg;
		max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
		isize[k++] = l;
	}
	ks_introsort_64(k, isize);
	ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k);
	if (k < 8) {
		ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__);
		free(isize);
		r.failed = 1;
		return r;
	}
	/* percentiles are only read once at least 8 values are known to exist */
	p25 = isize[(int)(.25 * k + .499)];
	p50 = isize[(int)(.50 * k + .499)];
	p75 = isize[(int)(.75 * k + .499)];
	/* boundaries for computing mean and std.dev: quartiles +/- OUTLIER_BOUND * IQR */
	tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
	r.low = tmp > max_len? tmp : max_len;
	if (r.low < 1) r.low = 1;
	r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
	if (r.low > r.high) {
		ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__);
		free(isize);
		r.failed = 1;
		return r;
	}
	ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
	ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high);
	for (i = x = 0, r.avg = 0; i < k; ++i)
		if (isize[i] >= r.low && isize[i] <= r.high)
			r.avg += isize[i], ++x;
	if (x == 0) {
		ksprintf(msg, "[%s] fail to infer the insert size distribution: no pairs within boundaries.\n", __func__);
		free(isize);
		r.failed = 1;
		return r;
	}
	r.avg /= x;
	for (i = 0, r.std = 0; i < k; ++i)
		if (isize[i] >= r.low && isize[i] <= r.high)
			r.std += (isize[i] - r.avg) * (isize[i] - r.avg);
	r.std = sqrt(r.std / x);
	ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std);
	/* boundaries for proper pairs: quartiles +/- 3 * IQR, widened to avg +/- MAX_STDDEV * std */
	tmp = (int)(p25 - 3. * (p75 - p25) + .499);
	r.low = tmp > max_len? tmp : max_len;
	if (r.low < 1) r.low = 1;
	r.high = (int)(p75 + 3. * (p75 - p25) + .499);
	if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
	/* Re-apply the floors to the widened bound. The previous code recomputed
	 * r.low from tmp here, which silently discarded the widening above. */
	if (r.low < max_len) r.low = max_len;
	if (r.low < 1) r.low = 1;
	if (r.high < r.avg + MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
	ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
	free(isize);
	return r;
}
|
||||
|
||||
/*
 * Auxiliary alignment record: a CIGAR array with its length, three integer
 * coordinates and a 64-bit position.
 * NOTE(review): no code in this part of the file uses the type; the field
 * meanings are inferred from the names only -- confirm before relying on them.
 */
typedef struct {
	int n_cigar;     /* presumably the number of entries in cigar */
	int beg;
	int end;
	int len;
	int64_t pos;
	uint32_t *cigar;
} pairaux_t;
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
|
||||
/*
 * Try to align a mate next to an already mapped read with Smith-Waterman.
 *
 *   opt     scoring options (q, r, t are used)
 *   l_pac   length of the packed reference
 *   pac     2-bit packed reference
 *   st      insert-size statistics (avg and std are used)
 *   h       hit of the mapped read
 *   l_mseq  length of the mate sequence
 *   mseq    mate sequence as characters
 *   a       output hit; before calling this routine, *a has been cleared
 *           with memset(0) and its flag set to 1<<6 or 1<<7
 *   g_mat   5x5 scoring matrix passed to ksw_align()
 *
 * The search window is derived from h and avg +/- EXT_STDDEV * std on the
 * strand opposite to h. If the window is shorter than the mate, only
 * n_seeds, flag and is_rev of *a are touched and G stays 0. Otherwise G, G2,
 * k, len, beg and end are filled in; scores below opt->t are reset to 0.
 */
void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25])
{
	/* h->k may be an unsigned type; doing the clamps in int64_t keeps a
	 * negative window bound from being converted to a huge unsigned value */
	const int64_t hit_beg = (int64_t)h->k;
	const int64_t hit_end = hit_beg + (h->end - h->beg);
	int64_t k, beg, end;
	uint8_t *seq, *ref;
	int i;

	// compute the region start and end
	a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW;
	if (h->is_rev == 0) { // mate is expected downstream, on the reverse strand
		beg = (int64_t)(hit_beg + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
		if (beg < hit_beg) beg = hit_beg;
		end = (int64_t)(hit_beg + st->avg + EXT_STDDEV * st->std + .499);
		a->is_rev = 1; a->flag |= 16;
	} else { // mate is expected upstream, on the forward strand
		beg = (int64_t)(hit_end - st->avg - EXT_STDDEV * st->std + .499);
		end = (int64_t)(hit_end - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
		if (end > hit_end) end = hit_end;
		a->is_rev = 0;
	}
	if (beg < 1) beg = 1;
	if (end > l_pac) end = l_pac;
	if (end - beg < l_mseq) return; // window too small to hold the mate

	// generate the sequence: one buffer, query first, reference window after it
	seq = malloc(l_mseq + (end - beg));
	ref = seq + l_mseq;
	for (k = beg; k < end; ++k)
		ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
	if (h->is_rev == 0) {
		for (i = 0; i < l_mseq; ++i) { // on the reverse strand: reverse-complement the mate
			int c = nst_nt4_table[(unsigned char)mseq[i]]; // unsigned index: char may be signed
			seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c;
		}
	} else {
		for (i = 0; i < l_mseq; ++i) // on the forward strand
			seq[i] = nst_nt4_table[(unsigned char)mseq[i]];
	}

	{
		// the low bits of the flag carry the score threshold opt->t
		int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
		kswr_t aln;
		aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
		a->G = aln.score;
		a->G2 = aln.score2;
		if (a->G < opt->t) a->G = 0;
		if (a->G2 < opt->t) a->G2 = 0;
		if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
		a->k = beg + aln.tb;
		a->len = aln.te - aln.tb + 1;
		a->beg = aln.qb;
		a->end = aln.qe + 1;
	}
	// report query coordinates relative to the mate as it was passed in
	if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
	free(seq);
}
|
||||
|
||||
/*
 * Pair up the hits of paired-end reads.
 *
 * hits holds n hit lists; entries 2*i and 2*i+1 are the two ends of a pair
 * and seq holds the matching reads. The insert-size distribution is inferred
 * with bsw2_stat(). For every pair in which at least one end has exactly one
 * hit and no end has more than one, the mate of each mapped end is searched
 * with bsw2_pair1() unless opt->skip_sw is set. An unmapped end may be
 * rescued, a worse original hit replaced ("fixed"), or one end moved next to
 * its mate. The read1/read2 flag bits are set on every hit regardless of the
 * outcome. A summary, preceded by the bsw2_stat() messages, goes to stderr.
 */
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
{
	extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
	bsw2pestat_t pes;
	int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
	int8_t g_mat[25];
	kstring_t msg;

	memset(&msg, 0, sizeof(kstring_t));
	pes = bsw2_stat(n, hits, &msg, opt->max_ins);

	/* 5x5 matrix over A,C,G,T,N: match on the diagonal of the first four
	 * columns, mismatch elsewhere in them, 0 in the last column */
	for (k = 0; k < 25; ++k) {
		int row = k / 5, col = k % 5;
		if (col == 4) g_mat[k] = 0;
		else g_mat[k] = row == col? opt->a : -opt->b;
	}

	for (i = 0; i < n; i += 2) {
		bsw2hit_t a[2];   /* mate alignments found by bsw2_pair1(); a[j] belongs to end j */
		bwtsw2_t *e[2];   /* hit lists of the two ends */

		memset(&a, 0, sizeof(bsw2hit_t) * 2);
		a[0].flag = 1<<6;
		a[1].flag = 1<<7;
		for (j = 0; j < 2; ++j) { // set the read1/2 flag
			if (hits[i+j] == 0) continue;
			for (k = 0; k < hits[i+j]->n; ++k)
				hits[i+j]->hits[k].flag |= 1<<(6+j);
		}
		if (pes.failed) continue;
		if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
		e[0] = hits[i];
		e[1] = hits[i+1];
		if (e[0]->n != 1 && e[1]->n != 1) continue; // no end has exactly one hit
		if (e[0]->n > 1 || e[1]->n > 1) continue; // one read has more than one hit
		if (!opt->skip_sw) {
			if (e[0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &e[0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat);
			if (e[1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &e[1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat);
		} // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0

		// the following enumerate all possibilities. It is tedious but necessary...
		if (e[0]->n + e[1]->n == 1) { // one end mapped; the other not;
			int which = e[0]->n == 1? 1 : 0; /* index of the unmapped end */
			bwtsw2_t *mapped = e[1 - which], *unmapped = e[which];

			if (a[which].G == 0) continue; /* no mate alignment found */
			a[which].flag |= BSW2_FLAG_RESCUED;
			if (unmapped->max == 0) { /* make room for the single rescued hit */
				unmapped->max = 1;
				unmapped->hits = malloc(sizeof(bsw2hit_t));
			}
			unmapped->hits[0] = a[which];
			unmapped->n = 1;
			mapped->hits[0].flag |= 2;
			unmapped->hits[0].flag |= 2;
			++n_rescued;
		} else { // then both ends mapped
			int is_fixed = 0;
			//fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
			for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score
				bsw2hit_t *p = &e[j]->hits[0];
				if (p->G < a[j].G) { // the original mapping is suboptimal
					a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM?
					*p = a[j];
					++n_fixed;
					is_fixed = 1;
				} else if (p->k != a[j].k && p->G2 < a[j].G) {
					p->G2 = a[j].G;
				} else if (p->k == a[j].k && p->G2 < a[j].G2) {
					p->G2 = a[j].G2;
				}
			}
			if (e[0]->hits[0].k == a[0].k && e[1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
				for (j = 0; j < 2; ++j)
					e[j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM);
			} else if (e[0]->hits[0].k == a[0].k || e[1]->hits[0].k == a[1].k) { // a tandem match
				for (j = 0; j < 2; ++j) {
					e[j]->hits[0].flag |= 2;
					if (e[j]->hits[0].k != a[j].k)
						e[j]->hits[0].flag |= BSW2_FLAG_TANDEM;
				}
			} else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end
				if (a[0].G && a[1].G) { // now we have two "proper pairs"
					int G[2];
					double diff;
					G[0] = e[0]->hits[0].G + a[1].G;
					G[1] = e[1]->hits[0].G + a[0].G;
					diff = fabs((double)(G[0] - G[1])) / (opt->a + opt->b) / ((e[0]->hits[0].len + a[1].len + e[1]->hits[0].len + a[0].len) / 2.);
					if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0; /* clearly different: discard one candidate */
				}
				if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
					int which = a[0].G? 0 : 1; /* index of the end to be moved */
					bsw2hit_t *kept = &e[1 - which]->hits[0];  /* the unchanged hit */
					bsw2hit_t *moved = &e[which]->hits[0];     /* the hit to be moved */
					int isize;
					double dev, diff;

					isize = kept->is_rev? kept->k + kept->len - a[which].k : a[which].k + a[which].len - kept->k;
					dev = fabs(isize - pes.avg) / pes.std;
					diff = (double)(moved->G - a[which].G) / (opt->a + opt->b) / (moved->end - moved->beg) * 100.0;
					if (diff < dev * 2.) { // then move (heuristic)
						a[which].G2 = a[which].G;
						*moved = a[which];
						moved->flag |= BSW2_FLAG_MOVED | 2;
						kept->flag |= 2;
						++n_moved;
					}
				}
			} else if (is_fixed) {
				e[0]->hits[0].flag |= 2;
				e[1]->hits[0].flag |= 2;
			}
		}
	}
	ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
	fputs(msg.s, stderr);
	free(msg.s);
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
## Contributor Code of Conduct
|
||||
|
||||
As contributors and maintainers of this project, we pledge to respect all
|
||||
people who contribute through reporting issues, posting feature requests,
|
||||
updating documentation, submitting pull requests or patches, and other
|
||||
activities.
|
||||
|
||||
We are committed to making participation in this project a harassment-free
|
||||
experience for everyone, regardless of level of experience, gender, gender
|
||||
identity and expression, sexual orientation, disability, personal appearance,
|
||||
body size, race, age, or religion.
|
||||
|
||||
Examples of unacceptable behavior by participants include the use of sexual
|
||||
language or imagery, derogatory comments or personal attacks, trolling, public
|
||||
or private harassment, insults, or other unprofessional conduct.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct. Project maintainers or
|
||||
contributors who do not follow the Code of Conduct may be removed from the
|
||||
project team.
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported by opening an issue or contacting the maintainer via email.
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
|
||||
1.0.0][v1].
|
||||
|
||||
[cc]: http://contributor-covenant.org/
|
||||
[v1]: http://contributor-covenant.org/version/1/0/0/
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
#include "bwamem.h"
|
||||
#include "kseq.h" // for the FASTA/Q parser
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
bwaidx_t *idx;
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
mem_opt_t *opt;
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
|
||||
if (NULL == idx) {
|
||||
fprintf(stderr, "Index load failed.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||
strcmp(argv[2], "-") ? argv[2] : "stdin",
|
||||
errno ? strerror(errno) : "Out of memory");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ks = kseq_init(fp); // initialize the FASTA/Q parser
|
||||
opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
|
||||
|
||||
while (kseq_read(ks) >= 0) { // read one sequence
|
||||
mem_alnreg_v ar;
|
||||
int i, k;
|
||||
ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
|
||||
for (i = 0; i < ar.n; ++i) { // traverse each hit
|
||||
mem_aln_t a;
|
||||
if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
|
||||
a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
|
||||
// print alignment
|
||||
printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
|
||||
for (k = 0; k < a.n_cigar; ++k) // print CIGAR
|
||||
printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
|
||||
printf("\t%d\n", a.NM); // print edit distance
|
||||
free(a.cigar); // don't forget to deallocate CIGAR
|
||||
}
|
||||
free(ar.a); // and deallocate the hit list
|
||||
}
|
||||
|
||||
free(opt);
|
||||
kseq_destroy(ks);
|
||||
gzclose(fp);
|
||||
bwa_idx_destroy(idx);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,483 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <zlib.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <ctype.h>
|
||||
#include <math.h>
|
||||
#include "bwa.h"
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "utils.h"
|
||||
#include "bntseq.h"
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
|
||||
void *kopen(const char *fn, int *_fd);
|
||||
int kclose(void *a);
|
||||
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
|
||||
|
||||
typedef struct {
|
||||
kseq_t *ks, *ks2;
|
||||
mem_opt_t *opt;
|
||||
mem_pestat_t *pes0;
|
||||
int64_t n_processed;
|
||||
int copy_comment, actual_chunk_size;
|
||||
bwaidx_t *idx;
|
||||
} ktp_aux_t;
|
||||
|
||||
typedef struct {
|
||||
ktp_aux_t *aux;
|
||||
int n_seqs;
|
||||
bseq1_t *seqs;
|
||||
} ktp_data_t;
|
||||
|
||||
static void *process(void *shared, int step, void *_data)
|
||||
{
|
||||
ktp_aux_t *aux = (ktp_aux_t*)shared;
|
||||
ktp_data_t *data = (ktp_data_t*)_data;
|
||||
int i;
|
||||
if (step == 0) {
|
||||
ktp_data_t *ret;
|
||||
int64_t size = 0;
|
||||
ret = calloc(1, sizeof(ktp_data_t));
|
||||
ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
|
||||
if (ret->seqs == 0) {
|
||||
free(ret);
|
||||
return 0;
|
||||
}
|
||||
if (!aux->copy_comment)
|
||||
for (i = 0; i < ret->n_seqs; ++i) {
|
||||
free(ret->seqs[i].comment);
|
||||
ret->seqs[i].comment = 0;
|
||||
}
|
||||
for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
|
||||
return ret;
|
||||
} else if (step == 1) {
|
||||
const mem_opt_t *opt = aux->opt;
|
||||
const bwaidx_t *idx = aux->idx;
|
||||
if (opt->flag & MEM_F_SMARTPE) {
|
||||
bseq1_t *sep[2];
|
||||
int n_sep[2];
|
||||
mem_opt_t tmp_opt = *opt;
|
||||
bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
|
||||
if (n_sep[0]) {
|
||||
tmp_opt.flag &= ~MEM_F_PE;
|
||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
|
||||
for (i = 0; i < n_sep[0]; ++i)
|
||||
data->seqs[sep[0][i].id].sam = sep[0][i].sam;
|
||||
}
|
||||
if (n_sep[1]) {
|
||||
tmp_opt.flag |= MEM_F_PE;
|
||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
|
||||
for (i = 0; i < n_sep[1]; ++i)
|
||||
data->seqs[sep[1][i].id].sam = sep[1][i].sam;
|
||||
}
|
||||
free(sep[0]); free(sep[1]);
|
||||
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
|
||||
aux->n_processed += data->n_seqs;
|
||||
return data;
|
||||
} else if (step == 2) {
|
||||
for (i = 0; i < data->n_seqs; ++i) {
|
||||
if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
|
||||
free(data->seqs[i].name); free(data->seqs[i].comment);
|
||||
free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
|
||||
}
|
||||
free(data->seqs); free(data);
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Rescale the score-dependent options by the match score opt->a.
 *
 * opt0 records which options were set explicitly (non-zero field = set).
 * Nothing happens unless the match score itself was set; then every
 * listed option that was NOT set explicitly is multiplied by opt->a.
 */
static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
{
#define SCALE_IF_UNSET(field) do { if (!opt0->field) opt->field *= opt->a; } while (0)
	if (!opt0->a) return; /* matching score unchanged: nothing to rescale */
	SCALE_IF_UNSET(b);
	SCALE_IF_UNSET(T);
	SCALE_IF_UNSET(o_del);
	SCALE_IF_UNSET(e_del);
	SCALE_IF_UNSET(o_ins);
	SCALE_IF_UNSET(e_ins);
	SCALE_IF_UNSET(zdrop);
	SCALE_IF_UNSET(pen_clip5);
	SCALE_IF_UNSET(pen_clip3);
	SCALE_IF_UNSET(pen_unpaired);
#undef SCALE_IF_UNSET
}
|
||||
|
||||
int main_mem(int argc, char *argv[])
|
||||
{
|
||||
mem_opt_t *opt, opt0;
|
||||
int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
|
||||
int fixed_chunk_size = -1;
|
||||
gzFile fp, fp2 = 0;
|
||||
char *p, *rg_line = 0, *hdr_line = 0;
|
||||
const char *mode = 0;
|
||||
void *ko = 0, *ko2 = 0;
|
||||
mem_pestat_t pes[4];
|
||||
ktp_aux_t aux;
|
||||
|
||||
memset(&aux, 0, sizeof(ktp_aux_t));
|
||||
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
||||
for (i = 0; i < 4; ++i) pes[i].failed = 1;
|
||||
|
||||
aux.opt = opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == '1') no_mt_io = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
||||
else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
|
||||
else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1;
|
||||
else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
|
||||
else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
|
||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
else if (c == 'a') opt->flag |= MEM_F_ALL;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
|
||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
|
||||
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
|
||||
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
|
||||
else if (c == '5') opt->flag |= MEM_F_PRIMARY5 | MEM_F_KEEP_SUPP_MAPQ; // always apply MEM_F_KEEP_SUPP_MAPQ with -5
|
||||
else if (c == 'q') opt->flag |= MEM_F_KEEP_SUPP_MAPQ;
|
||||
else if (c == 'u') opt->flag |= MEM_F_XB;
|
||||
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
|
||||
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
|
||||
else if (c == 'v') bwa_verbose = atoi(optarg);
|
||||
else if (c == 'j') ignore_alt = 1;
|
||||
else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
|
||||
else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
|
||||
else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
|
||||
else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
|
||||
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
|
||||
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
||||
else if (c == 'o' || c == 'f') xreopen(optarg, "wb", stdout);
|
||||
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
||||
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
||||
else if (c == 'C') aux.copy_comment = 1;
|
||||
else if (c == 'K') fixed_chunk_size = atoi(optarg);
|
||||
else if (c == 'X') opt->mask_level = atof(optarg);
|
||||
else if (c == 'F') bwa_dbg = atoi(optarg);
|
||||
else if (c == 'h') {
|
||||
opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
|
||||
opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
opt->max_XA_hits_alt = strtol(p+1, &p, 10);
|
||||
}
|
||||
else if (c == 'z') opt->XA_drop_ratio = atof(optarg);
|
||||
else if (c == 'Q') {
|
||||
opt0.mapQ_coef_len = 1;
|
||||
opt->mapQ_coef_len = atoi(optarg);
|
||||
opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
|
||||
} else if (c == 'O') {
|
||||
opt0.o_del = opt0.o_ins = 1;
|
||||
opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
opt->o_ins = strtol(p+1, &p, 10);
|
||||
} else if (c == 'E') {
|
||||
opt0.e_del = opt0.e_ins = 1;
|
||||
opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
opt->e_ins = strtol(p+1, &p, 10);
|
||||
} else if (c == 'L') {
|
||||
opt0.pen_clip5 = opt0.pen_clip3 = 1;
|
||||
opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
opt->pen_clip3 = strtol(p+1, &p, 10);
|
||||
} else if (c == 'R') {
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
|
||||
} else if (c == 'H') {
|
||||
if (optarg[0] != '@') {
|
||||
FILE *fp;
|
||||
if ((fp = fopen(optarg, "r")) != 0) {
|
||||
char *buf;
|
||||
buf = calloc(1, 0x10000);
|
||||
while (fgets(buf, 0xffff, fp)) {
|
||||
i = strlen(buf);
|
||||
assert(buf[i-1] == '\n'); // a long line
|
||||
buf[i-1] = 0;
|
||||
hdr_line = bwa_insert_header(buf, hdr_line);
|
||||
}
|
||||
free(buf);
|
||||
fclose(fp);
|
||||
}
|
||||
} else hdr_line = bwa_insert_header(optarg, hdr_line);
|
||||
} else if (c == 'I') { // specify the insert size distribution
|
||||
aux.pes0 = pes;
|
||||
pes[1].failed = 0;
|
||||
pes[1].avg = strtod(optarg, &p);
|
||||
pes[1].std = pes[1].avg * .1;
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
pes[1].std = strtod(p+1, &p);
|
||||
pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
|
||||
pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499);
|
||||
if (pes[1].low < 1) pes[1].low = 1;
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
pes[1].high = (int)(strtod(p+1, &p) + .499);
|
||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||
pes[1].low = (int)(strtod(p+1, &p) + .499);
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
|
||||
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
|
||||
}
|
||||
else return 1;
|
||||
}
|
||||
|
||||
if (rg_line) {
|
||||
hdr_line = bwa_insert_header(rg_line, hdr_line);
|
||||
free(rg_line);
|
||||
}
|
||||
|
||||
if (opt->n_threads < 1) opt->n_threads = 1;
|
||||
if (optind + 1 >= argc || optind + 3 < argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
|
||||
fprintf(stderr, "Algorithm options:\n\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
||||
fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n");
|
||||
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
|
||||
fprintf(stderr, " -S skip mate rescue\n");
|
||||
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
|
||||
fprintf(stderr, "\nScoring options:\n\n");
|
||||
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
|
||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
|
||||
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
|
||||
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
|
||||
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
|
||||
fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overridden [null]\n");
|
||||
fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n");
|
||||
fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n");
|
||||
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n");
|
||||
fprintf(stderr, " -o FILE sam file to output results to [stdout]\n");
|
||||
fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
|
||||
fprintf(stderr, " -5 for split alignment, take the alignment with the smallest query (not genomic) coordinate as primary\n");
|
||||
fprintf(stderr, " -q don't modify mapQ of supplementary alignments\n");
|
||||
fprintf(stderr, " -K INT process INT input bases in each batch regardless of nThreads (for reproducibility) []\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -v INT verbosity level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
||||
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
|
||||
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >%.2f%% of the max score, output all in XA [%d,%d]\n",
|
||||
opt->XA_drop_ratio * 100.0,
|
||||
opt->max_XA_hits, opt->max_XA_hits_alt);
|
||||
fprintf(stderr, " A second value may be given for alternate sequences.\n");
|
||||
fprintf(stderr, " -z FLOAT The fraction of the max score to use with -h [%f].\n", opt->XA_drop_ratio);
|
||||
fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
|
||||
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
|
||||
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
||||
fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
|
||||
fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
|
||||
fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
|
||||
fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
|
||||
fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
|
||||
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
|
||||
fprintf(stderr, " FR orientation only. [inferred]\n");
|
||||
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (mode) {
|
||||
if (strcmp(mode, "intractg") == 0) {
|
||||
if (!opt0.o_del) opt->o_del = 16;
|
||||
if (!opt0.o_ins) opt->o_ins = 16;
|
||||
if (!opt0.b) opt->b = 9;
|
||||
if (!opt0.pen_clip5) opt->pen_clip5 = 5;
|
||||
if (!opt0.pen_clip3) opt->pen_clip3 = 5;
|
||||
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) {
|
||||
if (!opt0.o_del) opt->o_del = 1;
|
||||
if (!opt0.e_del) opt->e_del = 1;
|
||||
if (!opt0.o_ins) opt->o_ins = 1;
|
||||
if (!opt0.e_ins) opt->e_ins = 1;
|
||||
if (!opt0.b) opt->b = 1;
|
||||
if (opt0.split_factor == 0.) opt->split_factor = 10.;
|
||||
if (strcmp(mode, "ont2d") == 0) {
|
||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
|
||||
if (!opt0.min_seed_len) opt->min_seed_len = 14;
|
||||
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
|
||||
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
|
||||
} else {
|
||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
|
||||
if (!opt0.min_seed_len) opt->min_seed_len = 17;
|
||||
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
|
||||
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode);
|
||||
return 1; // FIXME memory leak
|
||||
}
|
||||
} else update_a(opt, &opt0);
|
||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
|
||||
aux.idx = bwa_idx_load_from_shm(argv[optind]);
|
||||
if (aux.idx == 0) {
|
||||
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
} else if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
|
||||
if (ignore_alt)
|
||||
for (i = 0; i < aux.idx->bns->n_seqs; ++i)
|
||||
aux.idx->bns->anns[i].is_alt = 0;
|
||||
|
||||
ko = kopen(argv[optind + 1], &fd);
|
||||
if (ko == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
|
||||
return 1;
|
||||
}
|
||||
fp = gzdopen(fd, "r");
|
||||
aux.ks = kseq_init(fp);
|
||||
if (optind + 2 < argc) {
|
||||
if (opt->flag&MEM_F_PE) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
|
||||
} else {
|
||||
ko2 = kopen(argv[optind + 2], &fd2);
|
||||
if (ko2 == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]);
|
||||
return 1;
|
||||
}
|
||||
fp2 = gzdopen(fd2, "r");
|
||||
aux.ks2 = kseq_init(fp2);
|
||||
opt->flag |= MEM_F_PE;
|
||||
}
|
||||
}
|
||||
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
|
||||
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
|
||||
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
|
||||
free(hdr_line);
|
||||
free(opt);
|
||||
bwa_idx_destroy(aux.idx);
|
||||
kseq_destroy(aux.ks);
|
||||
err_gzclose(fp); kclose(ko);
|
||||
if (aux.ks2) {
|
||||
kseq_destroy(aux.ks2);
|
||||
err_gzclose(fp2); kclose(ko2);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_fastmap(int argc, char *argv[])
|
||||
{
|
||||
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
|
||||
uint64_t max_intv = 0;
|
||||
kseq_t *seq;
|
||||
bwtint_t k;
|
||||
gzFile fp;
|
||||
smem_i *itr;
|
||||
const bwtintv_v *a;
|
||||
bwaidx_t *idx;
|
||||
|
||||
while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'p': print_seq = 1; break;
|
||||
case 'w': min_iwidth = atoi(optarg); break;
|
||||
case 'l': min_len = atoi(optarg); break;
|
||||
case 'i': min_intv = atoi(optarg); break;
|
||||
case 'I': max_intv = atol(optarg); break;
|
||||
case 'L': max_len = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
|
||||
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
|
||||
fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
|
||||
fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
|
||||
fprintf(stderr, " -L INT max MEM length [%d]\n", max_len);
|
||||
fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
fp = xzopen(argv[optind + 1], "r");
|
||||
seq = kseq_init(fp);
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
|
||||
itr = smem_itr_init(idx->bwt);
|
||||
smem_config(itr, min_intv, max_len, max_intv);
|
||||
while (kseq_read(seq) >= 0) {
|
||||
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
|
||||
if (print_seq) {
|
||||
err_putchar('\t');
|
||||
err_puts(seq->seq.s);
|
||||
} else err_putchar('\n');
|
||||
for (i = 0; i < seq->seq.l; ++i)
|
||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
|
||||
while ((a = smem_next(itr)) != 0) {
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
bwtintv_t *p = &a->a[i];
|
||||
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
||||
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
|
||||
if (p->x[2] <= min_iwidth) {
|
||||
for (k = 0; k < p->x[2]; ++k) {
|
||||
bwtint_t pos;
|
||||
int len, is_rev, ref_id;
|
||||
len = (uint32_t)p->info - (p->info>>32);
|
||||
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
|
||||
if (is_rev) pos -= len - 1;
|
||||
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
|
||||
err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
|
||||
}
|
||||
} else err_puts("\t*");
|
||||
err_putchar('\n');
|
||||
}
|
||||
}
|
||||
err_puts("//");
|
||||
}
|
||||
|
||||
smem_itr_destroy(itr);
|
||||
bwa_idx_destroy(idx);
|
||||
kseq_destroy(seq);
|
||||
err_gzclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
/*
|
||||
* sais.c for sais-lite
|
||||
* Copyright (c) 2008 Yuta Mori All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
typedef unsigned char ubyte_t;
|
||||
#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])
|
||||
|
||||
/*
 * Count how often each symbol occurs in T[0..n-1].
 *
 * C[0..k-1] is cleared and then C[s] is incremented once per occurrence of
 * symbol s.  T is read as an array of int when cs == sizeof(int) and as an
 * array of unsigned char otherwise.
 */
static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
{
	int sym, pos;

	for (sym = 0; sym < k; ++sym)
		C[sym] = 0;

	if (cs == sizeof(int)) {
		const int *wide = (const int *)T;
		for (pos = 0; pos < n; ++pos)
			++C[wide[pos]];
	} else {
		for (pos = 0; pos < n; ++pos)
			++C[T[pos]];
	}
}
|
||||
/*
 * Turn the symbol counts C[0..k-1] into bucket boundaries B[0..k-1].
 *
 * With end != 0, B[i] is the running total including C[i] (bucket end);
 * with end == 0, B[i] is the running total before C[i] (bucket start).
 * C[i] is read before B[i] is written, so C and B may be the same array.
 */
static void getBuckets(const int *C, int *B, int k, int end)
{
	int i, running = 0;

	for (i = 0; i < k; ++i) {
		const int count = C[i]; /* fetch first: B may alias C */
		B[i] = end ? running + count : running;
		running += count;
	}
}
|
||||
|
||||
/*
 * Induce the complete suffix array from the entries already placed in SA.
 *
 * Two scans are made: a left-to-right scan that fills buckets from their
 * starts ("SAl"), followed by a right-to-left scan that fills buckets from
 * their ends ("SAs").  Entries are stored bit-complemented (~j) as a marker
 * and flipped back when they are visited again.
 *
 * C holds symbol counts and B the bucket boundaries; when both point to the
 * same array the counts are recomputed before each bucket computation
 * because the boundaries overwrite them.
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
	int i, j, pos; /* pos: next free slot of the bucket of symbol c1 */
	int c0, c1;

	/* compute SAl */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 0); /* find starts of buckets */
	j = n - 1;
	c1 = chr(j);
	pos = B[c1];
	SA[pos++] = (0 < j && chr(j - 1) < c1) ? ~j : j;
	for (i = 0; i < n; ++i) {
		j = SA[i];
		SA[i] = ~j;
		if (j <= 0) continue;
		--j;
		c0 = chr(j);
		if (c0 != c1) {
			/* switching buckets: save the cursor of the old one */
			B[c1] = pos;
			c1 = c0;
			pos = B[c1];
		}
		SA[pos++] = (0 < j && chr(j - 1) < c1) ? ~j : j;
	}

	/* compute SAs */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1); /* find ends of buckets */
	c1 = 0;
	pos = B[c1];
	for (i = n - 1; 0 <= i; --i) {
		j = SA[i];
		if (0 < j) {
			--j;
			c0 = chr(j);
			if (c0 != c1) {
				B[c1] = pos;
				c1 = c0;
				pos = B[c1];
			}
			SA[--pos] = (j == 0 || chr(j - 1) > c1) ? ~j : j;
		} else {
			SA[i] = ~j;
		}
	}
}
|
||||
|
||||
/*
 * Find the suffix array SA of T[0..n-1] in {0..k-1}^n, using a working
 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet.
 *
 * fs is the number of spare ints available behind SA[n-1]; when k <= fs the
 * bucket arrays live there, otherwise they are allocated with malloc().
 * cs selects the symbol width of T (sizeof(int) for int symbols, else
 * unsigned char).
 *
 * Returns 0 on success and -2 when an allocation or a recursive call fails.
 */
static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
{
	int *C, *B, *RA;
	int i, j, m, p, q, plen, qlen, name;
	int c0, c1;
	int s_type; /* non-zero while the suffix to the right is classified as S-type */
	int differs;

	/* stage 1: reduce the problem by at least 1/2; sort all the
	 * S-substrings */
	if (k <= fs) {
		C = SA + n;
		B = (k <= fs - k) ? C + k : C;
	} else {
		C = B = (int *) malloc(k * sizeof(int));
		if (C == NULL) return -2;
	}
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1); /* find ends of buckets */
	for (i = 0; i < n; ++i) SA[i] = 0;
	s_type = 0;
	c1 = chr(n - 1);
	for (i = n - 2; 0 <= i; --i) {
		c0 = chr(i);
		if (c0 < c1 + s_type) {
			s_type = 1;
		} else if (s_type != 0) {
			SA[--B[c1]] = i + 1;
			s_type = 0;
		}
		c1 = c0;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);

	/* compact all the sorted substrings into the first m items of SA;
	 * 2*m must not be larger than n (provable) */
	m = 0;
	for (i = 0; i < n; ++i) {
		p = SA[i];
		if (p <= 0) continue;
		c0 = chr(p);
		if (chr(p - 1) <= c0) continue;
		j = p + 1;
		while (j < n && c0 == (c1 = chr(j))) ++j;
		if (j < n && c0 < c1) SA[m++] = p;
	}
	for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */

	/* store the length of all substrings */
	j = n;
	s_type = 0;
	c1 = chr(n - 1);
	for (i = n - 2; 0 <= i; --i) {
		c0 = chr(i);
		if (c0 < c1 + s_type) {
			s_type = 1;
		} else if (s_type != 0) {
			SA[m + ((i + 1) >> 1)] = j - i - 1;
			j = i + 1;
			s_type = 0;
		}
		c1 = c0;
	}

	/* find the lexicographic names of all substrings */
	name = 0;
	q = n;
	qlen = 0;
	for (i = 0; i < m; ++i) {
		p = SA[i];
		plen = SA[m + (p >> 1)];
		differs = 1;
		if (plen == qlen) {
			j = 0;
			while (j < plen && chr(p + j) == chr(q + j)) ++j;
			if (j == plen) differs = 0;
		}
		if (differs != 0) {
			++name;
			q = p;
			qlen = plen;
		}
		SA[m + (p >> 1)] = name;
	}

	/* stage 2: solve the reduced problem; recurse if names are not yet
	 * unique */
	if (name < m) {
		RA = SA + n + fs - m;
		j = m - 1;
		for (i = n - 1; m <= i; --i) {
			if (SA[i] != 0) RA[j--] = SA[i] - 1;
		}
		if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
		j = m - 1;
		s_type = 0;
		c1 = chr(n - 1);
		for (i = n - 2; 0 <= i; --i) {
			c0 = chr(i);
			if (c0 < c1 + s_type) {
				s_type = 1;
			} else if (s_type != 0) {
				RA[j--] = i + 1; /* get p1 */
				s_type = 0;
			}
			c1 = c0;
		}
		for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
	}

	/* stage 3: induce the result for the original problem */
	if (k <= fs) {
		C = SA + n;
		B = (k <= fs - k) ? C + k : C;
	} else {
		C = B = (int *) malloc(k * sizeof(int));
		if (C == NULL) return -2;
	}
	/* put all left-most S characters into their buckets */
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1); /* find ends of buckets */
	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
	for (i = m - 1; 0 <= i; --i) {
		j = SA[i];
		SA[i] = 0;
		SA[--B[chr(j)]] = j;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);
	return 0;
}
|
||||
|
||||
/**
|
||||
* Constructs the suffix array of a given string.
|
||||
* @param T[0..n-1] The input string.
|
||||
* @param SA[0..n] The output array of suffixes.
|
||||
* @param n The length of the given string.
|
||||
* @return 0 if no error occurred
|
||||
*/
|
||||
int is_sa(const ubyte_t *T, int *SA, int n)
|
||||
{
|
||||
if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
|
||||
SA[0] = n;
|
||||
if (n <= 1) {
|
||||
if (n == 1) SA[1] = 0;
|
||||
return 0;
|
||||
}
|
||||
return sais_main(T, SA+1, 0, n, 256, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs the burrows-wheeler transformed string of a given string.
|
||||
* @param T[0..n-1] The input string.
|
||||
* @param n The length of the given string.
|
||||
* @return The primary index if no error occurred, -1 or -2 otherwise.
|
||||
*/
|
||||
int is_bwt(ubyte_t *T, int n)
|
||||
{
|
||||
int *SA, i, primary = 0;
|
||||
SA = (int*)calloc(n+1, sizeof(int));
|
||||
|
||||
if (is_sa(T, SA, n)) return -1;
|
||||
|
||||
for (i = 0; i <= n; ++i) {
|
||||
if (SA[i] == 0) primary = i;
|
||||
else SA[i] = T[SA[i] - 1];
|
||||
}
|
||||
for (i = 0; i < primary; ++i) T[i] = SA[i];
|
||||
for (; i < n; ++i) T[i] = SA[i + 1];
|
||||
free(SA);
|
||||
return primary;
|
||||
}
|
||||
|
|
@ -0,0 +1,388 @@
|
|||
/*-
|
||||
* Copyright 1997-1999, 2001, John-Mark Gurney.
|
||||
* 2008-2009, Attractive Chaos <attractor@live.co.uk>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __AC_KBTREE_H
|
||||
#define __AC_KBTREE_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* 4-byte header found at the start of every B-tree node. */
typedef struct {
	int32_t is_internal:1; /* zero for a node without child pointers */
	int32_t n:31;          /* number of keys currently stored in the node */
} kbnode_t;
|
||||
|
||||
/* Key array of node x: it starts right after the 4-byte node header. */
#define __KB_KEY(type, x)	((type*)((char*)(x) + 4))
/* Child-pointer array of node x: it starts btr->off_ptr bytes into the node.
 * Arguments are parenthesized so that expressions such as `node + 1` or
 * `&tree` expand correctly. */
#define __KB_PTR(btr, x)	((kbnode_t**)((char*)(x) + (btr)->off_ptr))
|
||||
|
||||
/*
 * Declares the tree handle type kbtree_<name>_t.  The layout fields are
 * filled in by kb_init_<name>(); off_key is not written by any code visible
 * in this part of the header.
 */
#define __KB_TREE_T(name)												\
	typedef struct {													\
		kbnode_t *root;    /* root node */								\
		int off_key;													\
		int off_ptr;       /* byte offset of the child pointers */		\
		int ilen;          /* byte size of a node with child pointers */ \
		int elen;          /* off_ptr rounded up to a multiple of 4 */	\
		int n;             /* 2*t - 1 */								\
		int t;             /* derived from the requested block size */	\
		int n_keys;														\
		int n_nodes;       /* incremented when the root is allocated */	\
	} kbtree_##name##_t;
|
||||
|
||||
/*
 * Defines kb_init_<name>(size): allocate an empty tree whose nodes are sized
 * for blocks of `size` bytes.
 *
 * Returns the new tree, or 0 when `size` is too small to give t >= 2 or when
 * an allocation fails.  `size` is checked against the fixed per-node
 * overhead first because the subtraction below is carried out in size_t and
 * would otherwise wrap around for small values.
 */
#define __KB_INIT(name, key_t)											\
	kbtree_##name##_t *kb_init_##name(int size)							\
	{																	\
		kbtree_##name##_t *b;											\
		if (size < (int)(4 + sizeof(void*))) return 0;					\
		b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t));	\
		if (b == 0) return 0;											\
		b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
		if (b->t < 2) {													\
			free(b); return 0;											\
		}																\
		b->n = 2 * b->t - 1;											\
		b->off_ptr = 4 + b->n * sizeof(key_t);							\
		b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
		b->elen = (b->off_ptr + 3) >> 2 << 2;							\
		b->root = (kbnode_t*)calloc(1, b->ilen);						\
		if (b->root == 0) {												\
			free(b); return 0;											\
		}																\
		++b->n_nodes;													\
		return b;														\
	}
|
||||
|
||||
/*
 * Free every node of tree b and then b itself, walking the tree with an
 * explicit, growable stack.  Passing a null tree is allowed.
 *
 * Locals carry a double-underscore prefix (as in __kb_get_first) so that
 * they cannot capture identifiers used in the argument expression.  If the
 * stack cannot be allocated or grown, the nodes that could not be pushed
 * are leaked rather than written through a null pointer.
 */
#define __kb_destroy(b) do {											\
		int __i, __max = 8;												\
		kbnode_t *__x, **__top, **__stack = 0;							\
		if (b) {														\
			__top = __stack = (kbnode_t**)calloc(__max, sizeof(kbnode_t*)); \
			if (__stack != 0) *__top++ = (b)->root;						\
			while (__top != __stack) {									\
				__x = *--__top;											\
				if (__x == 0 || __x->is_internal == 0) { free(__x); continue; } \
				for (__i = 0; __i <= __x->n; ++__i)						\
					if (__KB_PTR((b), __x)[__i]) {						\
						if (__top - __stack == __max) {					\
							kbnode_t **__grown = (kbnode_t**)realloc(__stack, 2 * __max * sizeof(kbnode_t*)); \
							if (__grown == 0) break;					\
							__stack = __grown;							\
							__top = __stack + __max;					\
							__max <<= 1;								\
						}												\
						*__top++ = __KB_PTR((b), __x)[__i];				\
					}													\
				free(__x);												\
			}															\
		}																\
		free(b); free(__stack);											\
	} while (0)
|
||||
|
||||
/* Store the first key of the leftmost leaf of tree b into ret.
 *
 * The descent follows child 0 while the node is internal. It tests the node's
 * is_internal flag rather than reading the child slot, because leaves created
 * by __kb_split are allocated with elen bytes and have no pointer area.
 * For an empty tree ret receives whatever is stored in slot 0 of the root. */
#define __kb_get_first(key_t, b, ret) do { \
		kbnode_t *__x = (b)->root; \
		while (__x->is_internal) \
			__x = __KB_PTR((b), __x)[0]; \
		(ret) = __KB_KEY(key_t, __x)[0]; \
	} while (0)
|
||||
|
||||
/* Define __kb_get_aux_<name>(x, k, r): locate key *k inside node x by a
 * backward linear scan over one half of the key array.
 *
 * Returns -1 for an empty node, otherwise the index of the last key in the
 * scanned half that is not greater than *k (one below the start of that half
 * if there is none). When r is non-null it receives the result of the last
 * comparison performed. */
#define __KB_GET_AUX0(name, key_t, __cmp) \
	static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{ \
		int scratch, lo, hi, pos; \
		int *out = r ? r : &scratch; \
		if (x->n == 0) return -1; \
		pos = x->n >> 1; \
		/* One comparison with the middle key picks the half to scan. */ \
		if (__cmp(*k, __KB_KEY(key_t, x)[pos]) < 0) { \
			lo = 0; hi = pos; \
		} else { \
			lo = pos; hi = x->n - 1; \
		} \
		for (pos = hi; pos >= lo; --pos) { \
			*out = __cmp(*k, __KB_KEY(key_t, x)[pos]); \
			if (*out >= 0) break; \
		} \
		return pos; \
	}
|
||||
|
||||
/* Define __kb_getp_aux_<name>(x, k, r): locate key *k inside node x by binary search.
 *
 * Returns -1 for an empty node, otherwise the index of the last key that is
 * not greater than *k (-1 if every key is greater). When r is non-null it
 * receives the final comparison result; it is 0 exactly when the returned
 * slot holds a key equal to *k. */
#define __KB_GET_AUX1(name, key_t, __cmp) \
	static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{ \
		int scratch, lo = 0, hi = x->n; \
		int *out = r ? r : &scratch; \
		if (x->n == 0) return -1; \
		/* Lower bound: first index whose key is not less than *k. */ \
		while (lo < hi) { \
			int mid = (lo + hi) >> 1; \
			if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) lo = mid + 1; \
			else hi = mid; \
		} \
		if (lo == x->n) { /* *k is greater than every key in the node */ \
			*out = 1; \
			return x->n - 1; \
		} \
		*out = __cmp(*k, __KB_KEY(key_t, x)[lo]); \
		if (*out < 0) --lo; \
		return lo; \
	}
|
||||
|
||||
/* Define the lookup functions.
 *   kb_getp_<name>(b, k): walk from the root towards a leaf and return a
 *     pointer to the stored key equal to *k, or 0 when no such key exists.
 *   kb_get_<name>(b, k): same, with the key passed by value. */
#define __KB_GET(name, key_t) \
	static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		kbnode_t *node; \
		int slot, cmp = 0; \
		for (node = b->root; node != 0; node = __KB_PTR(b, node)[slot + 1]) { \
			slot = __kb_getp_aux_##name(node, k, &cmp); \
			if (slot >= 0 && cmp == 0) return &__KB_KEY(key_t, node)[slot]; \
			if (!node->is_internal) return 0; /* reached a leaf without a match */ \
		} \
		return 0; \
	} \
	static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		return kb_getp_##name(b, &k); \
	}
|
||||
|
||||
/* Define the neighbour-search functions.
 *   kb_intervalp_<name>(b, k, lower, upper): on an exact match both outputs
 *     point at the matching key. Otherwise *lower / *upper are the nearest
 *     keys below / above *k seen on the path from the root to a leaf, or 0
 *     when no such key was seen.
 *   kb_interval_<name>: same, with the key passed by value. */
#define __KB_INTERVAL(name, key_t) \
	static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
	{ \
		kbnode_t *node; \
		key_t *keys, *hit; \
		int slot, cmp = 0; \
		*upper = 0; \
		*lower = 0; \
		for (node = b->root; node != 0; node = __KB_PTR(b, node)[slot + 1]) { \
			slot = __kb_getp_aux_##name(node, k, &cmp); \
			keys = __KB_KEY(key_t, node); \
			if (slot >= 0) { \
				if (cmp == 0) { \
					hit = &keys[slot]; \
					*upper = hit; \
					*lower = hit; \
					return; \
				} \
				*lower = &keys[slot]; \
			} \
			if (slot + 1 < node->n) *upper = &keys[slot + 1]; \
			if (!node->is_internal) return; \
		} \
	} \
	static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
	{ \
		kb_intervalp_##name(b, &k, lower, upper); \
	}
|
||||
|
||||
/* Define the insertion functions.
 *   __kb_split_<name>(b, x, i, y): split the full child y (= child i of the
 *     internal node x) into y and a new right sibling, moving y's median key
 *     up into x at position i.
 *   __kb_putp_aux_<name>(b, x, k): insert *k below the non-full node x,
 *     splitting full children on the way down.
 *   kb_putp_<name>(b, k) / kb_put_<name>(b, k): public entry points; a full
 *     root is first split under a new root.
 * Allocation results are not checked and no duplicate check is made. */
#define __KB_PUT(name, key_t, __cmp) \
	/* x must be an internal node */ \
	static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
	{ \
		const int half = b->t - 1; /* keys kept by each half after the split */ \
		kbnode_t *right = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \
		++b->n_nodes; \
		right->is_internal = y->is_internal; \
		right->n = half; \
		/* Upper half of y's keys (and children) moves into the new sibling. */ \
		memcpy(__KB_KEY(key_t, right), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * half); \
		if (y->is_internal) \
			memcpy(__KB_PTR(b, right), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
		y->n = half; \
		/* Open a child slot at i + 1 and a key slot at i in the parent. */ \
		memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
		__KB_PTR(b, x)[i + 1] = right; \
		memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
		__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[half]; \
		++x->n; \
	} \
	static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
	{ \
		int pos; \
		/* Descend to a leaf, making sure every child we enter has room. */ \
		while (x->is_internal) { \
			pos = __kb_getp_aux_##name(x, k, 0) + 1; \
			if (__KB_PTR(b, x)[pos]->n == 2 * b->t - 1) { \
				__kb_split_##name(b, x, pos, __KB_PTR(b, x)[pos]); \
				if (__cmp(*k, __KB_KEY(key_t, x)[pos]) > 0) ++pos; \
			} \
			x = __KB_PTR(b, x)[pos]; \
		} \
		/* Leaf: shift the tail right by one and drop the key in. */ \
		pos = __kb_getp_aux_##name(x, k, 0); \
		if (pos != x->n - 1) \
			memmove(__KB_KEY(key_t, x) + pos + 2, __KB_KEY(key_t, x) + pos + 1, (x->n - pos - 1) * sizeof(key_t)); \
		__KB_KEY(key_t, x)[pos + 1] = *k; \
		++x->n; \
	} \
	static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		kbnode_t *start = b->root; \
		++b->n_keys; \
		if (start->n == 2 * b->t - 1) { /* full root: grow the tree by one level */ \
			kbnode_t *grown; \
			++b->n_nodes; \
			grown = (kbnode_t*)calloc(1, b->ilen); \
			b->root = grown; \
			grown->is_internal = 1; \
			grown->n = 0; \
			__KB_PTR(b, grown)[0] = start; \
			__kb_split_##name(b, grown, 0, start); \
			start = grown; \
		} \
		__kb_putp_aux_##name(b, start, k); \
	} \
	static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		kb_putp_##name(b, &k); \
	}
|
||||
|
||||
|
||||
/* Define the deletion functions.
 *   __kb_delp_aux_<name>(b, x, k, s): remove a key from the subtree rooted at
 *     x and return the removed key. s selects what to remove: 0 = the key
 *     equal to *k, 1 = the largest key of the subtree, 2 = the smallest key
 *     (k may be null when s != 0).
 *   kb_delp_<name>(b, k) / kb_del_<name>(b, k): public entry points; an
 *     internal root left with no keys is replaced by its only child.
 * Every node released here is also subtracted from b->n_nodes so the counter
 * stays in step with the allocations made by the insert path.
 * NOTE(review): the leaf branch removes the key at the located slot without
 * checking that it compared equal, so callers presumably must only delete
 * keys that are present - confirm against callers. */
#define __KB_DEL(name, key_t) \
	static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
	{ \
		int yn, zn, i, r = 0; \
		kbnode_t *xp, *y, *z; \
		key_t kp; \
		if (x == 0) return *k; \
		if (s) { /* s can only be 0, 1 or 2 */ \
			r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
			i = s == 1? x->n - 1 : -1; \
		} else i = __kb_getp_aux_##name(x, k, &r); \
		if (x->is_internal == 0) { /* leaf: close the gap over slot i */ \
			if (s == 2) ++i; \
			kp = __KB_KEY(key_t, x)[i]; \
			memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
			--x->n; \
			return kp; \
		} \
		if (r == 0) { /* the key sits in this internal node at slot i */ \
			if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { /* replace it with its predecessor */ \
				xp = __KB_PTR(b, x)[i]; \
				kp = __KB_KEY(key_t, x)[i]; \
				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
				return kp; \
			} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { /* replace it with its successor */ \
				xp = __KB_PTR(b, x)[i + 1]; \
				kp = __KB_KEY(key_t, x)[i]; \
				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
				return kp; \
			} else if (yn == b->t - 1 && zn == b->t - 1) { /* merge both children around the key */ \
				y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
				__KB_KEY(key_t, y)[y->n++] = *k; \
				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
				y->n += z->n; \
				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
				--x->n; \
				--b->n_nodes; \
				free(z); \
				return __kb_delp_aux_##name(b, y, k, s); \
			} \
		} \
		++i; \
		if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { /* child to enter is minimal: refill it first */ \
			if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { /* borrow from the left sibling */ \
				memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
				if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
				__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
				__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
				if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
				--y->n; ++xp->n; \
			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { /* borrow from the right sibling */ \
				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
				__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
				if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
				--y->n; \
				memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
			} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { /* merge into the left sibling */ \
				__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
				y->n += xp->n; \
				memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
				--x->n; \
				--b->n_nodes; \
				free(xp); \
				xp = y; \
			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { /* absorb the right sibling */ \
				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
				memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
				if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
				xp->n += y->n; \
				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
				--x->n; \
				--b->n_nodes; \
				free(y); \
			} \
		} \
		return __kb_delp_aux_##name(b, xp, k, s); \
	} \
	static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		kbnode_t *x; \
		key_t ret; \
		ret = __kb_delp_aux_##name(b, b->root, k, 0); \
		--b->n_keys; \
		if (b->root->n == 0 && b->root->is_internal) { /* the tree loses one level */ \
			--b->n_nodes; \
			x = b->root; \
			b->root = __KB_PTR(b, x)[0]; \
			free(x); \
		} \
		return ret; \
	} \
	static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		return kb_delp_##name(b, &k); \
	}
|
||||
|
||||
/* One frame of the explicit stack used by __kb_traverse:
 * x is the node being visited and i the index of the next child/key within it. */
typedef struct {
|
||||
kbnode_t *x;
|
||||
int i;
|
||||
} __kbstack_t;
|
||||
|
||||
/* Visit every key of tree b in order, calling __func(pointer_to_key) for each.
 *
 * The walk uses an explicit stack of __kbstack_t frames that doubles when
 * full. If the stack cannot be allocated the traversal does nothing; if it
 * cannot be grown the traversal stops early. Neither case dereferences a
 * null pointer. */
#define __kb_traverse(key_t, b, __func) do { \
		int __kmax = 8, __kfail = 0; \
		__kbstack_t *__kstack, *__kp, *__ktmp; \
		__kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
		if (__kstack) { \
			__kp->x = (b)->root; __kp->i = 0; \
			for (;;) { \
				/* Push the path down to the next unvisited child (0 below a leaf). */ \
				while (__kp->x && __kp->i <= __kp->x->n) { \
					if (__kp - __kstack == __kmax - 1) { \
						__ktmp = (__kbstack_t*)realloc(__kstack, 2 * (size_t)__kmax * sizeof(__kbstack_t)); \
						if (__ktmp == 0) { __kfail = 1; break; } \
						__kmax <<= 1; \
						__kstack = __ktmp; \
						__kp = __kstack + (__kmax>>1) - 1; \
					} \
					(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR((b), __kp->x)[__kp->i] : 0; \
					++__kp; \
				} \
				/* Stop on failure or once the root frame is exhausted. */ \
				if (__kfail || __kp == __kstack) break; \
				--__kp; \
				if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
				++__kp->i; \
			} \
		} \
		free(__kstack); \
	} while (0)
|
||||
|
||||
/* Instantiate a complete B-tree for key type key_t ordered by __cmp:
 * the kbtree_<name>_t type plus the init, lookup (binary-search helper
 * __KB_GET_AUX1), interval, insert and delete functions. */
#define KBTREE_INIT(name, key_t, __cmp) \
|
||||
__KB_TREE_T(name) \
|
||||
__KB_INIT(name, key_t) \
|
||||
__KB_GET_AUX1(name, key_t, __cmp) \
|
||||
__KB_GET(name, key_t) \
|
||||
__KB_INTERVAL(name, key_t) \
|
||||
__KB_PUT(name, key_t, __cmp) \
|
||||
__KB_DEL(name, key_t)
|
||||
|
||||
/* NOTE(review): presumably the default node size in bytes to pass to kb_init() - confirm. */
#define KB_DEFAULT_SIZE 512
|
||||
|
||||
/* Public front-ends: each one forwards to the function generated for <name>
 * by KBTREE_INIT. The *p variants take the key by pointer instead of by value. */
#define kbtree_t(name) kbtree_##name##_t
|
||||
#define kb_init(name, s) kb_init_##name(s)
|
||||
/* The name argument is unused; __kb_destroy works on any tree type. */
#define kb_destroy(name, b) __kb_destroy(b)
|
||||
#define kb_get(name, b, k) kb_get_##name(b, k)
|
||||
#define kb_put(name, b, k) kb_put_##name(b, k)
|
||||
#define kb_del(name, b, k) kb_del_##name(b, k)
|
||||
#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
|
||||
#define kb_getp(name, b, k) kb_getp_##name(b, k)
|
||||
#define kb_putp(name, b, k) kb_putp_##name(b, k)
|
||||
#define kb_delp(name, b, k) kb_delp_##name(b, k)
|
||||
#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
|
||||
|
||||
/* Number of keys currently stored in tree b. */
#define kb_size(b) ((b)->n_keys)
|
||||
|
||||
/* Three-way comparison using only operator <: 1 if a > b, -1 if a < b, 0 otherwise. */
#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
|
||||
/* Three-way comparison of two C strings via strcmp. */
#define kb_str_cmp(a, b) strcmp(a, b)
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,614 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
An example:
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_MAP_INIT_INT(32, char)
|
||||
int main() {
|
||||
int ret, is_missing;
|
||||
khiter_t k;
|
||||
khash_t(32) *h = kh_init(32);
|
||||
k = kh_put(32, h, 5, &ret);
|
||||
kh_value(h, k) = 10;
|
||||
k = kh_get(32, h, 10);
|
||||
is_missing = (k == kh_end(h));
|
||||
k = kh_get(32, h, 5);
|
||||
kh_del(32, h, k);
|
||||
for (k = kh_begin(h); k != kh_end(h); ++k)
|
||||
if (kh_exist(h, k)) kh_value(h, k) = 1;
|
||||
kh_destroy(32, h);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
2011-12-29 (0.2.7):
|
||||
|
||||
* Minor code clean up; no actual effect.
|
||||
|
||||
2011-09-16 (0.2.6):
|
||||
|
||||
* The capacity is a power of 2. This seems to dramatically improve the
|
||||
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
|
||||
|
||||
- http://code.google.com/p/ulib/
|
||||
- http://nothings.org/computer/judy/
|
||||
|
||||
* Allow to optionally use linear probing which usually has better
|
||||
performance for random input. Double hashing is still the default as it
|
||||
is more robust to certain non-random input.
|
||||
|
||||
* Added Wang's integer hash function (not used by default). This hash
|
||||
function is more robust to certain non-random input.
|
||||
|
||||
2011-02-14 (0.2.5):
|
||||
|
||||
* Allow to declare global functions.
|
||||
|
||||
2009-09-26 (0.2.4):
|
||||
|
||||
* Improve portability
|
||||
|
||||
2008-09-19 (0.2.3):
|
||||
|
||||
* Corrected the example
|
||||
* Improved interfaces
|
||||
|
||||
2008-09-11 (0.2.2):
|
||||
|
||||
* Improved speed a little in kh_put()
|
||||
|
||||
2008-09-10 (0.2.1):
|
||||
|
||||
* Added kh_clear()
|
||||
* Fixed a compiling error
|
||||
|
||||
2008-09-02 (0.2.0):
|
||||
|
||||
* Changed to token concatenation which increases flexibility.
|
||||
|
||||
2008-08-31 (0.1.2):
|
||||
|
||||
* Fixed a bug in kh_get(), which has not been tested previously.
|
||||
|
||||
2008-08-31 (0.1.1):
|
||||
|
||||
* Added destructor
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __AC_KHASH_H
|
||||
#define __AC_KHASH_H
|
||||
|
||||
/*!
|
||||
@header
|
||||
|
||||
Generic hash table library.
|
||||
*/
|
||||
|
||||
#define AC_VERSION_KHASH_H "0.2.6"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* compiler specific configuration */
|
||||
|
||||
#if UINT_MAX == 0xffffffffu
|
||||
typedef unsigned int khint32_t;
|
||||
#elif ULONG_MAX == 0xffffffffu
|
||||
typedef unsigned long khint32_t;
|
||||
#endif
|
||||
|
||||
#if ULONG_MAX == ULLONG_MAX
|
||||
typedef unsigned long khint64_t;
|
||||
#else
|
||||
typedef unsigned long long khint64_t;
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define kh_inline __inline
|
||||
#else
|
||||
#define kh_inline inline
|
||||
#endif
|
||||
|
||||
typedef khint32_t khint_t;
|
||||
typedef khint_t khiter_t;
|
||||
|
||||
/* Bucket state flags: two bits per bucket, sixteen buckets per 32-bit word.
 * Bit 1 (value 2) means "empty", bit 0 (value 1) means "deleted".
 * flag and i are parenthesized so that expression arguments index correctly. */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))
|
||||
|
||||
#ifdef KHASH_LINEAR
|
||||
#define __ac_inc(k, m) 1
|
||||
#else
|
||||
/* Probe step for double hashing: an odd value derived from hash k, reduced by mask m.
 * The whole expansion is parenthesized so it can be used inside larger expressions. */
#define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m))
|
||||
#endif
|
||||
|
||||
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
#ifndef kcalloc
|
||||
#define kcalloc(N,Z) calloc(N,Z)
|
||||
#endif
|
||||
#ifndef kmalloc
|
||||
#define kmalloc(Z) malloc(Z)
|
||||
#endif
|
||||
#ifndef krealloc
|
||||
#define krealloc(P,Z) realloc(P,Z)
|
||||
#endif
|
||||
#ifndef kfree
|
||||
#define kfree(P) free(P)
|
||||
#endif
|
||||
|
||||
/* Load-factor threshold: kh_resize sets upper_bound to n_buckets * __ac_HASH_UPPER + 0.5,
 * and kh_put rebuilds the table once n_occupied reaches upper_bound. */
static const double __ac_HASH_UPPER = 0.77;
|
||||
|
||||
/* Declare the table type kh_<name>_t. Field order is part of the layout. */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
	typedef struct { \
		khint_t n_buckets;	/* capacity */ \
		khint_t size;		/* live elements */ \
		khint_t n_occupied;	/* buckets that are not empty (live + deleted) */ \
		khint_t upper_bound;	/* n_occupied level that triggers a rebuild */ \
		khint32_t *flags;	/* two state bits per bucket */ \
		khkey_t *keys; \
		khval_t *vals; \
	} kh_##name##_t;
|
||||
|
||||
/* Declare (without defining) the seven functions that __KHASH_IMPL generates
 * for table <name>, for use in headers. */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
	kh_##name##_t *kh_init_##name(void); \
	void kh_destroy_##name(kh_##name##_t *h); \
	void kh_clear_##name(kh_##name##_t *h); \
	khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
	int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
	khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
	void kh_del_##name(kh_##name##_t *h, khint_t x);
|
||||
|
||||
/* Generate the implementation of hash table <name>.
 *
 *   kh_init_<name>()        allocate an empty table (NULL if kcalloc fails)
 *   kh_destroy_<name>(h)    free the table and its arrays; h may be NULL
 *   kh_clear_<name>(h)      mark every bucket empty without freeing memory
 *   kh_get_<name>(h, key)   bucket index of key, or h->n_buckets if absent
 *   kh_resize_<name>(h, n)  rehash into n buckets (rounded up to a power of
 *                           two, at least 4); returns -1 on allocation
 *                           failure, 0 otherwise
 *   kh_put_<name>(h, key, ret)  find or insert key; *ret is 0 if the key was
 *                           present, 1 if an empty bucket was used, 2 if a
 *                           deleted bucket was reused, -1 if resizing failed
 *   kh_del_<name>(h, x)     mark bucket x deleted if it holds an element
 *
 * SCOPE is the storage-class prefix of every function; kh_is_map selects
 * whether the vals array is maintained. */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	SCOPE kh_##name##_t *kh_init_##name(void) { \
		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
	} \
	SCOPE void kh_destroy_##name(kh_##name##_t *h) \
	{ \
		if (h) { \
			kfree((void *)h->keys); kfree(h->flags); \
			kfree((void *)h->vals); \
			kfree(h); \
		} \
	} \
	SCOPE void kh_clear_##name(kh_##name##_t *h) \
	{ \
		if (h && h->flags) { \
			/* 0xaa sets the "empty" bit of every bucket */ \
			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
			h->size = h->n_occupied = 0; \
		} \
	} \
	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	{ \
		if (h->n_buckets) { \
			khint_t inc, k, i, last, mask; \
			mask = h->n_buckets - 1; \
			k = __hash_func(key); i = k & mask; \
			inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				i = (i + inc) & mask; \
				if (i == last) return h->n_buckets; \
			} \
			return __ac_iseither(h->flags, i)? h->n_buckets : i; \
		} else return 0; \
	} \
	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
		khint32_t *new_flags = 0; \
		khint_t j = 1; \
		{ \
			kroundup32(new_n_buckets); \
			if (new_n_buckets < 4) new_n_buckets = 4; \
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
			else { /* hash table size to be changed (shrink or expand); rehash */ \
				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (!new_flags) return -1; \
				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (h->n_buckets < new_n_buckets) {	/* expand */ \
					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (!new_keys) { kfree(new_flags); return -1; } /* do not leak the working space */ \
					h->keys = new_keys; \
					if (kh_is_map) { \
						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
						if (!new_vals) { kfree(new_flags); return -1; } \
						h->vals = new_vals; \
					} \
				} /* otherwise shrink */ \
			} \
		} \
		if (j) { /* rehashing is needed */ \
			for (j = 0; j != h->n_buckets; ++j) { \
				if (__ac_iseither(h->flags, j) == 0) { \
					khkey_t key = h->keys[j]; \
					khval_t val; \
					khint_t new_mask; \
					new_mask = new_n_buckets - 1; \
					if (kh_is_map) val = h->vals[j]; \
					__ac_set_isdel_true(h->flags, j); \
					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
						khint_t inc, k, i; \
						k = __hash_func(key); \
						i = k & new_mask; \
						inc = __ac_inc(k, new_mask); \
						while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
						__ac_set_isempty_false(new_flags, i); \
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
						} else { /* write the element and jump out of the loop */ \
							h->keys[i] = key; \
							if (kh_is_map) h->vals[i] = val; \
							break; \
						} \
					} \
				} \
			} \
			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
				/* If a shrinking realloc fails the old, larger block is still valid: keep it. */ \
				khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (shrunk_keys) h->keys = shrunk_keys; \
				if (kh_is_map) { \
					khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
					if (shrunk_vals) h->vals = shrunk_vals; \
				} \
			} \
			kfree(h->flags); /* free the working space */ \
			h->flags = new_flags; \
			h->n_buckets = new_n_buckets; \
			h->n_occupied = h->size; \
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		} \
		return 0; \
	} \
	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{ \
		khint_t x; \
		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
			if (h->n_buckets > (h->size<<1)) { \
				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
					*ret = -1; return h->n_buckets; \
				} \
			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
				*ret = -1; return h->n_buckets; \
			} \
		} /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
		{ \
			khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
			else { \
				inc = __ac_inc(k, mask); last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i; \
					i = (i + inc) & mask; \
					if (i == last) { x = site; break; } \
				} \
				if (x == h->n_buckets) { \
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i; \
				} \
			} \
		} \
		if (__ac_isempty(h->flags, x)) { /* not present at all */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; ++h->n_occupied; \
			*ret = 1; \
		} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; \
			*ret = 2; \
		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
		return x; \
	} \
	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
	{ \
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
			__ac_set_isdel_true(h->flags, x); \
			--h->size; \
		} \
	}
|
||||
|
||||
/* Declare the table type and function prototypes only (no definitions). */
#define KHASH_DECLARE(name, khkey_t, khval_t) \
|
||||
__KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
|
||||
|
||||
/* Define the table type and all functions, with SCOPE as their storage-class prefix. */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
__KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
|
||||
|
||||
/* Same as KHASH_INIT2 with every function declared "static kh_inline". */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
|
||||
|
||||
/* --- BEGIN OF HASH FUNCTIONS --- */
|
||||
|
||||
/*! @function
|
||||
@abstract Integer hash function
|
||||
@param key The integer [khint32_t]
|
||||
@return The hash value [khint_t]
|
||||
*/
|
||||
#define kh_int_hash_func(key) (khint32_t)(key)
|
||||
/*! @function
|
||||
@abstract Integer comparison function
|
||||
*/
|
||||
#define kh_int_hash_equal(a, b) ((a) == (b))
|
||||
/*! @function
|
||||
@abstract 64-bit integer hash function
|
||||
@param key The integer [khint64_t]
|
||||
@return The hash value [khint_t]
|
||||
*/
|
||||
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
|
||||
/*! @function
|
||||
@abstract 64-bit integer comparison function
|
||||
*/
|
||||
#define kh_int64_hash_equal(a, b) ((a) == (b))
|
||||
/*! @function
|
||||
@abstract const char* hash function
|
||||
@param s Pointer to a null terminated string
|
||||
@return The hash value
|
||||
*/
|
||||
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
	/* Seed with the first character; an empty string hashes to 0. */
	khint_t hash = (khint_t)*s;
	if (hash == 0) return hash;
	/* hash = hash * 31 + next character, written as a shift and a subtract. */
	while (*++s != '\0')
		hash = (hash << 5) - hash + (khint_t)*s;
	return hash;
}
|
||||
/*! @function
|
||||
@abstract Another interface to const char* hash function
|
||||
@param key Pointer to a null terminated string [const char*]
|
||||
@return The hash value [khint_t]
|
||||
*/
|
||||
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
|
||||
/*! @function
|
||||
@abstract Const char* comparison function
|
||||
*/
|
||||
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
||||
|
||||
/* Integer mixing function: six add/xor rounds with shifted copies of the key. */
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
	key = key + ~(key << 15);
	key = key ^ (key >> 10);
	key = key + (key << 3);
	key = key ^ (key >> 6);
	key = key + ~(key << 11);
	key = key ^ (key >> 16);
	return key;
}
|
||||
/* Alternative integer hash based on __ac_Wang_hash. The macro parameter is
 * named key to match the body; it was previously declared as k and never used. */
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key))
|
||||
|
||||
/* --- END OF HASH FUNCTIONS --- */
|
||||
|
||||
/* Other convenient macros... */
|
||||
|
||||
/*!
|
||||
@abstract Type of the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define khash_t(name) kh_##name##_t
|
||||
|
||||
/*! @function
|
||||
@abstract Initiate a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@return Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_init(name) kh_init_##name()
|
||||
|
||||
/*! @function
|
||||
@abstract Destroy a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_destroy(name, h) kh_destroy_##name(h)
|
||||
|
||||
/*! @function
|
||||
@abstract Reset a hash table without deallocating memory.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_clear(name, h) kh_clear_##name(h)
|
||||
|
||||
/*! @function
|
||||
@abstract Resize a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param s New size [khint_t]
|
||||
*/
|
||||
#define kh_resize(name, h, s) kh_resize_##name(h, s)
|
||||
|
||||
/*! @function
|
||||
@abstract Insert a key to the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Key [type of keys]
|
||||
@param r Extra return code: 0 if the key is present in the hash table;
|
||||
1 if the bucket is empty (never used); 2 if the element in
|
||||
the bucket has been deleted [int*]
|
||||
@return Iterator to the inserted element [khint_t]
|
||||
*/
|
||||
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
|
||||
|
||||
/*! @function
|
||||
@abstract Retrieve a key from the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Key [type of keys]
|
||||
@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
|
||||
*/
|
||||
#define kh_get(name, h, k) kh_get_##name(h, k)
|
||||
|
||||
/*! @function
|
||||
@abstract Remove a key from the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Iterator to the element to be deleted [khint_t]
|
||||
*/
|
||||
#define kh_del(name, h, k) kh_del_##name(h, k)
|
||||
|
||||
/*! @function
|
||||
@abstract Test whether a bucket contains data.
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khint_t]
|
||||
@return 1 if containing data; 0 otherwise [int]
|
||||
*/
|
||||
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
|
||||
|
||||
/*! @function
|
||||
@abstract Get key given an iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khint_t]
|
||||
@return Key [type of keys]
|
||||
*/
|
||||
#define kh_key(h, x) ((h)->keys[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Get value given an iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khint_t]
|
||||
@return Value [type of values]
|
||||
@discussion For hash sets, calling this results in segfault.
|
||||
*/
|
||||
#define kh_val(h, x) ((h)->vals[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Alias of kh_val()
|
||||
*/
|
||||
#define kh_value(h, x) ((h)->vals[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Get the start iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return The start iterator [khint_t]
|
||||
*/
|
||||
#define kh_begin(h) (khint_t)(0)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the end iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return The end iterator [khint_t]
|
||||
*/
|
||||
#define kh_end(h) ((h)->n_buckets)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the number of elements in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return Number of elements in the hash table [khint_t]
|
||||
*/
|
||||
#define kh_size(h) ((h)->size)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the number of buckets in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return Number of buckets in the hash table [khint_t]
|
||||
*/
|
||||
#define kh_n_buckets(h) ((h)->n_buckets)
|
||||
|
||||
/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Expands to a brace-enclosed block; buckets for which
                kh_exist() is false are skipped.
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(kvar) = kh_key(h,__i); \
		(vvar) = kh_val(h,__i); \
		code; \
	} }
|
||||
|
||||
/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Expands to a brace-enclosed block; buckets for which
                kh_exist() is false are skipped.
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(vvar) = kh_val(h,__i); \
		code; \
	} }
|
||||
|
||||
/* More convenient interfaces */
|
||||
|
||||
/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
/* Key type used by the string-keyed instantiation macros (KHASH_*_INIT_STR). */
typedef const char *kh_cstr_t;
|
||||
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name) \
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
#endif /* __AC_KHASH_H */
|
||||
|
|
@ -0,0 +1,374 @@
|
|||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <ctype.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/types.h>
|
||||
#ifndef _WIN32
|
||||
#include <netdb.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define _KO_NO_NET
|
||||
#endif
|
||||
|
||||
#ifndef _KO_NO_NET
|
||||
/*
 * Wait up to 5 seconds for a descriptor to become ready.
 *   fd       descriptor to watch
 *   is_read  non-zero: wait until readable; zero: wait until writable
 * Returns the result of select(): >0 when ready, 0 on time-out, -1 on error
 * (the error is also reported with perror()).
 */
static int socket_wait(int fd, int is_read)
{
	fd_set fds, *fdr = 0, *fdw = 0;
	struct timeval tv;
	int ret;

	/* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE) */
	if (fd < 0 || fd >= FD_SETSIZE) {
		errno = EBADF;
		perror("select");
		return -1;
	}
	tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
	FD_ZERO(&fds);
	FD_SET(fd, &fds);
	if (is_read) fdr = &fds;
	else fdw = &fds;
	ret = select(fd + 1, fdr, fdw, 0, &tv);
	if (ret == -1) perror("select");
	return ret;
}
|
||||
|
||||
/*
 * Resolve host:port and open a connected TCP socket to the first address
 * returned by getaddrinfo().
 *   host  host name or numeric address
 *   port  service name or decimal port number
 * Returns the connected descriptor, or -1 on failure (a diagnostic is written
 * to stderr).  The socket is closed again on every failure path.
 */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	/* getaddrinfo() reports failures through its return value, not errno */
	if ((rc = getaddrinfo(host, port, &hints, &res)) != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1;
	}
	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) {
		perror("socket");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) {
		perror("connect");
		goto fail;
	}
	freeaddrinfo(res);
	return fd;

fail:
	if (fd != -1) close(fd);
	freeaddrinfo(res);
	return -1;
}
|
||||
|
||||
/*
 * Write exactly len bytes from buf to fd, retrying after short writes and
 * after EAGAIN/EWOULDBLOCK/EINTR.
 * Returns 0 once everything has been written, -1 on any other write error.
 */
static int write_bytes(int fd, const char *buf, size_t len)
{
	ssize_t bytes;
	do {
		bytes = write(fd, buf, len);
		if (bytes >= 0) {
			buf += bytes; /* continue after the part already written */
			len -= (size_t)bytes;
		} else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
			return -1;
		}
	} while (len > 0);

	return 0;
}
|
||||
|
||||
/*
 * Open an "http://" URL with a plain HTTP/1.0 GET request.
 *   fn  the URL; if the environment variable http_proxy is set, the request
 *       is sent to that proxy with the full URL as the request target
 * Returns a descriptor positioned right after the response header when the
 * status code is 200, -1 on any failure, and 0 when fn does not start with
 * "http://".
 */
static int http_open(const char *fn)
{
	char *p, *proxy, *q, *http_host = 0, *host = 0, *port = 0, *path = 0, *buf = 0;
	int fd = -1, ret, l;
	ssize_t bytes = 0, bufsz = 0x10000;

	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
	if (strstr(fn, "http://") != fn) return 0;
	// set ->http_host
	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
	l = p - fn - 7;
	http_host = calloc(l + 1, 1);
	if (http_host == 0) goto out;
	memcpy(http_host, fn + 7, l); /* calloc() already supplied the terminator */
	for (q = http_host; *q && *q != ':'; ++q);
	if (*q == ':') *q++ = 0;
	// get http_proxy
	proxy = getenv("http_proxy");
	// set host, port and path
	if (proxy == 0) {
		host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
		port = strdup(*q? q : "80");
		path = strdup(*p? p : "/");
	} else {
		host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
		if (host == 0) goto out;
		for (q = host; *q && *q != ':'; ++q);
		if (*q == ':') *q++ = 0;
		port = strdup(*q? q : "80");
		path = strdup(fn);
	}
	if (host == 0 || port == 0 || path == 0) goto out;

	/* connect; adapted from khttp_connect() in knetfile.c */
	fd = socket_connect(host, port);
	if (fd == -1) goto out;
	buf = calloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
	if (buf == 0) goto fail;
	l = snprintf(buf, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
	if (l < 0 || l >= bufsz) goto fail; /* request does not fit into the buffer */
	if (write_bytes(fd, buf, l) != 0) goto fail;
	l = 0;
retry:
	/* keep one byte in reserve for the terminating NUL written below */
	while (l < bufsz - 1 && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency
		if (buf[l] == '\n' && l >= 3)
			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
		++l;
	}
	if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry;

	buf[l] = 0;
	if (bytes < 0 || l < 14) goto fail; // prematured header
	ret = strtol(buf + 8, &p, 0); // HTTP return code
	if (ret == 200) goto out;
fail:
	close(fd);
	fd = -1;
out:
	free(buf); free(http_host); free(host); free(port); free(path);
	return fd;
}
|
||||
|
||||
/* State of one FTP control connection. */
typedef struct {
	int max_response; /* allocated size of response, in bytes */
	int ctrl_fd;      /* descriptor of the control connection */
	char *response;   /* most recent reply line; grown with realloc() */
} ftpaux_t;
|
||||
|
||||
/*
 * Read one complete reply from the FTP control connection into aux->response.
 * Lines are consumed until one starts with three digits not followed by '-'
 * (the last line of a possibly multi-line reply).
 * Returns the numeric reply code, 0 if the socket did not become readable,
 * or -1 if no complete line was read or memory ran out.
 */
static int kftp_get_response(ftpaux_t *aux)
{
	unsigned char c;
	int n = 0;
	char *p;
	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
	/* stop on end-of-stream (0) as well as on read errors (-1) */
	while (read(aux->ctrl_fd, &c, 1) > 0) { // FIXME: this is *VERY BAD* for unbuffered I/O
		if (n >= aux->max_response) {
			int cap = aux->max_response? aux->max_response<<1 : 256;
			char *tmp = realloc(aux->response, cap);
			if (tmp == 0) return -1; /* the old buffer stays owned by aux */
			aux->response = tmp;
			aux->max_response = cap;
		}
		aux->response[n++] = c;
		if (c == '\n') {
			if (n >= 4 && isdigit((unsigned char)aux->response[0])
				&& isdigit((unsigned char)aux->response[1])
				&& isdigit((unsigned char)aux->response[2])
				&& aux->response[3] != '-') break;
			n = 0; /* continuation line: discard it and read the next one */
			continue;
		}
	}
	if (n < 2) return -1;
	aux->response[n-2] = 0; /* cut the trailing "\r\n" */
	return strtol(aux->response, &p, 0);
}
|
||||
|
||||
/*
 * Send one command on the FTP control connection.
 *   cmd     command text as it goes on the wire (callers include "\r\n")
 *   is_get  non-zero to read the server reply afterwards
 * Returns -1 if the socket is not writable or the write fails; otherwise the
 * result of kftp_get_response() when is_get is set, else 0.
 */
static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
{
	if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
	if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1;
	if (!is_get) return 0;
	return kftp_get_response(aux);
}
|
||||
|
||||
static int ftp_open(const char *fn)
|
||||
{
|
||||
char *p, *host = 0, *port = 0, *retr = 0;
|
||||
char host2[80], port2[10];
|
||||
int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
|
||||
ftpaux_t aux;
|
||||
|
||||
/* parse URL */
|
||||
if (strstr(fn, "ftp://") != fn) return 0;
|
||||
for (p = (char*)fn + 6; *p && *p != '/'; ++p);
|
||||
if (*p != '/') return 0;
|
||||
l = p - fn - 6;
|
||||
port = strdup("21");
|
||||
host = calloc(l + 1, 1);
|
||||
strncpy(host, fn + 6, l);
|
||||
retr = calloc(strlen(p) + 8, 1);
|
||||
sprintf(retr, "RETR %s\r\n", p);
|
||||
|
||||
/* connect to ctrl */
|
||||
memset(&aux, 0, sizeof(ftpaux_t));
|
||||
aux.ctrl_fd = socket_connect(host, port);
|
||||
if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
|
||||
|
||||
/* connect to the data stream */
|
||||
kftp_get_response(&aux);
|
||||
kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
|
||||
kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
|
||||
kftp_send_cmd(&aux, "TYPE I\r\n", 1);
|
||||
kftp_send_cmd(&aux, "PASV\r\n", 1);
|
||||
for (p = aux.response; *p && *p != '('; ++p);
|
||||
if (*p != '(') goto ftp_open_end;
|
||||
++p;
|
||||
sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
|
||||
memcpy(pasv_ip, v, 4 * sizeof(int));
|
||||
pasv_port = (v[4]<<8&0xff00) + v[5];
|
||||
kftp_send_cmd(&aux, retr, 0);
|
||||
sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
|
||||
sprintf(port2, "%d", pasv_port);
|
||||
fd = socket_connect(host2, port2);
|
||||
if (fd == -1) goto ftp_open_end;
|
||||
ret = kftp_get_response(&aux);
|
||||
if (ret != 150) {
|
||||
close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
close(aux.ctrl_fd);
|
||||
|
||||
ftp_open_end:
|
||||
free(host); free(port); free(retr); free(aux.response);
|
||||
return fd;
|
||||
}
|
||||
#endif /* !defined(_KO_NO_NET) */
|
||||
|
||||
/*
 * Split a command line on white space into a NULL-terminated argv array.
 * No quoting or escaping is interpreted.
 * Returns NULL if cmd contains only white space or memory runs out.
 * All arguments live in one allocation: release with free(argv[0]) followed
 * by free(argv).
 */
static char **cmd2argv(const char *cmd)
{
	int i, beg, end, argc;
	char **argv, *str;

	/* trim trailing, then leading white space: the command is cmd[beg..end) */
	end = strlen(cmd);
	for (i = end - 1; i >= 0; --i)
		if (!isspace((unsigned char)cmd[i])) break;
	end = i + 1;
	for (beg = 0; beg < end; ++beg)
		if (!isspace((unsigned char)cmd[beg])) break;
	if (beg == end) return 0;
	/* count word boundaries; there is one more argument than boundaries */
	for (i = beg + 1, argc = 0; i < end; ++i)
		if (isspace((unsigned char)cmd[i]) && !isspace((unsigned char)cmd[i-1]))
			++argc;
	argv = (char**)calloc(argc + 2, sizeof(char*));
	str = (char*)calloc(end - beg + 1, 1);
	if (argv == 0 || str == 0) {
		free(argv); free(str);
		return 0;
	}
	argv[0] = str;
	memcpy(str, cmd + beg, end - beg);
	/* turn separators into terminators and record where each word starts */
	for (i = argc = 1; i < end - beg; ++i)
		if (isspace((unsigned char)str[i])) str[i] = 0;
		else if (str[i-1] == 0) argv[argc++] = &str[i];
	return argv;
}
|
||||
|
||||
/* Kinds of input source, stored in koaux_t.type */
#define KO_STDIN    1
#define KO_FILE     2
#define KO_PIPE     3
#define KO_HTTP     4
#define KO_FTP      5
|
||||
|
||||
/* Handle returned by kopen() and consumed by kclose(). */
typedef struct {
	int type;  /* one of the KO_* constants */
	int fd;    /* descriptor handed back to the caller */
	pid_t pid; /* child process id; set for KO_PIPE only */
} koaux_t;
|
||||
|
||||
void *kopen(const char *fn, int *_fd)
|
||||
{
|
||||
koaux_t *aux = 0;
|
||||
*_fd = -1;
|
||||
if (strstr(fn, "http://") == fn) {
|
||||
aux = calloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_HTTP;
|
||||
aux->fd = http_open(fn);
|
||||
} else if (strstr(fn, "ftp://") == fn) {
|
||||
aux = calloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_FTP;
|
||||
aux->fd = ftp_open(fn);
|
||||
} else if (strcmp(fn, "-") == 0) {
|
||||
aux = calloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_STDIN;
|
||||
aux->fd = STDIN_FILENO;
|
||||
} else {
|
||||
const char *p, *q;
|
||||
for (p = fn; *p; ++p)
|
||||
if (!isspace(*p)) break;
|
||||
if (*p == '<') { // pipe open
|
||||
int need_shell, pfd[2];
|
||||
pid_t pid;
|
||||
// a simple check to see if we need to invoke a shell; not always working
|
||||
for (q = p + 1; *q; ++q)
|
||||
if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
|
||||
break;
|
||||
need_shell = (*q != 0);
|
||||
if (pipe(pfd) != 0) return 0;
|
||||
pid = vfork();
|
||||
if (pid == -1) { /* vfork() error */
|
||||
close(pfd[0]); close(pfd[1]);
|
||||
return 0;
|
||||
}
|
||||
if (pid == 0) { /* the child process */
|
||||
char **argv; /* FIXME: I do not know if this will lead to a memory leak */
|
||||
close(pfd[0]);
|
||||
dup2(pfd[1], STDOUT_FILENO);
|
||||
close(pfd[1]);
|
||||
if (!need_shell) {
|
||||
argv = cmd2argv(p + 1);
|
||||
execvp(argv[0], argv);
|
||||
free(argv[0]); free(argv);
|
||||
} else execl("/bin/sh", "sh", "-c", p + 1, NULL);
|
||||
exit(1);
|
||||
} else { /* parent process */
|
||||
close(pfd[1]);
|
||||
aux = calloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_PIPE;
|
||||
aux->fd = pfd[0];
|
||||
aux->pid = pid;
|
||||
}
|
||||
} else {
|
||||
#ifdef _WIN32
|
||||
*_fd = open(fn, O_RDONLY | O_BINARY);
|
||||
#else
|
||||
*_fd = open(fn, O_RDONLY);
|
||||
#endif
|
||||
if (*_fd >= 0) {
|
||||
aux = calloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_FILE;
|
||||
aux->fd = *_fd;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (aux) *_fd = aux->fd;
|
||||
return aux;
|
||||
}
|
||||
|
||||
int kclose(void *a)
|
||||
{
|
||||
koaux_t *aux = (koaux_t*)a;
|
||||
if (aux->type == KO_PIPE) {
|
||||
int status;
|
||||
pid_t pid;
|
||||
pid = waitpid(aux->pid, &status, WNOHANG);
|
||||
if (pid != aux->pid) kill(aux->pid, 15);
|
||||
}
|
||||
free(aux);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef _KO_MAIN
|
||||
#define BUF_SIZE 0x10000
/*
 * Demo driver: copy the source named by argv[1] (anything kopen() accepts)
 * to standard output.  Returns 0 on success, 1 on usage, open or write error.
 */
int main(int argc, char *argv[])
{
	void *x;
	int fd, rc = 0;
	size_t l;
	unsigned char buf[BUF_SIZE];
	FILE *fp;
	if (argc == 1) {
		fprintf(stderr, "Usage: kopen <file>\n");
		return 1;
	}
	x = kopen(argv[1], &fd);
	fp = (x && fd >= 0)? fdopen(fd, "r") : 0;
	if (fp == 0) {
		fprintf(stderr, "ERROR: fail to open the input\n");
		if (x) kclose(x);
		return 1;
	}
	while ((l = fread(buf, 1, BUF_SIZE, fp)) > 0) {
		if (fwrite(buf, 1, l, stdout) != l) { /* output error */
			rc = 1;
			break;
		}
	}
	fclose(fp);
	kclose(x);
	return rc;
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,239 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Last Modified: 05MAR2012 */
|
||||
|
||||
#ifndef AC_KSEQ_H
|
||||
#define AC_KSEQ_H
|
||||
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* Delimiter classes understood by ks_getuntil(); any value above KS_SEP_MAX
 * is taken as a literal delimiter character. */
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB   1 // isspace() && !' '
#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX   2
|
||||
|
||||
/* Declare kstream_t, a buffered reader over a handle of type type_t.
 * buf[begin..end) holds the bytes read but not yet consumed. */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		unsigned char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;
|
||||
|
||||
/* True once the reader hit end of input and the buffer has been drained. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Forget buffered data and the EOF flag; the underlying handle is untouched. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
|
||||
|
||||
/* Define ks_init()/ks_destroy() for kstream_t.
 * ks_init() returns NULL when memory cannot be allocated; ks_destroy()
 * accepts NULL and does not close the underlying handle. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == 0) return 0; \
		ks->f = f; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == 0) { free(ks); return 0; } \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
|
||||
|
||||
/* Define ks_getc(): return the next byte (0..255), or -1 at end of input.
 * A failing reader (negative return) is treated as end of input. */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; return -1; } \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
|
||||
|
||||
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable string: l bytes in use, m bytes allocated at s. */
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
|
||||
|
||||
#ifndef kroundup32
/* Round x up, in place, to the next power of two (32-bit bit smearing). */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
|
||||
|
||||
/* Define ks_getuntil2()/ks_getuntil(): read up to the next delimiter.
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal character
 *              (any value above KS_SEP_MAX)
 *   str        receives the text without the delimiter, NUL-terminated
 *   dret       if non-NULL, receives the delimiter found, or 0 if none
 *   append     non-zero appends to str instead of overwriting it
 * Returns the length of str, or -1 if nothing could be read before the end
 * of input.  A failing reader (negative return) is treated as end of input. */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, __bufsize); \
					if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; break; } \
				} else break; \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
|
||||
|
||||
/* Instantiate the kstream_t type and its functions for one handle type.
 *   type_t     type of the underlying handle
 *   __read     reader called as __read(handle, buffer, size)
 *   __bufsize  size of the internal buffer in bytes */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
|
||||
|
||||
/* Reset parser state: clears last_char and the stream's is_eof/begin/end.
 * The macro itself does not touch the underlying handle. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
|
||||
|
||||
/* Define kseq_init()/kseq_destroy() with linkage SCOPE.
 * kseq_init() returns NULL when memory cannot be allocated; kseq_destroy()
 * accepts NULL and does not close the underlying handle. */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == 0) return 0; \
		s->f = ks_init(fd); \
		if (s->f == 0) { free(s); return 0; } \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
|
||||
|
||||
/* Define kseq_read(): parse the next FASTA/FASTQ record into seq.
   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
|
||||
|
||||
/* Declare kseq_t: one parsed record plus the parser state. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
|
||||
|
||||
/* Instantiate the whole parser (stream with a 16384-byte buffer, kseq_t,
 * kseq_init/kseq_destroy/kseq_read) with linkage SCOPE. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)
|
||||
|
||||
/* Instantiate the parser with file-local (static) functions. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
|
||||
|
||||
/* Declare the types and prototypes only, for translation units that use a
 * parser instantiated elsewhere with external linkage. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,273 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
2008-11-16 (0.1.4):
|
||||
|
||||
* Fixed a bug in introsort() that happens in rare cases.
|
||||
|
||||
2008-11-05 (0.1.3):
|
||||
|
||||
* Fixed a bug in introsort() for complex comparisons.
|
||||
|
||||
* Fixed a bug in mergesort(). The previous version is not stable.
|
||||
|
||||
2008-09-15 (0.1.2):
|
||||
|
||||
* Accelerated introsort. On my Mac (not on another Linux machine),
|
||||
my implementation is as fast as std::sort on random input.
|
||||
|
||||
* Added combsort and in introsort, switch to combsort if the
|
||||
recursion is too deep.
|
||||
|
||||
2008-09-13 (0.1.1):
|
||||
|
||||
* Added k-small algorithm
|
||||
|
||||
2008-09-05 (0.1.0):
|
||||
|
||||
* Initial version
|
||||
|
||||
*/
|
||||
|
||||
#ifndef AC_KSORT_H
|
||||
#define AC_KSORT_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* Pending sub-range on the explicit stack used by ks_introsort_*(). */
typedef struct {
	void *left, *right; /* first and last element of the range (inclusive) */
	int depth;          /* remaining depth budget saved with the range */
} ks_isort_stack_t;
|
||||
|
||||
/* Swap two lvalues of type type_t; expands to a brace-enclosed block. */
#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
|
||||
|
||||
/* Instantiate the sorting routines for one element type.
 *   name       suffix appended to every generated function name
 *   type_t     element type
 *   __sort_lt  strict "less than" predicate taking two elements
 * Generated: ks_mergesort_, ks_heapadjust_, ks_heapmake_, ks_heapsort_,
 * ks_combsort_, ks_introsort_ and ks_ksmall_ (each followed by name). */
#define KSORT_INIT(name, type_t, __sort_lt) \
	/* Merge sort; temp may be NULL, in which case a scratch array is allocated. */ \
	void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
	{ \
		type_t *a2[2], *a, *b; \
		int curr, shift; \
		\
		a2[0] = array; \
		a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
		for (curr = 0, shift = 0; ((size_t)1<<shift) < n; ++shift) { \
			a = a2[curr]; b = a2[1-curr]; \
			if (shift == 0) { \
				type_t *p = b, *i, *eb = a + n; \
				for (i = a; i < eb; i += 2) { \
					if (i == eb - 1) *p++ = *i; \
					else { \
						if (__sort_lt(*(i+1), *i)) { \
							*p++ = *(i+1); *p++ = *i; \
						} else { \
							*p++ = *i; *p++ = *(i+1); \
						} \
					} \
				} \
			} else { \
				size_t i, step = (size_t)1<<shift; \
				for (i = 0; i < n; i += step<<1) { \
					type_t *p, *j, *k, *ea, *eb; \
					if (n < i + step) { \
						ea = a + n; eb = a; \
					} else { \
						ea = a + i + step; \
						eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
					} \
					j = a + i; k = a + i + step; p = b + i; \
					while (j < ea && k < eb) { \
						if (__sort_lt(*k, *j)) *p++ = *k++; \
						else *p++ = *j++; \
					} \
					while (j < ea) *p++ = *j++; \
					while (k < eb) *p++ = *k++; \
				} \
			} \
			curr = 1 - curr; \
		} \
		if (curr == 1) { /* result ended up in the scratch array: copy it back */ \
			type_t *p = a2[0], *i = a2[1], *eb = array + n; \
			for (; p < eb; ++i) *p++ = *i; \
		} \
		if (temp == 0) free(a2[1]); \
	} \
	/* Sift l[i] down in a max-heap of n elements. */ \
	void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
	{ \
		size_t k = i; \
		type_t tmp = l[i]; \
		while ((k = (k << 1) + 1) < n) { \
			if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
			if (__sort_lt(l[k], tmp)) break; \
			l[i] = l[k]; i = k; \
		} \
		l[i] = tmp; \
	} \
	/* Turn l[0..lsize) into a max-heap. */ \
	void ks_heapmake_##name(size_t lsize, type_t l[]) \
	{ \
		size_t i; \
		for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
			ks_heapadjust_##name(i, lsize, l); \
	} \
	/* Sort an array that already is a max-heap (see ks_heapmake_). */ \
	void ks_heapsort_##name(size_t lsize, type_t l[]) \
	{ \
		size_t i; \
		if (lsize == 0) return; /* lsize - 1 below would wrap around */ \
		for (i = lsize - 1; i > 0; --i) { \
			type_t tmp; \
			tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
		} \
	} \
	/* Insertion sort of the range [s, t). */ \
	static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
	{ \
		type_t *i, *j, swap_tmp; \
		for (i = s + 1; i < t; ++i) \
			for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
				swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
			} \
	} \
	void ks_combsort_##name(size_t n, type_t a[]) \
	{ \
		const double shrink_factor = 1.2473309501039786540366528676643; \
		int do_swap; \
		size_t gap = n; \
		type_t tmp, *i, *j; \
		do { \
			if (gap > 2) { \
				gap = (size_t)(gap / shrink_factor); \
				if (gap == 9 || gap == 10) gap = 11; \
			} \
			do_swap = 0; \
			for (i = a; i < a + n - gap; ++i) { \
				j = i + gap; \
				if (__sort_lt(*j, *i)) { \
					tmp = *i; *i = *j; *j = tmp; \
					do_swap = 1; \
				} \
			} \
		} while (do_swap || gap > 2); \
		if (gap != 1) __ks_insertsort_##name(a, a + n); \
	} \
	void ks_introsort_##name(size_t n, type_t a[]) \
	{ \
		int d; \
		ks_isort_stack_t *top, *stack; \
		type_t rp, swap_tmp; \
		type_t *s, *t, *i, *j, *k; \
		\
		if (n < 1) return; \
		else if (n == 2) { \
			if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
			return; \
		} \
		for (d = 2; (size_t)1<<d < n; ++d); \
		stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
		if (stack == 0) { /* no memory for the range stack: sort in place */ \
			ks_combsort_##name(n, a); \
			return; \
		} \
		top = stack; s = a; t = a + (n-1); d <<= 1; \
		while (1) { \
			if (s < t) { \
				if (--d == 0) { /* depth budget used up: finish this range with combsort */ \
					ks_combsort_##name(t - s + 1, s); \
					t = s; \
					continue; \
				} \
				i = s; j = t; k = i + ((j-i)>>1) + 1; \
				if (__sort_lt(*k, *i)) { \
					if (__sort_lt(*k, *j)) k = j; \
				} else k = __sort_lt(*j, *i)? i : j; \
				rp = *k; \
				if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
				for (;;) { \
					do ++i; while (__sort_lt(*i, rp)); \
					do --j; while (i <= j && __sort_lt(rp, *j)); \
					if (j <= i) break; \
					swap_tmp = *i; *i = *j; *j = swap_tmp; \
				} \
				swap_tmp = *i; *i = *t; *t = swap_tmp; \
				if (i-s > t-i) { \
					if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
					s = t-i > 16? i+1 : t; \
				} else { \
					if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
					t = i-s > 16? i-1 : s; \
				} \
			} else { \
				if (top == stack) { \
					free(stack); \
					__ks_insertsort_##name(a, a+n); \
					return; \
				} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
			} \
		} \
	} \
	/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
	/* 0 <= kk < n */ \
	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
	{ \
		type_t *low, *high, *k, *ll, *hh, *mid; \
		low = arr; high = arr + n - 1; k = arr + kk; \
		for (;;) { \
			if (high <= low) return *k; \
			if (high == low + 1) { \
				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
				return *k; \
			} \
			mid = low + (high - low) / 2; \
			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
			KSORT_SWAP(type_t, *mid, *(low+1)); \
			ll = low + 1; hh = high; \
			for (;;) { \
				do ++ll; while (__sort_lt(*ll, *low)); \
				do --hh; while (__sort_lt(*low, *hh)); \
				if (hh < ll) break; \
				KSORT_SWAP(type_t, *ll, *hh); \
			} \
			KSORT_SWAP(type_t, *low, *hh); \
			if (hh <= k) low = ll; \
			if (hh >= k) high = hh - 1; \
		} \
	}
|
||||
|
||||
/* Dispatch macros: each forwards to the function generated for `name`
 * by token-pasting the name onto the algorithm prefix. */
#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
|
||||
#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
|
||||
#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
|
||||
#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
|
||||
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
|
||||
#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
|
||||
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
|
||||
|
||||
/* "Less than" predicates: plain operator< and strcmp-based string order. */
#define ks_lt_generic(a, b) ((a) < (b))
|
||||
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
|
||||
|
||||
/* Element type used by the string instantiation below. */
typedef const char *ksstr_t;
|
||||
|
||||
/* Ready-made instantiations: KSORT_INIT_GENERIC uses the type name itself as
 * the suffix and compares with `<`; KSORT_INIT_STR uses the suffix `str` and
 * compares ksstr_t values with strcmp. */
#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
|
||||
#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "kstring.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/**
 * Append printf-style formatted text to a kstring.
 *
 * The text is first formatted into the free space after s->l. If it does not
 * fit (including the terminating NUL), the buffer is enlarged and the text is
 * formatted a second time.
 *
 * @param s    destination string; s->s may be NULL when s->m is 0
 * @param fmt  printf-style format string
 *
 * @return number of characters appended (excluding the NUL), or a negative
 *         value if formatting or reallocation failed; on failure s->l and the
 *         existing buffer are left untouched.
 */
int ksprintf(kstring_t *s, const char *fmt, ...)
{
	va_list ap;
	int l;
	size_t avail = s->m - s->l;

	/* First attempt: every va_start is paired with exactly one va_end. */
	va_start(ap, fmt);
	l = vsnprintf(s->s? s->s + s->l : NULL, s->s? avail : 0, fmt, ap);
	va_end(ap);
	if (l < 0) return l; /* encoding/format error: do not touch s->l */

	if ((size_t)l + 1 > avail || s->s == NULL) {
		size_t m = s->l + l + 2;
		char *tmp;
		kroundup32(m);
		tmp = (char*)realloc(s->s, m);
		if (tmp == NULL) return -1; /* keep the old buffer valid */
		s->s = tmp;
		s->m = m;
		/* The argument list was consumed above; restart it for the retry. */
		va_start(ap, fmt);
		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
		va_end(ap);
		if (l < 0) return l;
	}
	s->l += l;
	return l;
}
|
||||
|
||||
#ifdef KSTRING_MAIN
|
||||
#include <stdio.h>
|
||||
int main()
|
||||
{
|
||||
kstring_t *s;
|
||||
s = (kstring_t*)calloc(1, sizeof(kstring_t));
|
||||
ksprintf(s, "abcdefg: %d", 100);
|
||||
printf("%s\n", s->s);
|
||||
free(s);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
#ifndef KSTRING_H
|
||||
#define KSTRING_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* Round x up in place to the next power of two by smearing the highest set
 * bit of (x - 1) into all lower positions and adding one. The shifts stop at
 * 16, so the smear only covers 32 bits. x must be an lvalue and is evaluated
 * several times. */
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
/* Growable character buffer. The KSTRING_T guard keeps the typedef from being
 * emitted twice when another header defines the same macro. */
#ifndef KSTRING_T
|
||||
#define KSTRING_T kstring_t
|
||||
typedef struct __kstring_t {
|
||||
/* l: bytes currently stored (the kput* helpers write a 0 at s[l]);
 * m: size most recently passed to realloc for s */
size_t l, m;
|
||||
/* heap buffer, grown with realloc */
char *s;
|
||||
} kstring_t;
|
||||
#endif
|
||||
|
||||
/* Ensure the buffer of `s` has a capacity of at least `size` bytes.
 * Nothing happens when the current capacity already suffices; otherwise the
 * capacity becomes `size` rounded up by kroundup32 and the buffer is
 * reallocated. */
static inline void ks_resize(kstring_t *s, size_t size)
{
	if (size <= s->m)
		return;
	s->m = size;
	kroundup32(s->m);
	s->s = (char*)realloc(s->s, s->m);
}
|
||||
|
||||
/* Append the first `l` bytes of `p` to `s` and NUL-terminate the result.
 * The buffer is grown first when the appended bytes plus terminator would
 * reach the current capacity. Returns `l`. */
static inline int kputsn(const char *p, int l, kstring_t *s)
{
	const size_t with_nul = s->l + l + 1;

	if (with_nul >= s->m) {
		s->m = with_nul + 1;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	memcpy(s->s + s->l, p, l);
	s->l += l;
	s->s[s->l] = 0;
	return l;
}
|
||||
|
||||
/* Append the NUL-terminated string `p` to `s`; returns what kputsn returns
 * for its length. */
static inline int kputs(const char *p, kstring_t *s)
{
	const int len = strlen(p);
	return kputsn(p, len, s);
}
|
||||
|
||||
/* Append the single character `c` to `s`, keeping the buffer
 * NUL-terminated. Returns `c`. */
static inline int kputc(int c, kstring_t *s)
{
	const size_t new_len = s->l + 1;

	if (new_len >= s->m) {
		s->m = new_len + 1;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	s->s[s->l] = c;
	s->l = new_len;
	s->s[new_len] = 0;
	return c;
}
|
||||
|
||||
/* Append the decimal representation of `c` to `s`.
 *
 * The magnitude is computed in unsigned arithmetic so that INT_MIN, whose
 * negation does not fit in an int, is handled without signed overflow.
 *
 * Returns 0, except for c == 0 where the return value of kputc('0', s) is
 * passed through (unchanged from the historical behaviour). */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16];
	int l = 0, i;
	unsigned x;

	if (c == 0) return kputc('0', s);
	/* 0u - (unsigned)c is well defined for every negative int */
	x = c < 0? 0u - (unsigned)c : (unsigned)c;
	for (; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* digits, least significant first */
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* emit in reading order */
	s->s[s->l] = 0;
	return 0;
}
|
||||
|
||||
/* Append the decimal representation of the unsigned value `c` to `s`.
 * Returns 0, except for c == 0 where the result of kputc('0', s) is
 * returned. */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char digits[16];
	int n_digits = 0, pos;
	unsigned rest = c;

	if (c == 0)
		return kputc('0', s);
	/* collect digits, least significant first */
	while (rest > 0) {
		digits[n_digits++] = rest%10 + '0';
		rest /= 10;
	}
	if (s->l + n_digits + 1 >= s->m) {
		s->m = s->l + n_digits + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	/* copy them out in reading order */
	pos = n_digits;
	while (--pos >= 0)
		s->s[s->l++] = digits[pos];
	s->s[s->l] = 0;
	return 0;
}
|
||||
|
||||
/* Append the decimal representation of the long `c` to `s`.
 *
 * The magnitude is computed in unsigned long arithmetic so that LONG_MIN,
 * whose negation does not fit in a long, is handled without signed overflow.
 *
 * Returns 0, except for c == 0 where the return value of kputc('0', s) is
 * passed through (unchanged from the historical behaviour). */
static inline int kputl(long c, kstring_t *s)
{
	char buf[32];
	int l = 0, i;
	unsigned long x;

	if (c == 0) return kputc('0', s);
	/* 0ul - (unsigned long)c is well defined for every negative long */
	x = c < 0? 0ul - (unsigned long)c : (unsigned long)c;
	for (; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* digits, least significant first */
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* emit in reading order */
	s->s[s->l] = 0;
	return 0;
}
|
||||
|
||||
int ksprintf(kstring_t *s, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,749 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#if defined __SSE2__
|
||||
#include <emmintrin.h>
|
||||
#elif defined __ARM_NEON
|
||||
#include "neon_sse.h"
|
||||
#else
|
||||
#include "scalar_sse.h"
|
||||
#endif
|
||||
#include "ksw.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* Branch hints: with GCC-compatible compilers the condition is wrapped in
 * __builtin_expect; otherwise the macros expand to the bare condition. */
#ifdef __GNUC__
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
#define UNLIKELY(x) __builtin_expect((x),0)
|
||||
#else
|
||||
#define LIKELY(x) (x)
|
||||
#define UNLIKELY(x) (x)
|
||||
#endif
|
||||
|
||||
/* Default result: first member 0, the remaining six members -1. ksw_u8() and
 * ksw_i16() start from a copy of this value. */
const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
|
||||
|
||||
/* Query profile and DP work buffers shared by ksw_u8() and ksw_i16();
 * filled in by ksw_qinit(). */
struct _kswq_t {
|
||||
/* qlen: query length; slen: number of __m128i vectors per row */
int qlen, slen;
|
||||
/* shift: 256 minus the minimum matrix score (mod 256); mdiff: max score plus
 * shift; max: maximum matrix score; size: bytes per score (1 or 2) */
uint8_t shift, mdiff, max, size;
|
||||
/* qp: query profile (slen vectors per alphabet symbol); H0/H1/E/Hmax: slen
 * vectors each, all carved out of the single allocation made by ksw_qinit */
__m128i *qp, *H0, *H1, *E, *Hmax;
|
||||
};
|
||||
|
||||
/**
|
||||
* Initialize the query data structure
|
||||
*
|
||||
* @param size Number of bytes used to store a score; valid valures are 1 or 2
|
||||
* @param qlen Length of the query sequence
|
||||
* @param query Query sequence
|
||||
* @param m Size of the alphabet
|
||||
* @param mat Scoring matrix in a one-dimension array
|
||||
*
|
||||
* @return Query data structure
|
||||
*/
|
||||
kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
|
||||
{
|
||||
kswq_t *q;
|
||||
int slen, a, tmp, p;
|
||||
|
||||
size = size > 1? 2 : 1;
|
||||
p = 8 * (3 - size); // # values per __m128i
|
||||
slen = (qlen + p - 1) / p; // segmented length
|
||||
q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
|
||||
q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
|
||||
q->H0 = q->qp + slen * m;
|
||||
q->H1 = q->H0 + slen;
|
||||
q->E = q->H1 + slen;
|
||||
q->Hmax = q->E + slen;
|
||||
q->slen = slen; q->qlen = qlen; q->size = size;
|
||||
// compute shift
|
||||
tmp = m * m;
|
||||
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
|
||||
if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
|
||||
if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
|
||||
}
|
||||
q->max = q->mdiff;
|
||||
q->shift = 256 - q->shift; // NB: q->shift is uint8_t
|
||||
q->mdiff += q->shift; // this is the difference between the min and max scores
|
||||
// An example: p=8, qlen=19, slen=3 and segmentation:
|
||||
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
|
||||
if (size == 1) {
|
||||
int8_t *t = (int8_t*)q->qp;
|
||||
for (a = 0; a < m; ++a) {
|
||||
int i, k, nlen = slen * p;
|
||||
const int8_t *ma = mat + a * m;
|
||||
for (i = 0; i < slen; ++i)
|
||||
for (k = i; k < nlen; k += slen) // p iterations
|
||||
*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
|
||||
}
|
||||
} else {
|
||||
int16_t *t = (int16_t*)q->qp;
|
||||
for (a = 0; a < m; ++a) {
|
||||
int i, k, nlen = slen * p;
|
||||
const int8_t *ma = mat + a * m;
|
||||
for (i = 0; i < slen; ++i)
|
||||
for (k = i; k < nlen; k += slen) // p iterations
|
||||
*t++ = (k >= qlen? 0 : ma[query[k]]);
|
||||
}
|
||||
}
|
||||
return q;
|
||||
}
|
||||
|
||||
#if defined __ARM_NEON
|
||||
// This macro implicitly uses each function's `zero` local variable
|
||||
#define _mm_slli_si128(a, n) (vextq_u8(zero, (a), 16 - (n)))
|
||||
#endif
|
||||
|
||||
/*
 * Local alignment kernel working on 16 unsigned 8-bit scores per vector.
 *
 * q       query profile built by ksw_qinit() (byte-sized scores)
 * tlen    target length; target[] holds alphabet indices
 * _o_del, _e_del, _o_ins, _e_ins   gap open/extension costs
 * xtra    flags plus a 16-bit score: with KSW_XSUBO, rows whose best score
 *         reaches xtra&0xffff are recorded for the second-best search; with
 *         KSW_XSTOP, the scan stops once the best score reaches xtra&0xffff
 *
 * Returns a kswr_t started from g_defr: score is the best score, or 255 when
 * best score + q->shift reached 255; te is the target row of the best score;
 * qe, score2 and te2 are filled only when score != 255.
 */
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
|
||||
{
|
||||
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
||||
uint64_t *b;
|
||||
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
|
||||
kswr_t r;
|
||||
|
||||
// __max_16(ret, xx): horizontal maximum of the 16 unsigned bytes in xx
#if defined __SSE2__
|
||||
#define __max_16(ret, xx) do { \
|
||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
|
||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
|
||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
|
||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
|
||||
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
|
||||
} while (0)
|
||||
|
||||
// Given entries with arbitrary values, return whether they are all 0x00
|
||||
#define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff)
|
||||
|
||||
#elif defined __ARM_NEON
|
||||
#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
|
||||
#define allzero_16(xx) (vmaxvq_u8((xx)) == 0)
|
||||
|
||||
#else
|
||||
#define __max_16(ret, xx) (ret) = m128i_max_u8((xx))
|
||||
#define allzero_16(xx) (m128i_allzero((xx)))
|
||||
#endif
|
||||
|
||||
// initialization
|
||||
r = g_defr;
|
||||
// 0x10000 is above any 16-bit threshold, i.e. the feature is switched off
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
||||
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
|
||||
m_b = n_b = 0; b = 0;
|
||||
zero = _mm_set1_epi32(0);
|
||||
oe_del = _mm_set1_epi8(_o_del + _e_del);
|
||||
e_del = _mm_set1_epi8(_e_del);
|
||||
oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
|
||||
e_ins = _mm_set1_epi8(_e_ins);
|
||||
shift = _mm_set1_epi8(q->shift);
|
||||
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
|
||||
slen = q->slen;
|
||||
for (i = 0; i < slen; ++i) {
|
||||
_mm_store_si128(E + i, zero);
|
||||
_mm_store_si128(H0 + i, zero);
|
||||
_mm_store_si128(Hmax + i, zero);
|
||||
}
|
||||
// the core loop
|
||||
for (i = 0; i < tlen; ++i) {
|
||||
int j, k, imax;
|
||||
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
||||
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
|
||||
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
|
||||
for (j = 0; LIKELY(j < slen); ++j) {
|
||||
/* SW cells are computed in the following order:
|
||||
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
|
||||
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
|
||||
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
|
||||
*/
|
||||
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
|
||||
h = _mm_adds_epu8(h, _mm_load_si128(S + j));
|
||||
h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
|
||||
e = _mm_load_si128(E + j); // e=E'(i,j)
|
||||
h = _mm_max_epu8(h, e);
|
||||
h = _mm_max_epu8(h, f); // h=H'(i,j)
|
||||
max = _mm_max_epu8(max, h); // set max
|
||||
_mm_store_si128(H1 + j, h); // save to H'(i,j)
|
||||
// now compute E'(i+1,j)
|
||||
e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
|
||||
t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
|
||||
e = _mm_max_epu8(e, t); // e=E'(i+1,j)
|
||||
_mm_store_si128(E + j, e); // save to E'(i+1,j)
|
||||
// now compute F'(i,j+1)
|
||||
f = _mm_subs_epu8(f, e_ins);
|
||||
t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
|
||||
f = _mm_max_epu8(f, t);
|
||||
// get H'(i-1,j) and prepare for the next j
|
||||
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
|
||||
}
|
||||
// NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
|
||||
for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
|
||||
f = _mm_slli_si128(f, 1);
|
||||
for (j = 0; LIKELY(j < slen); ++j) {
|
||||
h = _mm_load_si128(H1 + j);
|
||||
h = _mm_max_epu8(h, f); // h=H'(i,j)
|
||||
_mm_store_si128(H1 + j, h);
|
||||
h = _mm_subs_epu8(h, oe_ins);
|
||||
f = _mm_subs_epu8(f, e_ins);
|
||||
// stop as soon as no lane of f exceeds the corresponding lane of h
if (UNLIKELY(allzero_16(_mm_subs_epu8(f, h)))) goto end_loop16;
|
||||
}
|
||||
}
|
||||
end_loop16:
|
||||
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
|
||||
__max_16(imax, max); // imax is the maximum number in max
|
||||
// each b[] entry packs the row score in the high 32 bits and the row index in the low 32 bits
if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
|
||||
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
|
||||
if (n_b == m_b) {
|
||||
m_b = m_b? m_b<<1 : 8;
|
||||
b = (uint64_t*)realloc(b, 8 * m_b);
|
||||
}
|
||||
b[n_b++] = (uint64_t)imax<<32 | i;
|
||||
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
|
||||
}
|
||||
if (imax > gmax) {
|
||||
gmax = imax; te = i; // te is the end position on the target
|
||||
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
|
||||
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
|
||||
// leave the loop when the 8-bit range is exhausted or the stop score is reached
if (gmax + q->shift >= 255 || gmax >= endsc) break;
|
||||
}
|
||||
S = H1; H1 = H0; H0 = S; // swap H0 and H1
|
||||
}
|
||||
r.score = gmax + q->shift < 255? gmax : 255;
|
||||
r.te = te;
|
||||
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
|
||||
int max = -1, tmp, low, high, qlen = slen * 16;
|
||||
uint8_t *t = (uint8_t*)Hmax;
|
||||
// i / 16 + i % 16 * slen maps a byte offset in the striped row back to a query position;
// on ties the smallest query position wins
for (i = 0; i < qlen; ++i, ++t)
|
||||
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
|
||||
else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
|
||||
//printf("%d,%d\n", max, gmax);
|
||||
if (b) {
|
||||
// rows within ceil(score / max matrix score) of te are excluded from the second-best search
i = (r.score + q->max - 1) / q->max;
|
||||
low = te - i; high = te + i;
|
||||
for (i = 0; i < n_b; ++i) {
|
||||
int e = (int32_t)b[i];
|
||||
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
|
||||
r.score2 = b[i]>>32, r.te2 = e;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(b);
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
 * Local alignment kernel working on 8 16-bit scores per vector; same
 * structure as ksw_u8() but without the score bias and without an overflow
 * sentinel in the returned score.
 *
 * q       query profile built by ksw_qinit() (two-byte scores)
 * tlen    target length; target[] holds alphabet indices
 * _o_del, _e_del, _o_ins, _e_ins   gap open/extension costs
 * xtra    flags plus a 16-bit score: with KSW_XSUBO, rows whose best score
 *         reaches xtra&0xffff are recorded for the second-best search; with
 *         KSW_XSTOP, the scan stops once the best score reaches xtra&0xffff
 *
 * Returns a kswr_t started from g_defr with score, te, qe and, when rows
 * were recorded, score2/te2 filled in.
 */
kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
|
||||
{
|
||||
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
||||
uint64_t *b;
|
||||
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
|
||||
kswr_t r;
|
||||
|
||||
// __max_8(ret, xx): horizontal maximum of the 8 signed 16-bit lanes in xx
#if defined __SSE2__
|
||||
#define __max_8(ret, xx) do { \
|
||||
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
|
||||
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
|
||||
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
|
||||
(ret) = _mm_extract_epi16((xx), 0); \
|
||||
} while (0)
|
||||
|
||||
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
|
||||
#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
|
||||
|
||||
#elif defined __ARM_NEON
|
||||
#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
|
||||
#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
|
||||
|
||||
#else
|
||||
#define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
|
||||
#define allzero_0f_8(xx) (m128i_allzero((xx)))
|
||||
#endif
|
||||
|
||||
// initialization
|
||||
r = g_defr;
|
||||
// 0x10000 is above any 16-bit threshold, i.e. the feature is switched off
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
||||
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
|
||||
m_b = n_b = 0; b = 0;
|
||||
zero = _mm_set1_epi32(0);
|
||||
oe_del = _mm_set1_epi16(_o_del + _e_del);
|
||||
e_del = _mm_set1_epi16(_e_del);
|
||||
oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
|
||||
e_ins = _mm_set1_epi16(_e_ins);
|
||||
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
|
||||
slen = q->slen;
|
||||
for (i = 0; i < slen; ++i) {
|
||||
_mm_store_si128(E + i, zero);
|
||||
_mm_store_si128(H0 + i, zero);
|
||||
_mm_store_si128(Hmax + i, zero);
|
||||
}
|
||||
// the core loop
|
||||
for (i = 0; i < tlen; ++i) {
|
||||
int j, k, imax;
|
||||
__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
||||
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
|
||||
// shift by one 16-bit lane (2 bytes)
h = _mm_slli_si128(h, 2);
|
||||
for (j = 0; LIKELY(j < slen); ++j) {
|
||||
h = _mm_adds_epi16(h, _mm_load_si128(S++));
|
||||
e = _mm_load_si128(E + j);
|
||||
h = _mm_max_epi16(h, e);
|
||||
h = _mm_max_epi16(h, f);
|
||||
max = _mm_max_epi16(max, h);
|
||||
_mm_store_si128(H1 + j, h);
|
||||
e = _mm_subs_epu16(e, e_del);
|
||||
t = _mm_subs_epu16(h, oe_del);
|
||||
e = _mm_max_epi16(e, t);
|
||||
_mm_store_si128(E + j, e);
|
||||
f = _mm_subs_epu16(f, e_ins);
|
||||
t = _mm_subs_epu16(h, oe_ins);
|
||||
f = _mm_max_epi16(f, t);
|
||||
h = _mm_load_si128(H0 + j);
|
||||
}
|
||||
// lazy-F pass, same scheme as in ksw_u8()
for (k = 0; LIKELY(k < 16); ++k) {
|
||||
f = _mm_slli_si128(f, 2);
|
||||
for (j = 0; LIKELY(j < slen); ++j) {
|
||||
h = _mm_load_si128(H1 + j);
|
||||
h = _mm_max_epi16(h, f);
|
||||
_mm_store_si128(H1 + j, h);
|
||||
h = _mm_subs_epu16(h, oe_ins);
|
||||
f = _mm_subs_epu16(f, e_ins);
|
||||
// stop as soon as no lane of f is greater than the corresponding lane of h
if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
|
||||
}
|
||||
}
|
||||
end_loop8:
|
||||
__max_8(imax, max);
|
||||
// each b[] entry packs the row score in the high 32 bits and the row index in the low 32 bits
if (imax >= minsc) {
|
||||
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
|
||||
if (n_b == m_b) {
|
||||
m_b = m_b? m_b<<1 : 8;
|
||||
b = (uint64_t*)realloc(b, 8 * m_b);
|
||||
}
|
||||
b[n_b++] = (uint64_t)imax<<32 | i;
|
||||
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
|
||||
}
|
||||
if (imax > gmax) {
|
||||
gmax = imax; te = i;
|
||||
for (j = 0; LIKELY(j < slen); ++j)
|
||||
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
|
||||
if (gmax >= endsc) break;
|
||||
}
|
||||
S = H1; H1 = H0; H0 = S;
|
||||
}
|
||||
r.score = gmax; r.te = te;
|
||||
{
|
||||
int max = -1, tmp, low, high, qlen = slen * 8;
|
||||
uint16_t *t = (uint16_t*)Hmax;
|
||||
// i / 8 + i % 8 * slen maps a lane offset in the striped row back to a query position;
// on ties the smallest query position wins
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
|
||||
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
|
||||
else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
|
||||
if (b) {
|
||||
// rows within ceil(score / max matrix score) of te are excluded from the second-best search
i = (r.score + q->max - 1) / q->max;
|
||||
low = te - i; high = te + i;
|
||||
for (i = 0; i < n_b; ++i) {
|
||||
int e = (int32_t)b[i];
|
||||
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
|
||||
r.score2 = b[i]>>32, r.te2 = e;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(b);
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Reverse the first `l` bytes of `s` in place. Nothing is touched when
 * `l` is smaller than 2. */
static inline void revseq(int l, uint8_t *s)
{
	const int half = l >> 1;
	int front = 0;

	while (front < half) {
		const int back = l - 1 - front;
		const uint8_t held = s[front];
		s[front] = s[back];
		s[back] = held;
		++front;
	}
}
|
||||
|
||||
/*
 * Local alignment driver.
 *
 * Runs the 8-bit or 16-bit kernel on the query profile (built here, or taken
 * from / stored into *qry when qry is non-NULL). When KSW_XSTART is set and
 * the score passes the optional KSW_XSUBO threshold, both sequences are
 * reversed in place up to the end positions, aligned again with KSW_XSTOP to
 * locate the start positions, and then restored.
 *
 * A profile stored in *qry is not freed here; every other profile created
 * by this function is.
 */
kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
{
	kswr_t (*kernel)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
	kswq_t *profile;
	kswr_t fwd, bwd;
	int score_bytes;

	/* reuse a cached profile if the caller supplied one, else build it */
	if (qry && *qry)
		profile = *qry;
	else
		profile = ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
	if (qry && *qry == 0)
		*qry = profile;

	if (profile->size == 2)
		kernel = ksw_i16;
	else
		kernel = ksw_u8;
	score_bytes = profile->size;

	fwd = kernel(profile, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
	if (qry == 0)
		free(profile);

	/* start positions not requested, or score below the reporting threshold */
	if ((xtra&KSW_XSTART) == 0)
		return fwd;
	if ((xtra&KSW_XSUBO) && fwd.score < (xtra&0xffff))
		return fwd;

	/* +1 because qe/te points to the exact end, not the position after the end */
	revseq(fwd.qe + 1, query);
	revseq(fwd.te + 1, target);
	profile = ksw_qinit(score_bytes, fwd.qe + 1, query, m, mat);
	bwd = kernel(profile, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | fwd.score);
	revseq(fwd.qe + 1, query);
	revseq(fwd.te + 1, target);
	free(profile);

	if (fwd.score == bwd.score) {
		fwd.tb = fwd.te - bwd.te;
		fwd.qb = fwd.qe - bwd.qe;
	}
	return fwd;
}
|
||||
|
||||
/* Convenience wrapper around ksw_align2() that uses the same gap open
 * (gapo) and gap extension (gape) cost for deletions and insertions. */
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
|
||||
{
|
||||
return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry);
|
||||
}
|
||||
|
||||
/********************
|
||||
*** SW extension ***
|
||||
********************/
|
||||
|
||||
/* One DP cell of the scalar extension/global routines: h holds an H score and
 * e an E (gap) score; the arrays of these are allocated as calloc(qlen + 1, 8). */
typedef struct {
|
||||
int32_t h, e;
|
||||
} eh_t;
|
||||
|
||||
/*
 * Banded extension alignment starting from an initial score h0 (> 0, asserted).
 *
 * qlen/query, tlen/target   sequences as alphabet indices
 * m, mat                    alphabet size and m*m scoring matrix
 * o_del, e_del, o_ins, e_ins  gap open/extension costs
 * w                         band width (reduced here if larger than useful)
 * end_bonus                 only used when bounding the band width
 * zdrop                     if > 0, stop when the row maximum falls this far below the best score
 *
 * Optional outputs (each may be NULL): *_qle = max_j + 1, *_tle = max_i + 1,
 * *_gtle = max_ie + 1, *_gscore = best score of a row that reached the end of
 * the query (-1 if none), *_max_off = largest |mj - i| seen when the best
 * score improved.
 *
 * Returns the best score found (at least h0).
 */
int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
|
||||
{
|
||||
eh_t *eh; // score array
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
|
||||
assert(h0 > 0);
|
||||
// allocate memory
|
||||
qp = malloc(qlen * m);
|
||||
eh = calloc(qlen + 1, 8);
|
||||
// generate the query profile
|
||||
for (k = i = 0; k < m; ++k) {
|
||||
const int8_t *p = &mat[k * m];
|
||||
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
|
||||
}
|
||||
// fill the first row
|
||||
eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
|
||||
for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
|
||||
eh[j].h = eh[j-1].h - e_ins;
|
||||
// adjust $w if it is too large
|
||||
k = m * m;
|
||||
for (i = 0, max = 0; i < k; ++i) // get the max score
|
||||
max = max > mat[i]? max : mat[i];
|
||||
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
|
||||
max_ins = max_ins > 1? max_ins : 1;
|
||||
w = w < max_ins? w : max_ins;
|
||||
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
|
||||
max_del = max_del > 1? max_del : 1;
|
||||
w = w < max_del? w : max_del; // TODO: is this necessary?
|
||||
// DP loop
|
||||
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
|
||||
max_off = 0;
|
||||
beg = 0, end = qlen;
|
||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||
// NB: this local m (row maximum) shadows the alphabet-size parameter m
int t, f = 0, h1, m = 0, mj = -1;
|
||||
int8_t *q = &qp[target[i] * qlen];
|
||||
// apply the band and the constraint (if provided)
|
||||
if (beg < i - w) beg = i - w;
|
||||
if (end > i + w + 1) end = i + w + 1;
|
||||
if (end > qlen) end = qlen;
|
||||
// compute the first column
|
||||
if (beg == 0) {
|
||||
h1 = h0 - (o_del + e_del * (i + 1));
|
||||
if (h1 < 0) h1 = 0;
|
||||
} else h1 = 0;
|
||||
for (j = beg; LIKELY(j < end); ++j) {
|
||||
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
|
||||
// Similar to SSE2-SW, cells are computed in the following order:
|
||||
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
|
||||
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
|
||||
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
|
||||
eh_t *p = &eh[j];
|
||||
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
|
||||
p->h = h1; // set H(i,j-1) for the next row
|
||||
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M"
|
||||
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
|
||||
h = h > f? h : f;
|
||||
h1 = h; // save H(i,j) to h1 for the next column
|
||||
mj = m > h? mj : j; // record the position where max score is achieved
|
||||
m = m > h? m : h; // m is stored at eh[mj+1]
|
||||
t = M - oe_del;
|
||||
t = t > 0? t : 0;
|
||||
e -= e_del;
|
||||
e = e > t? e : t; // computed E(i+1,j)
|
||||
p->e = e; // save E(i+1,j) for the next row
|
||||
t = M - oe_ins;
|
||||
t = t > 0? t : 0;
|
||||
f -= e_ins;
|
||||
f = f > t? f : t; // computed F(i,j+1)
|
||||
}
|
||||
eh[end].h = h1; eh[end].e = 0;
|
||||
// the row reached the end of the query: track the best such score and its row
if (j == qlen) {
|
||||
max_ie = gscore > h1? max_ie : i;
|
||||
gscore = gscore > h1? gscore : h1;
|
||||
}
|
||||
if (m == 0) break;
|
||||
if (m > max) {
|
||||
max = m, max_i = i, max_j = mj;
|
||||
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
|
||||
} else if (zdrop > 0) {
|
||||
// drop test: the score loss is reduced by the gap-extension cost of the diagonal offset
if (i - max_i > mj - max_j) {
|
||||
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
|
||||
} else {
|
||||
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
|
||||
}
|
||||
}
|
||||
// update beg and end for the next round
|
||||
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j);
|
||||
beg = j;
|
||||
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
|
||||
end = j + 2 < qlen? j + 2 : qlen;
|
||||
//beg = 0; end = qlen; // uncomment this line for debugging
|
||||
}
|
||||
free(eh); free(qp);
|
||||
if (_qle) *_qle = max_j + 1;
|
||||
if (_tle) *_tle = max_i + 1;
|
||||
if (_gtle) *_gtle = max_ie + 1;
|
||||
if (_gscore) *_gscore = gscore;
|
||||
if (_max_off) *_max_off = max_off;
|
||||
return max;
|
||||
}
|
||||
|
||||
/* Convenience wrapper around ksw_extend2() that uses the same gap open
 * (gapo) and gap extension (gape) cost for deletions and insertions. */
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
|
||||
{
|
||||
return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off);
|
||||
}
|
||||
|
||||
/********************
|
||||
* Global alignment *
|
||||
********************/
|
||||
|
||||
#define MINUS_INF -0x40000000
|
||||
|
||||
/* Append `len` operations of type `op` to a CIGAR array whose entries are
 * encoded as len<<4 | op.
 *
 * If the last entry already has the same op, its length is increased;
 * otherwise a new entry is added, growing the array (capacity 4, then
 * doubling) when it is full. *n_cigar and *m_cigar are the used and allocated
 * entry counts and are updated in place.
 *
 * Returns the (possibly reallocated) array pointer. */
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
{
	const int used = *n_cigar;

	/* same operation as the last entry: just extend it */
	if (used != 0 && (uint32_t)op == (cigar[used - 1]&0xf)) {
		cigar[used - 1] += len<<4;
		return cigar;
	}

	if (used == *m_cigar) {
		if (*m_cigar)
			*m_cigar = (*m_cigar)<<1;
		else
			*m_cigar = 4;
		cigar = realloc(cigar, (*m_cigar) << 2); /* 4 bytes per entry */
	}
	cigar[used] = len<<4 | op;
	*n_cigar = used + 1;
	return cigar;
}
|
||||
|
||||
/*
 * Banded global alignment of query against target.
 *
 * qlen/query, tlen/target   sequences as alphabet indices
 * m, mat                    alphabet size and m*m scoring matrix
 * o_del, e_del, o_ins, e_ins  gap open/extension costs
 * w                         band width
 * n_cigar_, cigar_          when both are non-NULL, a backtrack matrix is kept
 *                           and the CIGAR (len<<4 | op, with op 0/1/2 pushed for
 *                           diagonal / query-only / target-only steps) is
 *                           returned through them; the array comes from realloc
 *                           and is not freed here
 *
 * Returns the score stored in the last cell, eh[qlen].h.
 */
int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
|
||||
{
|
||||
eh_t *eh;
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
|
||||
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
|
||||
if (n_cigar_) *n_cigar_ = 0;
|
||||
// allocate memory
|
||||
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
|
||||
z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0;
|
||||
qp = malloc(qlen * m);
|
||||
eh = calloc(qlen + 1, 8);
|
||||
// generate the query profile
|
||||
for (k = i = 0; k < m; ++k) {
|
||||
const int8_t *p = &mat[k * m];
|
||||
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
|
||||
}
|
||||
// fill the first row
|
||||
eh[0].h = 0; eh[0].e = MINUS_INF;
|
||||
for (j = 1; j <= qlen && j <= w; ++j)
|
||||
eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
|
||||
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
|
||||
// DP loop
|
||||
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
|
||||
int32_t f = MINUS_INF, h1, beg, end, t;
|
||||
int8_t *q = &qp[target[i] * qlen];
|
||||
beg = i > w? i - w : 0;
|
||||
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
|
||||
h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
|
||||
// two copies of the inner loop: this one also records backtrack directions
if (n_cigar_ && cigar_) {
|
||||
uint8_t *zi = &z[(long)i * n_col];
|
||||
for (j = beg; LIKELY(j < end); ++j) {
|
||||
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
|
||||
// Cells are computed in the following order:
|
||||
// M(i,j) = H(i-1,j-1) + S(i,j)
|
||||
// H(i,j) = max{M(i,j), E(i,j), F(i,j)}
|
||||
// E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape
|
||||
// F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape
|
||||
// We have to separate M(i,j); otherwise the direction may not be recorded correctly.
|
||||
// However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global().
|
||||
// Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k.
|
||||
// In practice, this should happen very rarely given a reasonable scoring system.
|
||||
eh_t *p = &eh[j];
|
||||
int32_t h, m = p->h, e = p->e;
|
||||
uint8_t d; // direction
|
||||
p->h = h1;
|
||||
m += q[j];
|
||||
d = m >= e? 0 : 1;
|
||||
h = m >= e? m : e;
|
||||
d = h >= f? d : 2;
|
||||
h = h >= f? h : f;
|
||||
h1 = h;
|
||||
t = m - oe_del;
|
||||
e -= e_del;
|
||||
d |= e > t? 1<<2 : 0;
|
||||
e = e > t? e : t;
|
||||
p->e = e;
|
||||
t = m - oe_ins;
|
||||
f -= e_ins;
|
||||
d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
|
||||
f = f > t? f : t;
|
||||
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
|
||||
}
|
||||
} else {
|
||||
// score-only variant of the loop above
for (j = beg; LIKELY(j < end); ++j) {
|
||||
eh_t *p = &eh[j];
|
||||
int32_t h, m = p->h, e = p->e;
|
||||
p->h = h1;
|
||||
m += q[j];
|
||||
h = m >= e? m : e;
|
||||
h = h >= f? h : f;
|
||||
h1 = h;
|
||||
t = m - oe_del;
|
||||
e -= e_del;
|
||||
e = e > t? e : t;
|
||||
p->e = e;
|
||||
t = m - oe_ins;
|
||||
f -= e_ins;
|
||||
f = f > t? f : t;
|
||||
}
|
||||
}
|
||||
eh[end].h = h1; eh[end].e = MINUS_INF;
|
||||
}
|
||||
score = eh[qlen].h;
|
||||
if (n_cigar_ && cigar_) { // backtrack
|
||||
int n_cigar = 0, m_cigar = 0, which = 0;
|
||||
uint32_t *cigar = 0, tmp;
|
||||
i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
|
||||
while (i >= 0 && k >= 0) {
|
||||
// pick the 2-bit field of the cell that belongs to the state we arrived in
which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
|
||||
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
|
||||
else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
|
||||
else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
|
||||
}
|
||||
// whatever is left of either sequence becomes one leading gap
if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
|
||||
if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
|
||||
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
|
||||
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
|
||||
*n_cigar_ = n_cigar, *cigar_ = cigar;
|
||||
}
|
||||
free(eh); free(qp); free(z);
|
||||
return score;
|
||||
}
|
||||
|
||||
/* Convenience wrapper around ksw_global2() that uses the same gap open
 * (gapo) and gap extension (gape) cost for deletions and insertions. */
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
|
||||
{
|
||||
return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
|
||||
}
|
||||
|
||||
/*******************************************
|
||||
* Main function (not compiled by default) *
|
||||
*******************************************/
|
||||
|
||||
#ifdef _KSW_MAIN
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, err_gzread)
|
||||
|
||||
/*
 * ASCII byte -> nucleotide code lookup table.
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3; every other byte -> 4.
 */
unsigned char seq_nt4_table[256] = {
	/* 0x00 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, /* A C G */
	/* 0x50 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, /* T */
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, /* a c g */
	/* 0x70 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, /* t */
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
|
||||
int8_t mat[25];
|
||||
int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
|
||||
uint8_t *rseq = 0;
|
||||
gzFile fpt, fpq;
|
||||
kseq_t *kst, *ksq;
|
||||
|
||||
// parse command line
|
||||
while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
|
||||
switch (c) {
|
||||
case 'a': sa = atoi(optarg); break;
|
||||
case 'b': sb = atoi(optarg); break;
|
||||
case 'q': gapo = atoi(optarg); break;
|
||||
case 'r': gape = atoi(optarg); break;
|
||||
case 't': minsc = atoi(optarg); break;
|
||||
case 'f': forward_only = 1; break;
|
||||
case '1': xtra |= KSW_XBYTE; break;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
|
||||
return 1;
|
||||
}
|
||||
if (minsc > 0xffff) minsc = 0xffff;
|
||||
xtra |= KSW_XSUBO | minsc;
|
||||
// initialize scoring matrix
|
||||
for (i = k = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
mat[k++] = i == j? sa : -sb;
|
||||
mat[k++] = 0; // ambiguous base
|
||||
}
|
||||
for (j = 0; j < 5; ++j) mat[k++] = 0;
|
||||
// open file
|
||||
fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt);
|
||||
fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
|
||||
// all-pair alignment
|
||||
while (kseq_read(ksq) > 0) {
|
||||
kswq_t *q[2] = {0, 0};
|
||||
kswr_t r;
|
||||
for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
|
||||
if (!forward_only) { // reverse
|
||||
if ((int)ksq->seq.m > max_rseq) {
|
||||
max_rseq = ksq->seq.m;
|
||||
rseq = (uint8_t*)realloc(rseq, max_rseq);
|
||||
}
|
||||
for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
|
||||
rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
|
||||
}
|
||||
gzrewind(fpt); kseq_rewind(kst);
|
||||
while (kseq_read(kst) > 0) {
|
||||
for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
|
||||
r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
|
||||
if (r.score >= minsc)
|
||||
err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
|
||||
if (rseq) {
|
||||
r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
|
||||
if (r.score >= minsc)
|
||||
err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
|
||||
}
|
||||
}
|
||||
free(q[0]); free(q[1]);
|
||||
}
|
||||
free(rseq);
|
||||
kseq_destroy(kst); err_gzclose(fpt);
|
||||
kseq_destroy(ksq); err_gzclose(fpq);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
#ifndef __AC_KSW_H
|
||||
#define __AC_KSW_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Flag bits for the xtra argument of ksw_align(); the low 16 bits of xtra carry a score threshold. */
#define KSW_XBYTE  0x10000 /* use an unsigned byte to store a score */
#define KSW_XSTOP  0x20000 /* stop if the maximum score is above (xtra&0xffff) */
#define KSW_XSUBO  0x40000 /* track the 2nd best score if it is higher than (xtra&0xffff) */
#define KSW_XSTART 0x80000 /* find the start positions */
|
||||
|
||||
struct _kswq_t;
|
||||
typedef struct _kswq_t kswq_t;
|
||||
|
||||
/* Result of ksw_align(). */
typedef struct {
	int score;  // best score
	int te;     // target end
	int qe;     // query end
	int score2; // second best score
	int te2;    // ending position of the second best score on the target
	int tb;     // target start
	int qb;     // query start
} kswr_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Aligning two sequences
|
||||
*
|
||||
* @param qlen length of the query sequence (typically <tlen)
|
||||
* @param query query sequence with 0 <= query[i] < m
|
||||
* @param tlen length of the target sequence
|
||||
* @param target target sequence
|
||||
* @param m number of residue types
|
||||
* @param mat m*m scoring matrix in one-dimension array
|
||||
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||
* @param gape gap extension penalty
|
||||
* @param xtra extra information (see below)
|
||||
* @param qry query profile (see below)
|
||||
*
|
||||
* @return alignment information in a struct; unset values are set to -1
|
||||
*
|
||||
* When xtra==0, ksw_align() uses a signed two-byte integer to store a
|
||||
* score and only finds the best score and the end positions. The 2nd best
|
||||
* score or the start positions are not attempted. The default behavior can
|
||||
* be tuned by setting KSW_X* flags:
|
||||
*
|
||||
* KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
|
||||
* kswr_t::score will be set to 255
|
||||
*
|
||||
* KSW_XSUBO: track the 2nd best score and the ending position on the
|
||||
* target if the 2nd best is higher than (xtra&0xffff)
|
||||
*
|
||||
* KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
|
||||
*
|
||||
* KSW_XSTART: find the start positions
|
||||
*
|
||||
* When *qry==NULL, ksw_align() will compute and allocate the query profile
|
||||
* and when the function returns, *qry will point to the profile, which can
|
||||
* be deallocated simply by free(). If one query is aligned against multiple
|
||||
* target sequences, *qry should be set to NULL during the first call and
|
||||
* freed after the last call. Note that qry can equal 0. In this case, the
|
||||
* query profile will be deallocated in ksw_align().
|
||||
*/
|
||||
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
|
||||
kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry);
|
||||
|
||||
/**
|
||||
* Banded global alignment
|
||||
*
|
||||
* @param qlen query length
|
||||
* @param query query sequence with 0 <= query[i] < m
|
||||
* @param tlen target length
|
||||
* @param target target sequence with 0 <= target[i] < m
|
||||
* @param m number of residue types
|
||||
* @param mat m*m scoring matrix in one-dimension array
|
||||
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||
* @param gape gap extension penalty
|
||||
* @param w band width
|
||||
* @param n_cigar (out) number of CIGAR elements
|
||||
* @param cigar (out) BAM-encoded CIGAR; caller needs to deallocate with free()
|
||||
*
|
||||
* @return score of the alignment
|
||||
*/
|
||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
|
||||
int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
|
||||
|
||||
/**
|
||||
* Extend alignment
|
||||
*
|
||||
* The routine aligns $query and $target, assuming their upstream sequences,
|
||||
* which are not provided, have been aligned with score $h0. In return,
|
||||
* region [0,*qle) on the query and [0,*tle) on the target sequences are
|
||||
* aligned together. If *gscore>=0, *gscore keeps the best score such that
|
||||
* the entire query sequence is aligned; *gtle keeps the position on the
|
||||
* target where *gscore is achieved. Returning *gscore and *gtle helps the
|
||||
* caller to decide whether an end-to-end hit or a partial hit is preferred.
|
||||
*
|
||||
* The first 9 parameters are identical to those in ksw_global()
|
||||
*
|
||||
* @param h0 alignment score of upstream sequences
|
||||
* @param _qle (out) length of the query in the alignment
|
||||
* @param _tle (out) length of the target in the alignment
|
||||
* @param _gtle (out) length of the target if query is fully aligned
|
||||
* @param _gscore (out) score of the best end-to-end alignment; negative if not found
|
||||
*
|
||||
* @return best semi-local alignment score
|
||||
*/
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
|
||||
int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
#include <pthread.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
|
||||
/************
|
||||
* kt_for() *
|
||||
************/
|
||||
|
||||
struct kt_for_t;

/* Per-thread state of a kt_for() run. */
typedef struct {
	struct kt_for_t *t; // the job this worker belongs to
	long i;             // next item index this worker will claim; advanced by n_threads per claim
} ktf_worker_t;

/* Shared description of one kt_for() run. */
typedef struct kt_for_t {
	int n_threads;                // number of worker slots in w[]
	long n;                       // items are indexed 0..n-1
	ktf_worker_t *w;              // array of n_threads worker slots
	void (*func)(void*,long,int); // callback: (data, item index, worker index)
	void *data;                   // passed through to func unchanged
} kt_for_t;
|
||||
|
||||
/*
 * Claim one item from the worker slot whose counter is currently the lowest.
 * Returns the claimed item index, or -1 when that counter has already
 * reached t->n.
 */
static inline long steal_work(kt_for_t *t)
{
	int slot, victim = -1;
	long lowest = LONG_MAX, claimed;
	for (slot = 0; slot < t->n_threads; ++slot) {
		if (t->w[slot].i < lowest) {
			lowest = t->w[slot].i;
			victim = slot;
		}
	}
	// atomically take the victim's next item and advance its counter
	claimed = __sync_fetch_and_add(&t->w[victim].i, t->n_threads);
	if (claimed >= t->n) return -1;
	return claimed;
}
|
||||
|
||||
static void *ktf_worker(void *data)
|
||||
{
|
||||
ktf_worker_t *w = (ktf_worker_t*)data;
|
||||
long i;
|
||||
for (;;) {
|
||||
i = __sync_fetch_and_add(&w->i, w->t->n_threads);
|
||||
if (i >= w->t->n) break;
|
||||
w->t->func(w->t->data, i, w - w->t->w);
|
||||
}
|
||||
while ((i = steal_work(w->t)) >= 0)
|
||||
w->t->func(w->t->data, i, w - w->t->w);
|
||||
pthread_exit(0);
|
||||
}
|
||||
|
||||
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
|
||||
{
|
||||
int i;
|
||||
kt_for_t t;
|
||||
pthread_t *tid;
|
||||
t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
|
||||
t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t));
|
||||
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
|
||||
for (i = 0; i < n_threads; ++i)
|
||||
t.w[i].t = &t, t.w[i].i = i;
|
||||
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
|
||||
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
|
||||
}
|
||||
|
||||
/*****************
|
||||
* kt_pipeline() *
|
||||
*****************/
|
||||
|
||||
struct ktp_t;

/* One pipeline worker: a thread carrying one data item through the steps. */
typedef struct {
	struct ktp_t *pl; // owning pipeline
	int64_t index;    // sequence number taken from ktp_t::index; orders the workers
	int step;         // step this worker will run next
	void *data;       // value returned by the previous step, fed into the next one
} ktp_worker_t;

/* Shared pipeline state. */
typedef struct ktp_t {
	void *shared;                     // user data passed to every func call
	void *(*func)(void*, int, void*); // callback: (shared, step, input) -> output
	int64_t index;                    // next sequence number to hand out
	int n_workers, n_steps;
	ktp_worker_t *workers;            // array of n_workers entries
	pthread_mutex_t mutex;            // taken around step/index updates and scans
	pthread_cond_t cv;                // broadcast whenever a worker finishes a step
} ktp_t;
|
||||
|
||||
static void *ktp_worker(void *data)
|
||||
{
|
||||
ktp_worker_t *w = (ktp_worker_t*)data;
|
||||
ktp_t *p = w->pl;
|
||||
while (w->step < p->n_steps) {
|
||||
// test whether we can kick off the job with this worker
|
||||
pthread_mutex_lock(&p->mutex);
|
||||
for (;;) {
|
||||
int i;
|
||||
// test whether another worker is doing the same step
|
||||
for (i = 0; i < p->n_workers; ++i) {
|
||||
if (w == &p->workers[i]) continue; // ignore itself
|
||||
if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
|
||||
break;
|
||||
}
|
||||
if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
|
||||
pthread_cond_wait(&p->cv, &p->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&p->mutex);
|
||||
|
||||
// working on w->step
|
||||
w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
|
||||
|
||||
// update step and let other workers know
|
||||
pthread_mutex_lock(&p->mutex);
|
||||
w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
|
||||
if (w->step == 0) w->index = p->index++;
|
||||
pthread_cond_broadcast(&p->cv);
|
||||
pthread_mutex_unlock(&p->mutex);
|
||||
}
|
||||
pthread_exit(0);
|
||||
}
|
||||
|
||||
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
|
||||
{
|
||||
ktp_t aux;
|
||||
pthread_t *tid;
|
||||
int i;
|
||||
|
||||
if (n_threads < 1) n_threads = 1;
|
||||
aux.n_workers = n_threads;
|
||||
aux.n_steps = n_steps;
|
||||
aux.func = func;
|
||||
aux.shared = shared_data;
|
||||
aux.index = 0;
|
||||
pthread_mutex_init(&aux.mutex, 0);
|
||||
pthread_cond_init(&aux.cv, 0);
|
||||
|
||||
aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
|
||||
for (i = 0; i < n_threads; ++i) {
|
||||
ktp_worker_t *w = &aux.workers[i];
|
||||
w->step = 0; w->pl = &aux; w->data = 0;
|
||||
w->index = aux.index++;
|
||||
}
|
||||
|
||||
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
|
||||
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
|
||||
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
|
||||
|
||||
pthread_mutex_destroy(&aux.mutex);
|
||||
pthread_cond_destroy(&aux.cv);
|
||||
}
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
An example:
|
||||
|
||||
#include "kvec.h"
|
||||
int main() {
|
||||
kvec_t(int) array;
|
||||
kv_init(array);
|
||||
kv_push(int, array, 10); // append
|
||||
kv_a(int, array, 20) = 5; // dynamic
|
||||
kv_A(array, 20) = 4; // static
|
||||
kv_destroy(array);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
2008-09-22 (0.1.0):
|
||||
|
||||
* The initial version.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef AC_KVEC_H
|
||||
#define AC_KVEC_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
|
||||
#define kvec_t(type) struct { size_t n, m; type *a; }
|
||||
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
|
||||
#define kv_destroy(v) free((v).a)
|
||||
#define kv_A(v, i) ((v).a[(i)])
|
||||
#define kv_pop(v) ((v).a[--(v).n])
|
||||
#define kv_size(v) ((v).n)
|
||||
#define kv_max(v) ((v).m)
|
||||
|
||||
#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
|
||||
|
||||
#define kv_copy(type, v1, v0) do { \
|
||||
if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
|
||||
(v1).n = (v0).n; \
|
||||
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
|
||||
} while (0) \
|
||||
|
||||
#define kv_push(type, v, x) do { \
|
||||
if ((v).n == (v).m) { \
|
||||
(v).m = (v).m? (v).m<<1 : 2; \
|
||||
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
|
||||
} \
|
||||
(v).a[(v).n++] = (x); \
|
||||
} while (0)
|
||||
|
||||
#define kv_pushp(type, v) ((((v).n == (v).m)? \
|
||||
((v).m = ((v).m? (v).m<<1 : 2), \
|
||||
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
|
||||
: 0), &(v).a[(v).n++])
|
||||
|
||||
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
|
||||
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
|
||||
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
|
||||
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
|
||||
: 0), (v).a[(i)])
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "kstring.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifndef PACKAGE_VERSION
|
||||
#define PACKAGE_VERSION "0.7.17-r1198-dirty"
|
||||
#endif
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
int bwa_pac2bwt(int argc, char *argv[]);
|
||||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
int bwa_index(int argc, char *argv[]);
|
||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||
|
||||
int bwa_aln(int argc, char *argv[]);
|
||||
int bwa_sai2sam_se(int argc, char *argv[]);
|
||||
int bwa_sai2sam_pe(int argc, char *argv[]);
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[]);
|
||||
|
||||
int main_fastmap(int argc, char *argv[]);
|
||||
int main_mem(int argc, char *argv[]);
|
||||
int main_shm(int argc, char *argv[]);
|
||||
|
||||
int main_pemerge(int argc, char *argv[]);
|
||||
int main_maxk(int argc, char *argv[]);
|
||||
|
||||
static int usage()
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n");
|
||||
fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
|
||||
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
|
||||
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
|
||||
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
|
||||
fprintf(stderr, " aln gapped/ungapped alignment\n");
|
||||
fprintf(stderr, " samse generate alignment (single ended)\n");
|
||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||
fprintf(stderr, " bwasw BWA-SW for long queries (DEPRECATED)\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " shm manage indices in shared memory\n");
|
||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr,
|
||||
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
|
||||
" There are three alignment algorithms in BWA: `mem', `bwasw', and\n"
|
||||
" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n"
|
||||
" first. Please `man ./bwa.1' for the manual.\n\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
extern char *bwa_pg;
|
||||
int i, ret;
|
||||
double t_real;
|
||||
kstring_t pg = {0,0,0};
|
||||
t_real = realtime();
|
||||
ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
|
||||
for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
|
||||
bwa_pg = pg.s;
|
||||
if (argc < 2) return usage();
|
||||
if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1);
|
||||
else {
|
||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
err_fflush(stdout);
|
||||
err_fclose(stdout);
|
||||
if (ret == 0) {
|
||||
fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION);
|
||||
fprintf(stderr, "[%s] CMD:", __func__);
|
||||
for (i = 0; i < argc; ++i)
|
||||
fprintf(stderr, " %s", argv[i]);
|
||||
fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime());
|
||||
}
|
||||
free(bwa_pg);
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
/* Don't wrap ourselves */
|
||||
# undef USE_MALLOC_WRAPPERS
|
||||
#endif
|
||||
#include "malloc_wrap.h"
|
||||
|
||||
/**
 * calloc() that terminates the program on allocation failure.
 *
 * @param nmemb, size  forwarded to calloc()
 * @param file, line, func  call site, used only in the error message
 * @return the pointer returned by calloc(). It may be NULL when nmemb or
 *         size is zero: the C standard allows that result for zero-sized
 *         requests, so it is not treated as out-of-memory.
 */
void *wrap_calloc(size_t nmemb, size_t size,
				  const char *file, unsigned int line, const char *func) {
	void *p = calloc(nmemb, size);
	if (NULL == p && nmemb != 0 && size != 0) {
		fprintf(stderr,
				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
				func, nmemb * size, file, line, strerror(errno));
		exit(EXIT_FAILURE);
	}
	return p;
}
|
||||
|
||||
/**
 * malloc() that terminates the program on allocation failure.
 *
 * @param size  forwarded to malloc()
 * @param file, line, func  call site, used only in the error message
 * @return the pointer returned by malloc(). It may be NULL when size is
 *         zero: the C standard allows that result for zero-sized requests,
 *         so it is not treated as out-of-memory.
 */
void *wrap_malloc(size_t size,
				  const char *file, unsigned int line, const char *func) {
	void *p = malloc(size);
	if (NULL == p && size != 0) {
		fprintf(stderr,
				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
				func, size, file, line, strerror(errno));
		exit(EXIT_FAILURE);
	}
	return p;
}
|
||||
|
||||
/**
 * realloc() that terminates the program on allocation failure.
 *
 * @param ptr, size  forwarded to realloc()
 * @param file, line, func  call site, used only in the error message
 * @return the pointer returned by realloc(). With size == 0 some C
 *         libraries free ptr and return NULL; that is not an allocation
 *         failure and is passed back to the caller instead of aborting.
 */
void *wrap_realloc(void *ptr, size_t size,
				   const char *file, unsigned int line, const char *func) {
	void *p = realloc(ptr, size);
	if (NULL == p && size != 0) {
		fprintf(stderr,
				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
				func, size, file, line, strerror(errno));
		exit(EXIT_FAILURE);
	}
	return p;
}
|
||||
|
||||
/**
 * Duplicate a string, terminating the program on allocation failure.
 *
 * @param s  NUL-terminated string to copy (must not be NULL)
 * @param file, line, func  call site, used only in the error message
 * @return a newly allocated copy of s; release it with free()
 *
 * Implemented with malloc()+memcpy() rather than strdup(), so it builds
 * without POSIX feature macros, and the size in the error message now
 * includes the terminating NUL.
 */
char *wrap_strdup(const char *s,
				  const char *file, unsigned int line, const char *func) {
	size_t n_bytes = strlen(s) + 1; /* including the terminator */
	char *p = malloc(n_bytes);
	if (NULL == p) {
		fprintf(stderr,
				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
				func, n_bytes, file, line, strerror(errno));
		exit(EXIT_FAILURE);
	}
	memcpy(p, s, n_bytes);
	return p;
}
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
#ifndef MALLOC_WRAP_H
|
||||
#define MALLOC_WRAP_H
|
||||
|
||||
#include <stdlib.h> /* Avoid breaking the usual definitions */
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void *wrap_calloc(size_t nmemb, size_t size,
|
||||
const char *file, unsigned int line, const char *func);
|
||||
void *wrap_malloc(size_t size,
|
||||
const char *file, unsigned int line, const char *func);
|
||||
void *wrap_realloc(void *ptr, size_t size,
|
||||
const char *file, unsigned int line, const char *func);
|
||||
char *wrap_strdup(const char *s,
|
||||
const char *file, unsigned int line, const char *func);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS

/*
 * Route the standard allocation calls through the wrap_* functions so that
 * each call site passes its file, line and function name. Any earlier macro
 * of the same name is dropped first (#undef of an undefined name is a no-op).
 */
#  undef calloc
#  define calloc(n, s)  wrap_calloc( (n), (s), __FILE__, __LINE__, __func__)

#  undef malloc
#  define malloc(s)     wrap_malloc( (s),      __FILE__, __LINE__, __func__)

#  undef realloc
#  define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__)

#  undef strdup
#  define strdup(s)     wrap_strdup( (s),      __FILE__, __LINE__, __func__)

#endif /* USE_MALLOC_WRAPPERS */
|
||||
|
||||
#endif /* MALLOC_WRAP_H */
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
#include <zlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include "bwa.h"
|
||||
#include "bwamem.h"
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
int main_maxk(int argc, char *argv[])
|
||||
{
|
||||
int i, c, self = 0, max_len = 0;
|
||||
uint8_t *cnt = 0;
|
||||
uint64_t hist[256];
|
||||
bwt_t *bwt;
|
||||
kseq_t *ks;
|
||||
smem_i *itr;
|
||||
gzFile fp;
|
||||
|
||||
while ((c = getopt(argc, argv, "s")) >= 0) {
|
||||
if (c == 's') self = 1;
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa maxk [-s] <index.prefix> <seq.fa>\n");
|
||||
return 1;
|
||||
}
|
||||
fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb");
|
||||
ks = kseq_init(fp);
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
itr = smem_itr_init(bwt);
|
||||
if (self) smem_config(itr, 2, INT_MAX, 0);
|
||||
memset(hist, 0, 8 * 256);
|
||||
|
||||
while (kseq_read(ks) >= 0) {
|
||||
const bwtintv_v *a;
|
||||
if (ks->seq.l > max_len) {
|
||||
max_len = ks->seq.l;
|
||||
kroundup32(max_len);
|
||||
cnt = realloc(cnt, max_len);
|
||||
}
|
||||
memset(cnt, 0, ks->seq.l);
|
||||
for (i = 0; i < ks->seq.l; ++i)
|
||||
ks->seq.s[i] = nst_nt4_table[(int)ks->seq.s[i]];
|
||||
smem_set_query(itr, ks->seq.l, (uint8_t*)ks->seq.s);
|
||||
while ((a = smem_next(itr)) != 0) {
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
bwtintv_t *p = &a->a[i];
|
||||
int j, l, start = p->info>>32, end = (uint32_t)p->info;
|
||||
l = end - start < 255? end - start : 255;
|
||||
for (j = start; j < end; ++j)
|
||||
cnt[j] = cnt[j] > l? cnt[j] : l;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < ks->seq.l; ++i) ++hist[cnt[i]];
|
||||
}
|
||||
for (i = 0; i < 256; ++i)
|
||||
printf("%d\t%lld\n", i, (long long)hist[i]);
|
||||
free(cnt);
|
||||
|
||||
smem_itr_destroy(itr);
|
||||
bwt_destroy(bwt);
|
||||
kseq_destroy(ks);
|
||||
gzclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
#ifndef NEON_SSE_H
|
||||
#define NEON_SSE_H
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
typedef uint8x16_t __m128i;
|
||||
|
||||
static inline __m128i _mm_load_si128(const __m128i *ptr) { return vld1q_u8((const uint8_t *) ptr); }
|
||||
static inline __m128i _mm_set1_epi32(int n) { return vreinterpretq_u8_s32(vdupq_n_s32(n)); }
|
||||
static inline void _mm_store_si128(__m128i *ptr, __m128i a) { vst1q_u8((uint8_t *) ptr, a); }
|
||||
|
||||
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vqaddq_u8(a, b); }
|
||||
static inline __m128i _mm_max_epu8(__m128i a, __m128i b) { return vmaxq_u8(a, b); }
|
||||
static inline __m128i _mm_set1_epi8(int8_t n) { return vreinterpretq_u8_s8(vdupq_n_s8(n)); }
|
||||
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vqsubq_u8(a, b); }
|
||||
|
||||
#define M128I(a) vreinterpretq_u8_s16((a))
|
||||
#define UM128I(a) vreinterpretq_u8_u16((a))
|
||||
#define S16(a) vreinterpretq_s16_u8((a))
|
||||
#define U16(a) vreinterpretq_u16_u8((a))
|
||||
|
||||
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) { return M128I(vqaddq_s16(S16(a), S16(b))); }
|
||||
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return UM128I(vcgtq_s16(S16(a), S16(b))); }
|
||||
static inline __m128i _mm_max_epi16(__m128i a, __m128i b) { return M128I(vmaxq_s16(S16(a), S16(b))); }
|
||||
static inline __m128i _mm_set1_epi16(int16_t n) { return vreinterpretq_u8_s16(vdupq_n_s16(n)); }
|
||||
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) { return UM128I(vqsubq_u16(U16(a), U16(b))); }
|
||||
|
||||
#undef M128I
|
||||
#undef UM128I
|
||||
#undef S16
|
||||
#undef U16
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,291 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <pthread.h>
|
||||
#include <errno.h>
|
||||
#include "ksw.h"
|
||||
#include "kseq.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
#include "utils.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
#endif
|
||||
|
||||
/* Ratio of 2nd-best to best score at or above which a pair is rejected. */
#define MAX_SCORE_RATIO 0.9f
/* Highest index of err_msg[]. */
#define MAX_ERR 8

/*
 * Description of each outcome category. Index 0 is success; indices 1..5
 * match the -1..-5 return codes set in bwa_pemerge().
 */
static const char *err_msg[MAX_ERR+1] = {
	[0] = "successful merges",
	[1] = "low-scoring pairs",
	[2] = "pairs where the best SW alignment is not an overlap (long left end)",
	[3] = "pairs where the best SW alignment is not an overlap (long right end)",
	[4] = "pairs with large 2nd best SW score",
	[5] = "pairs with gapped overlap",
	[6] = "pairs where the end-to-end alignment is inconsistent with SW",
	[7] = "pairs potentially with tandem overlaps",
	[8] = "pairs with high sum of errors"
};
|
||||
|
||||
/* Options of the pemerge command; defaults are filled in by pem_opt_init(). */
typedef struct {
	int a;          // passed to bwa_fill_scmat() as its first argument
	int b;          // passed to bwa_fill_scmat() as its second argument
	int q;          // passed to ksw_align() as gapo
	int r;          // passed to ksw_align() as gape
	int w;
	int q_def;      // base quality assumed when a read carries no quality string
	int q_thres;
	int T;          // alignments scoring below T are rejected as poor
	int chunk_size;
	int n_threads;
	int flag;       // bit 1: print merged; 2: print unmerged
	int8_t mat[25]; // 5x5 scoring matrix filled by bwa_fill_scmat()
} pem_opt_t;
|
||||
|
||||
pem_opt_t *pem_opt_init()
|
||||
{
|
||||
pem_opt_t *opt;
|
||||
opt = calloc(1, sizeof(pem_opt_t));
|
||||
opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20;
|
||||
opt->T = opt->a * 10;
|
||||
opt->q_def = 20;
|
||||
opt->q_thres = 70;
|
||||
opt->chunk_size = 10000000;
|
||||
opt->n_threads = 1;
|
||||
opt->flag = 3;
|
||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
return opt;
|
||||
}
|
||||
|
||||
/*
 * Try to merge the two reads of a pair into a single sequence.
 *
 * x[0] is taken as is and x[1] is reverse-complemented; the two are aligned
 * with ksw_align() and then re-checked with an ungapped end-to-end scan.
 * On success (return 0) x[0] is replaced by the merged read (freshly allocated
 * sequence and qualities) and x[1] is freed and zeroed. On failure the reads
 * are left untouched and a code in -1..-8 is returned; -code indexes err_msg[].
 */
int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2])
{
	uint8_t *s[2], *q[2], *seq, *qual;
	int i, xtra, l, l_seq, sum_q, ret = 0;
	kswr_t r;

	s[0] = malloc(x[0].l_seq);
	q[0] = malloc(x[0].l_seq);
	s[1] = malloc(x[1].l_seq);
	q[1] = malloc(x[1].l_seq);

	// read 1: forward strand, bases converted to the 0..4 code
	for (i = 0; i < x[0].l_seq; ++i) {
		int c = x[0].seq[i];
		if (c < 0 || c > 127) s[0][i] = 4;
		else if (c <= 4) s[0][i] = c;
		else s[0][i] = nst_nt4_table[c];
		if (x[0].qual) q[0][i] = x[0].qual[i] - 33;
		else q[0][i] = opt->q_def;
	}
	// read 2: walked backwards and complemented
	for (i = 0; i < x[1].l_seq; ++i) {
		int c = x[1].seq[x[1].l_seq - 1 - i];
		if (c < 0 || c > 127) c = 4;
		else if (c >= 4) c = nst_nt4_table[c];
		if (c < 4) s[1][i] = 3 - c;
		else s[1][i] = 4;
		if (x[1].qual) q[1][i] = x[1].qual[x[1].l_seq - 1 - i] - 33;
		else q[1][i] = opt->q_def;
	}

	xtra = KSW_XSTART | KSW_XSUBO;
	r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0);
	// change to the half-close-half-open coordinates
	++r.qe;
	++r.te;

	if (r.score < opt->T) { // poor alignment
		ret = -1;
		goto pem_ret;
	}
	if (r.tb < r.qb) { // no enough space for the left end
		ret = -2;
		goto pem_ret;
	}
	if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { // no enough space for the right end
		ret = -3;
		goto pem_ret;
	}
	if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { // the second best score is too large
		ret = -4;
		goto pem_ret;
	}
	if (r.qe - r.qb != r.te - r.tb) { // we do not allow gaps
		ret = -5;
		goto pem_ret;
	}

	{ // test tandem match; O(n^2)
		int max_m = 0, max_m2 = 0, max_l = 0, max_l2 = 0, min_l;
		min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq;
		// score every ungapped overlap of length l between the tail of read 1 and the head of read 2
		for (l = 1; l < min_l; ++l) {
			int m = 0, o = x[0].l_seq - l;
			for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck!
				m += opt->mat[s[1][i] * 5 + s[0][o + i]];
			if (m > max_m) {
				max_m2 = max_m;
				max_m = m;
				max_l2 = max_l;
				max_l = l;
			} else if (m > max_m2) {
				max_m2 = m;
				max_l2 = l;
			}
		}
		// the best end-to-end overlap must be good enough and agree with the SW result
		if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) {
			ret = -6;
			goto pem_ret;
		}
		// a shorter runner-up gets credit for the bases it does not cover
		if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) {
			ret = -7;
			goto pem_ret;
		}
		if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) {
			ret = -7;
			goto pem_ret;
		}
	}

	l = x[0].l_seq - (r.tb - r.qb); // length to merge
	l_seq = x[0].l_seq + x[1].l_seq - l;
	seq = malloc(l_seq + 1);
	qual = malloc(l_seq + 1);
	// start from read 1 followed by the part of read 2 beyond the overlap
	memcpy(seq, s[0], x[0].l_seq);
	memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l);
	memcpy(qual, q[0], x[0].l_seq);
	memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l);
	// reconcile the overlapping bases
	sum_q = 0;
	for (i = 0; i < l; ++i) {
		int k = x[0].l_seq - l + i;
		if (s[0][k] == 4) { // ambiguous
			seq[k] = s[1][i];
			qual[k] = q[1][i];
		} else if (s[1][i] == 4) {
			// do nothing
		} else if (s[0][k] == s[1][i]) {
			if (q[1][i] >= qual[k]) qual[k] = q[1][i];
		} else { // s[0][k] != s[1][i] and neither is N
			int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i];
			if (qq >= 3) sum_q += qq<<1;
			else sum_q += 1;
			seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i];
			qual[k] = abs((int)q[0][k] - (int)q[1][i]);
		}
	}
	if (sum_q>>1 > opt->q_thres) { // too many mismatches
		free(seq);
		free(qual);
		ret = -8;
		goto pem_ret;
	}

	// back to printable bases and Phred+33 qualities
	for (i = 0; i < l_seq; ++i) {
		seq[i] = "ACGTN"[(int)seq[i]];
		qual[i] += 33;
	}
	seq[l_seq] = qual[l_seq] = 0;

	// read 2 disappears; read 1 takes over the merged buffers
	free(x[1].name);
	free(x[1].seq);
	free(x[1].qual);
	free(x[1].comment);
	memset(&x[1], 0, sizeof(bseq1_t));
	free(x[0].seq);
	free(x[0].qual);
	x[0].l_seq = l_seq;
	x[0].seq = (char*)seq;
	x[0].qual = (char*)qual;

pem_ret:
	free(s[0]);
	free(s[1]);
	free(q[0]);
	free(q[1]);
	return ret;
}
|
||||
|
||||
static inline void print_bseq(const bseq1_t *s, int rn)
|
||||
{
|
||||
err_putchar(s->qual? '@' : '>');
|
||||
err_fputs(s->name, stdout);
|
||||
if (rn == 1 || rn == 2) {
|
||||
err_putchar('/'); err_putchar('0' + rn); err_putchar('\n');
|
||||
} else err_puts(" merged");
|
||||
err_puts(s->seq);
|
||||
if (s->qual) {
|
||||
err_puts("+"); err_puts(s->qual);
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
int n, start;
|
||||
bseq1_t *seqs;
|
||||
int64_t cnt[MAX_ERR+1];
|
||||
const pem_opt_t *opt;
|
||||
} worker_t;
|
||||
|
||||
void *worker(void *data)
|
||||
{
|
||||
worker_t *w = (worker_t*)data;
|
||||
int i;
|
||||
for (i = w->start; i < w->n>>1; i += w->opt->n_threads)
|
||||
++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])];
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Merge one batch of reads, print the results and release the reads.
 *
 * Only the first n_ rounded down to an even number of reads are processed.
 * Per-thread outcome counts are added to cnt[]. The name/seq/qual/comment
 * strings of the processed reads are freed; the seqs array itself is not.
 */
static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1])
{
	const int n = (n_ >> 1) << 1;
	int t, k;
	worker_t *jobs;

	jobs = calloc(opt->n_threads, sizeof(worker_t));
	for (t = 0; t < opt->n_threads; ++t) {
		jobs[t].start = t;
		jobs[t].n = n;
		jobs[t].opt = opt;
		jobs[t].seqs = seqs;
	}

	if (opt->n_threads != 1) {
		pthread_t *tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
		for (t = 0; t < opt->n_threads; ++t)
			pthread_create(&tid[t], 0, worker, &jobs[t]);
		for (t = 0; t < opt->n_threads; ++t)
			pthread_join(tid[t], 0);
		free(tid);
	} else {
		worker(jobs); // single-threaded: run in the calling thread
	}

	// fold the per-thread tallies into the caller's totals
	for (t = 0; t < opt->n_threads; ++t)
		for (k = 0; k <= MAX_ERR; ++k)
			cnt[k] += jobs[t].cnt[k];
	free(jobs);

	// a pair whose second read was emptied has been merged into the first
	for (k = 0; k < n>>1; ++k) {
		bseq1_t *first = &seqs[k<<1|0], *second = &seqs[k<<1|1];
		if (second->l_seq != 0) {
			if (opt->flag&2) {
				print_bseq(first, 1);
				print_bseq(second, 2);
			}
		} else if (opt->flag&1) {
			print_bseq(first, 0);
		}
	}

	for (k = 0; k < n; ++k) {
		free(seqs[k].name);
		free(seqs[k].seq);
		free(seqs[k].qual);
		free(seqs[k].comment);
	}
}
|
||||
|
||||
int main_pemerge(int argc, char *argv[])
|
||||
{
|
||||
int c, flag = 0, i, n, min_ovlp = 10;
|
||||
int64_t cnt[MAX_ERR+1];
|
||||
bseq1_t *bseq;
|
||||
gzFile fp, fp2 = 0;
|
||||
kseq_t *ks, *ks2 = 0;
|
||||
pem_opt_t *opt;
|
||||
|
||||
opt = pem_opt_init();
|
||||
while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) {
|
||||
if (c == 'm') flag |= 1;
|
||||
else if (c == 'u') flag |= 2;
|
||||
else if (c == 'Q') opt->q_thres = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg);
|
||||
else if (c == 'T') min_ovlp = atoi(optarg);
|
||||
else return 1;
|
||||
}
|
||||
if (flag == 0) flag = 3;
|
||||
opt->flag = flag;
|
||||
opt->T = opt->a * min_ovlp;
|
||||
|
||||
if (optind == argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
|
||||
fprintf(stderr, "Options: -m output merged reads only\n");
|
||||
fprintf(stderr, " -u output unmerged reads only\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp);
|
||||
fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres);
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||
strcmp(argv[optind], "-") ? argv[optind] : "stdin",
|
||||
errno ? strerror(errno) : "Out of memory");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ks = kseq_init(fp);
|
||||
if (optind + 1 < argc) {
|
||||
fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||
strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin",
|
||||
errno ? strerror(errno) : "Out of memory");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ks2 = kseq_init(fp2);
|
||||
}
|
||||
|
||||
memset(cnt, 0, 8 * (MAX_ERR+1));
|
||||
while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
|
||||
process_seqs(opt, n, bseq, cnt);
|
||||
free(bseq);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
|
||||
for (i = 1; i <= MAX_ERR; ++i)
|
||||
fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]);
|
||||
kseq_destroy(ks);
|
||||
err_gzclose(fp);
|
||||
if (ks2) {
|
||||
kseq_destroy(ks2);
|
||||
err_gzclose(fp2);
|
||||
}
|
||||
free(opt);
|
||||
|
||||
err_fflush(stdout);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
#!/usr/bin/env perl

# Combine a FASTA file and its companion quality file (whitespace-separated
# decimal scores) into FASTQ on standard output. Qualities are written as
# Phred+33 characters, wrapped at 60 per line.

use strict;
use warnings;

die("Usage: qualfa2fq.pl <in.fasta> <in.qual>\n") if (@ARGV != 2);

# Open an input for reading. Names ending in .gz are piped through
# "gzip -dc" using the list form of open, which bypasses the shell, so names
# with spaces or shell metacharacters reach gzip verbatim. "-" keeps meaning
# standard input, as it did with the two-argument open.
sub open_input {
	my ($path) = @_;
	return \*STDIN if ($path eq "-");
	my $fh;
	if ($path =~ /\.gz$/) {
		open($fh, "-|", "gzip", "-dc", $path) || die("Failed to run gzip on '$path': $!\n");
	} else {
		open($fh, "<", $path) || die("Failed to open '$path': $!\n");
	}
	return $fh;
}

my $fhs = open_input($ARGV[0]);
my $fhq = open_input($ARGV[1]);
my $q;

# Skip everything up to and including the first ">" in both files.
$/ = ">"; <$fhs>; <$fhq>; $/ = "\n";
while (<$fhs>) {
	# $_ holds the FASTA header line; the matching header in the quality file is discarded.
	$q = <$fhq>;
	print "\@$_";
	# Read the record bodies up to the next ">".
	$/ = ">";
	$_ = <$fhs>; $q = <$fhq>;
	chomp; chomp($q);
	# Turn every decimal score into its Phred+33 character.
	$q =~ s/\s*(\d+)\s*/chr($1+33)/eg;
	print $_, "+\n";
	for (my $i = 0; $i < length($q); $i += 60) {
		print substr($q, $i, 60), "\n";
	}
	$/ = "\n";
}

close($fhs); close($fhq);
|
||||
|
|
@ -0,0 +1,191 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "rle.h"
|
||||
|
||||
// Lookup table indexed by bits 3..5 of the leading byte of a multi-byte run.
// The backward scanners in this file use only the high nibble: the length bits stored in that leading byte.
// NOTE(review): the low nibble (1, 3 or 7) looks like the number of continuation bytes that follow; it is not read in the code visible here -- confirm.
const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
|
||||
|
||||
// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
|
||||
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
|
||||
{
|
||||
uint16_t *nptr = (uint16_t*)block;
|
||||
int diff;
|
||||
|
||||
block += 2; // skip the first 2 counting bytes
|
||||
if (*nptr == 0) {
|
||||
memset(cnt, 0, 48);
|
||||
diff = rle_enc1(block, a, rl);
|
||||
} else {
|
||||
uint8_t *p, *end = block + *nptr, *q;
|
||||
int64_t pre, z, l = 0, tot, beg_l;
|
||||
int c = -1, n_bytes = 0, n_bytes2, t = 0;
|
||||
uint8_t tmp[24];
|
||||
beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
|
||||
tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
|
||||
if (x < beg_l) {
|
||||
beg_l = 0, *beg = 0;
|
||||
memset(bc, 0, 48);
|
||||
}
|
||||
if (x == beg_l) {
|
||||
p = q = block + (*beg); z = beg_l;
|
||||
memcpy(cnt, bc, 48);
|
||||
} else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
|
||||
z = beg_l; p = block + (*beg);
|
||||
memcpy(cnt, bc, 48);
|
||||
while (z < x) {
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
for (q = p - 1; *q>>6 == 2; --q);
|
||||
} else { // backward
|
||||
memcpy(cnt, ec, 48);
|
||||
z = tot; p = end;
|
||||
while (z >= x) {
|
||||
--p;
|
||||
if (*p>>6 != 2) {
|
||||
l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3;
|
||||
z -= l; cnt[*p&7] -= l;
|
||||
l = 0; t = 0;
|
||||
} else {
|
||||
l |= (*p&0x3fL) << t;
|
||||
t += 6;
|
||||
}
|
||||
}
|
||||
q = p;
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
*beg = q - block;
|
||||
memcpy(bc, cnt, 48);
|
||||
bc[c] -= l;
|
||||
n_bytes = p - q;
|
||||
if (x == z && a != c && p < end) { // then try the next run
|
||||
int tc;
|
||||
int64_t tl;
|
||||
q = p;
|
||||
rle_dec1(q, tc, tl);
|
||||
if (a == tc)
|
||||
c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
|
||||
}
|
||||
if (z != x) cnt[c] -= z - x;
|
||||
pre = x - (z - l); p -= n_bytes;
|
||||
if (a == c) { // insert to the same run
|
||||
n_bytes2 = rle_enc1(tmp, c, l + rl);
|
||||
} else if (x == z) { // at the end; append to the existing run
|
||||
p += n_bytes; n_bytes = 0;
|
||||
n_bytes2 = rle_enc1(tmp, a, rl);
|
||||
} else { // break the current run
|
||||
n_bytes2 = rle_enc1(tmp, c, pre);
|
||||
n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
|
||||
n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
|
||||
}
|
||||
if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
|
||||
memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
|
||||
memcpy(p, tmp, n_bytes2);
|
||||
diff = n_bytes2 - n_bytes;
|
||||
}
|
||||
return (*nptr += diff);
|
||||
}
|
||||
|
||||
// Uncached variant of rle_insert_cached(): always scans from the start of the block.
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
{
	int64_t start_cnt[6];
	int start_off = 0;
	memset(start_cnt, 0, 48);
	return rle_insert_cached(block, x, a, rl, cnt, ec, &start_off, start_cnt);
}
|
||||
|
||||
// Move roughly the second half of the runs in $block to $new_block. The cut
// point is moved back to the nearest run boundary and both byte counts are updated.
void rle_split(uint8_t *block, uint8_t *new_block)
{
	const int n_bytes = *(uint16_t*)block;
	uint8_t *data = block + 2;
	uint8_t *end = data + n_bytes;
	uint8_t *cut = data + (n_bytes>>1);
	while (*cut>>6 == 2) // continuation byte: not a run boundary
		--cut;
	memcpy(new_block + 2, cut, end - cut);
	*(uint16_t*)new_block = end - cut;
	*(uint16_t*)block = cut - data;
}
|
||||
|
||||
// Add the length of every run in $block to the matching entry of $cnt.
// $cnt is accumulated into, not cleared.
void rle_count(const uint8_t *block, int64_t cnt[6])
{
	const uint8_t *cur = block + 2;
	const uint8_t *stop = cur + *(uint16_t*)block;
	int sym;
	int64_t len;
	while (cur < stop) {
		rle_dec1(cur, sym, len);
		cnt[sym] += len;
	}
}
|
||||
|
||||
// Print the block to stdout followed by a newline: every symbol spelled out
// when $expand is non-zero, otherwise each run as symbol plus length.
void rle_print(const uint8_t *block, int expand)
{
	const uint8_t *cur = block + 2;
	const uint8_t *stop = block + 2 + *(const uint16_t*)block;
	while (cur < stop) {
		int sym;
		int64_t len, k;
		rle_dec1(cur, sym, len);
		if (!expand) {
			printf("%c%ld", "$ACGTN"[sym], (long)len);
		} else {
			for (k = 0; k < len; ++k)
				putchar("$ACGTN"[sym]);
		}
	}
	putchar('\n');
}
|
||||
|
||||
void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6])
|
||||
{
|
||||
int a;
|
||||
int64_t tot, cnt[6];
|
||||
const uint8_t *p;
|
||||
|
||||
y = y >= x? y : x;
|
||||
tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
|
||||
if (tot == 0) return;
|
||||
if (x <= (tot - y) + (tot>>3)) {
|
||||
int c = 0;
|
||||
int64_t l, z = 0;
|
||||
memset(cnt, 0, 48);
|
||||
p = block + 2;
|
||||
while (z < x) {
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
for (a = 0; a != 6; ++a) cx[a] += cnt[a];
|
||||
cx[c] -= z - x;
|
||||
if (cy) {
|
||||
while (z < y) {
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
for (a = 0; a != 6; ++a) cy[a] += cnt[a];
|
||||
cy[c] -= z - y;
|
||||
}
|
||||
} else {
|
||||
#define move_backward(_x) \
|
||||
while (z >= (_x)) { \
|
||||
--p; \
|
||||
if (*p>>6 != 2) { \
|
||||
l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \
|
||||
z -= l; cnt[*p&7] -= l; \
|
||||
l = 0; t = 0; \
|
||||
} else { \
|
||||
l |= (*p&0x3fL) << t; \
|
||||
t += 6; \
|
||||
} \
|
||||
} \
|
||||
|
||||
int t = 0;
|
||||
int64_t l = 0, z = tot;
|
||||
memcpy(cnt, ec, 48);
|
||||
p = block + 2 + *(const uint16_t*)block;
|
||||
if (cy) {
|
||||
move_backward(y)
|
||||
for (a = 0; a != 6; ++a) cy[a] += cnt[a];
|
||||
cy[*p&7] += y - z;
|
||||
}
|
||||
move_backward(x)
|
||||
for (a = 0; a != 6; ++a) cx[a] += cnt[a];
|
||||
cx[*p&7] += x - z;
|
||||
|
||||
#undef move_backward
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
#ifndef RLE6_H_
|
||||
#define RLE6_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
#else
|
||||
#define LIKELY(x) (x)
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Insert a run of rl copies of symbol a after the first x symbols of block.
 * cnt receives the per-symbol counts of the first x symbols; ec gives the
 * per-symbol totals of the block before the insertion; beg and bc are a
 * position cache that is read and updated. Returns the new byte count of the block. */
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6],
                      const int64_t ec[6], int *beg, int64_t bc[6]);

/* Same as rle_insert_cached() without a position cache. */
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6],
               const int64_t end_cnt[6]);

/* Move roughly the second half of the runs of block into new_block. */
void rle_split(uint8_t *block, uint8_t *new_block);

/* Add the per-symbol run lengths of block to cnt (cnt is not cleared). */
void rle_count(const uint8_t *block, int64_t cnt[6]);

/* Add to cx the per-symbol counts of the first x symbols of block and, when cy
 * is not NULL, add to cy those of the first max(x,y) symbols. */
void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy,
                const int64_t ec[6]);

/* Single-position form of rle_rank2a(). */
#define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)

/* Print block to stdout; expand != 0 spells out every symbol. */
void rle_print(const uint8_t *block, int expand);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/******************
|
||||
*** 43+3 codec ***
|
||||
******************/
|
||||
|
||||
extern const uint8_t rle_auxtab[8];
|
||||
|
||||
#define RLE_MIN_SPACE 18
|
||||
#define rle_nptr(block) ((uint16_t*)(block))
|
||||
|
||||
// decode one run (c,l) and move the pointer p
//
// The low 3 bits of the leading byte hold the symbol. A leading byte with the
// top bit clear is a 1-byte run (length in bits 3..6). A leading byte 110xxccc
// starts a 2-byte run; otherwise bit 4 selects a 4-byte or an 8-byte run. Every
// following byte contributes its low 6 bits to the length.
// The byte counter is named rle_dec1_n_ so that it cannot capture a caller
// argument that happens to be spelled "n".
#define rle_dec1(p, c, l) do { \
		(c) = *(p) & 7; \
		if (LIKELY((*(p)&0x80) == 0)) { \
			(l) = *(p)++ >> 3; \
		} else if (LIKELY(*(p)>>5 == 6)) { \
			(l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
			(p) += 2; \
		} else { \
			int rle_dec1_n_ = ((*(p)&0x10) >> 2) + 4; \
			(l) = *(p)++ >> 3 & 1; \
			while (--rle_dec1_n_) (l) = ((l)<<6) | (*(p)++&0x3fL); \
		} \
	} while (0)
|
||||
|
||||
// Encode one run of l copies of symbol c at p and return the number of bytes
// written: 1 for l < 2^4, 2 for l < 2^8, 4 for l < 2^19 and 8 otherwise. The
// leading byte carries the symbol in its low 3 bits plus the top length bits;
// every following byte is 10xxxxxx with 6 more length bits, most significant first.
static inline int rle_enc1(uint8_t *p, int c, int64_t l)
{
	int n_bytes, k;
	if (l < 1LL<<4) {
		p[0] = l << 3 | c;
		return 1;
	}
	if (l < 1LL<<8) {
		p[0] = 0xC0 | l >> 6 << 3 | c;
		n_bytes = 2;
	} else if (l < 1LL<<19) {
		p[0] = 0xE0 | l >> 18 << 3 | c;
		n_bytes = 4;
	} else {
		p[0] = 0xF0 | l >> 42 << 3 | c;
		n_bytes = 8;
	}
	for (k = 1; k < n_bytes; ++k)
		p[k] = 0x80 | ((l >> (6 * (n_bytes - 1 - k))) & 0x3f);
	return n_bytes;
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,318 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
|
||||
/*******************
|
||||
*** Memory Pool ***
|
||||
*******************/
|
||||
|
||||
#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
|
||||
|
||||
// memory pool for fast and compact memory allocation (no free)
typedef struct {
	int size;      // bytes per element
	int i;         // index of the next unused element in the current chunk
	int n_elems;   // elements per chunk
	int64_t top;   // index of the current chunk in mem; -1 before the first allocation
	int64_t max;   // capacity of the mem array
	uint8_t **mem; // the chunks
} mempool_t;
|
||||
|
||||
static mempool_t *mp_init(int size)
|
||||
{
|
||||
mempool_t *mp;
|
||||
mp = calloc(1, sizeof(mempool_t));
|
||||
mp->size = size;
|
||||
mp->i = mp->n_elems = MP_CHUNK_SIZE / size;
|
||||
mp->top = -1;
|
||||
return mp;
|
||||
}
|
||||
|
||||
// Release every chunk, the chunk table and the pool itself.
static void mp_destroy(mempool_t *mp)
{
	int64_t k = 0;
	while (k <= mp->top) {
		free(mp->mem[k]);
		++k;
	}
	free(mp->mem);
	free(mp);
}
|
||||
|
||||
// Return the next unused element of the pool. When the current chunk is used up
// a new zeroed chunk is allocated, doubling the chunk table if it is full.
static inline void *mp_alloc(mempool_t *mp)
{
	uint8_t *elem;
	if (mp->i == mp->n_elems) { // current chunk exhausted (or no chunk yet)
		++mp->top;
		if (mp->top == mp->max) {
			if (mp->max) mp->max <<= 1;
			else mp->max = 1;
			mp->mem = realloc(mp->mem, sizeof(void*) * mp->max);
		}
		mp->mem[mp->top] = calloc(mp->n_elems, mp->size);
		mp->i = 0;
	}
	elem = mp->mem[mp->top] + mp->i * mp->size;
	++mp->i;
	return elem;
}
|
||||
|
||||
/***************
|
||||
*** B+ rope ***
|
||||
***************/
|
||||
|
||||
rope_t *rope_init(int max_nodes, int block_len)
|
||||
{
|
||||
rope_t *rope;
|
||||
rope = calloc(1, sizeof(rope_t));
|
||||
if (block_len < 32) block_len = 32;
|
||||
rope->max_nodes = (max_nodes+ 1)>>1<<1;
|
||||
rope->block_len = (block_len + 7) >> 3 << 3;
|
||||
rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
|
||||
rope->leaf = mp_init(rope->block_len);
|
||||
rope->root = mp_alloc(rope->node);
|
||||
rope->root->n = 1;
|
||||
rope->root->is_bottom = 1;
|
||||
rope->root->p = mp_alloc(rope->leaf);
|
||||
return rope;
|
||||
}
|
||||
|
||||
// Free both memory pools of the rope and then the rope itself.
void rope_destroy(rope_t *rope)
{
	void *nodes = rope->node;
	void *leaves = rope->leaf;
	mp_destroy(nodes);
	mp_destroy(leaves);
	free(rope);
}
|
||||
|
||||
// split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
// When $u is NULL the child is the root and a new root is created above it.
// Returns $v, i.e. the node whose child now holds the first half.
static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v)
{
	int j, i = v - u;
	const int half = rope->max_nodes >> 1;
	rpnode_t *w; // $w is the sibling of $v

	if (u == 0) { // only happens at the root; add a new root
		u = v = mp_alloc(rope->node);
		// the new root has the old root as the only child
		v->n = 1;
		v->p = rope->root;
		memcpy(v->c, rope->c, 48);
		for (j = 0; j < 6; ++j)
			v->l += v->c[j];
		rope->root = v;
	}
	if (i != u->n - 1) // then make room for a new node
		memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
	++u->n;
	w = v + 1;
	memset(w, 0, sizeof(rpnode_t));
	if (u->is_bottom) w->p = mp_alloc(rope->leaf);
	else w->p = mp_alloc(rope->node);

	if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
		uint8_t *old_blk = (uint8_t*)v->p, *new_blk = (uint8_t*)w->p;
		rle_split(old_blk, new_blk);
		rle_count(new_blk, w->c);
	} else { // $v->p is a node, not a string
		rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins
		p->n -= half;
		memcpy(q, p + p->n, sizeof(rpnode_t) * half);
		// NB: these two lines must be below memcpy() as $q->n and $q->is_bottom are modified by memcpy()
		q->n = half;
		q->is_bottom = p->is_bottom;
		for (i = 0; i < q->n; ++i)
			for (j = 0; j < 6; ++j)
				w->c[j] += q[i].c[j];
	}
	for (j = 0; j < 6; ++j) { // compute $w->l and update $v->c
		w->l += w->c[j];
		v->c[j] -= w->c[j];
	}
	v->l -= w->l; // update $v->l
	return v;
}
|
||||
|
||||
// insert a run of $rl copies of $a after $x symbols in $rope and return rank(a, x)
// $cache may be NULL; otherwise it remembers the last leaf touched and is
// reset whenever a different leaf is used or the leaf is split.
int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache)
{
	// $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
	rpnode_t *u = 0, *v = 0, *p = rope->root;
	int64_t y = 0, z = 0, cnt[6];
	int n_runs;

	do { // top-down update. Searching and node splitting are done together in one pass.
		if (p->n == rope->max_nodes) { // node is full; split
			v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
			if (y + v->l < x) { // if $v is not long enough after the split, we need to move both $p and its parent $v
				y += v->l;
				z += v->c[a];
				++v;
				p = v->p;
			}
		}
		u = p;
		if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
			p += p->n - 1;
			y += v->l;
			z += v->c[a];
			while (y >= x) {
				y -= p->l;
				z -= p->c[a];
				--p;
			}
			++p;
		} else { // then search forwardly
			while (y + p->l < x) {
				y += p->l;
				z += p->c[a];
				++p;
			}
		}
		assert(p - u < u->n);
		if (v) { // we should not change p->c[a] because this may cause troubles when p's child is split
			v->c[a] += rl;
			v->l += rl;
		}
		// descend
		v = p;
		p = p->p;
	} while (!u->is_bottom);

	rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
	if (cache == 0) {
		n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
	} else {
		if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
		n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
		cache->p = (uint8_t*)p;
	}
	z += cnt[a];
	// this should be after rle_insert(); otherwise rle_insert() won't work
	v->c[a] += rl;
	v->l += rl;
	if (n_runs + RLE_MIN_SPACE > rope->block_len) { // leaf nearly full: split it now
		split_node(rope, u, v);
		if (cache) memset(cache, 0, sizeof(rpcache_t));
	}
	return z;
}
|
||||
|
||||
static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest)
|
||||
{
|
||||
rpnode_t *u, *v = 0, *p = rope->root;
|
||||
int64_t y = 0;
|
||||
int a;
|
||||
|
||||
memset(cx, 0, 48);
|
||||
do {
|
||||
u = p;
|
||||
if (v && x - y > v->l>>1) {
|
||||
p += p->n - 1; y += v->l;
|
||||
for (a = 0; a != 6; ++a) cx[a] += v->c[a];
|
||||
for (; y >= x; --p) {
|
||||
y -= p->l;
|
||||
for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
|
||||
}
|
||||
++p;
|
||||
} else {
|
||||
for (; y + p->l < x; ++p) {
|
||||
y += p->l;
|
||||
for (a = 0; a != 6; ++a) cx[a] += p->c[a];
|
||||
}
|
||||
}
|
||||
v = p; p = p->p;
|
||||
} while (!u->is_bottom);
|
||||
*rest = x - y;
|
||||
return v;
|
||||
}
|
||||
|
||||
void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
|
||||
{
|
||||
rpnode_t *v;
|
||||
int64_t rest;
|
||||
v = rope_count_to_leaf(rope, x, cx, &rest);
|
||||
if (y < x || cy == 0) {
|
||||
rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
|
||||
} else if (rest + (y - x) <= v->l) {
|
||||
memcpy(cy, cx, 48);
|
||||
rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c);
|
||||
} else {
|
||||
rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
|
||||
v = rope_count_to_leaf(rope, y, cy, &rest);
|
||||
rle_rank1a((const uint8_t*)v->p, rest, cy, v->c);
|
||||
}
|
||||
}
|
||||
|
||||
/*********************
|
||||
*** Rope iterator ***
|
||||
*********************/
|
||||
|
||||
// Reset the iterator and position it at the leftmost bottom-level bucket.
void rope_itr_first(const rope_t *rope, rpitr_t *i)
{
	const rpnode_t *node = rope->root;
	memset(i, 0, sizeof(rpitr_t));
	i->rope = rope;
	i->pa[0] = node;
	while (!node->is_bottom) { // descend to the leftmost leaf
		node = node->p;
		++i->d;
		i->pa[i->d] = node;
	}
}
|
||||
|
||||
// Return the leaf block at the current position and advance the iterator to
// the next one. Returns NULL once every block has been returned.
const uint8_t *rope_itr_next_block(rpitr_t *i)
{
	const uint8_t *blk;
	assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
	if (i->d < 0) return 0;
	blk = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
	// backtracking: leave every bucket whose last entry has been consumed
	while (i->d >= 0) {
		if (++i->ia[i->d] != i->pa[i->d]->n) break;
		i->ia[i->d] = 0;
		--i->d;
	}
	if (i->d >= 0) {
		while (!i->pa[i->d]->is_bottom) { // descend to the leftmost leaf
			++i->d;
			i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
		}
	}
	return blk;
}
|
||||
|
||||
/***********
|
||||
*** I/O ***
|
||||
***********/
|
||||
|
||||
void rope_print_node(const rpnode_t *p)
|
||||
{
|
||||
if (p->is_bottom) {
|
||||
int i;
|
||||
putchar('(');
|
||||
for (i = 0; i < p->n; ++i) {
|
||||
uint8_t *block = (uint8_t*)p[i].p;
|
||||
const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block);
|
||||
if (i) putchar(',');
|
||||
while (q < end) {
|
||||
int c = 0;
|
||||
int64_t j, l;
|
||||
rle_dec1(q, c, l);
|
||||
for (j = 0; j < l; ++j) putchar("$ACGTN"[c]);
|
||||
}
|
||||
}
|
||||
putchar(')');
|
||||
} else {
|
||||
int i;
|
||||
putchar('(');
|
||||
for (i = 0; i < p->n; ++i) {
|
||||
if (i) putchar(',');
|
||||
rope_print_node(p[i].p);
|
||||
}
|
||||
putchar(')');
|
||||
}
|
||||
}
|
||||
|
||||
void rope_dump_node(const rpnode_t *p, FILE *fp)
|
||||
{
|
||||
int16_t i, n = p->n;
|
||||
uint8_t is_bottom = p->is_bottom;
|
||||
fwrite(&is_bottom, 1, 1, fp);
|
||||
fwrite(&n, 2, 1, fp);
|
||||
if (is_bottom) {
|
||||
for (i = 0; i < n; ++i) {
|
||||
fwrite(p[i].c, 8, 6, fp);
|
||||
fwrite(p[i].p, 1, *rle_nptr(p[i].p) + 2, fp);
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < p->n; ++i)
|
||||
rope_dump_node(p[i].p, fp);
|
||||
}
|
||||
}
|
||||
|
||||
void rope_dump(const rope_t *r, FILE *fp)
|
||||
{
|
||||
fwrite(&r->max_nodes, 4, 1, fp);
|
||||
fwrite(&r->block_len, 4, 1, fp);
|
||||
rope_dump_node(r->root, fp);
|
||||
}
|
||||
|
||||
rpnode_t *rope_restore_node(const rope_t *r, FILE *fp, int64_t c[6])
|
||||
{
|
||||
uint8_t is_bottom, a;
|
||||
int16_t i, n;
|
||||
rpnode_t *p;
|
||||
fread(&is_bottom, 1, 1, fp);
|
||||
fread(&n, 2, 1, fp);
|
||||
p = mp_alloc(r->node);
|
||||
p->is_bottom = is_bottom, p->n = n;
|
||||
if (is_bottom) {
|
||||
for (i = 0; i < n; ++i) {
|
||||
uint16_t *q;
|
||||
p[i].p = mp_alloc(r->leaf);
|
||||
q = rle_nptr(p[i].p);
|
||||
fread(p[i].c, 8, 6, fp);
|
||||
fread(q, 2, 1, fp);
|
||||
fread(q + 1, 1, *q, fp);
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; ++i)
|
||||
p[i].p = rope_restore_node(r, fp, p[i].c);
|
||||
}
|
||||
memset(c, 0, 48);
|
||||
for (i = 0; i < n; ++i) {
|
||||
p[i].l = 0;
|
||||
for (a = 0; a < 6; ++a)
|
||||
c[a] += p[i].c[a], p[i].l += p[i].c[a];
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
rope_t *rope_restore(FILE *fp)
|
||||
{
|
||||
rope_t *r;
|
||||
r = calloc(1, sizeof(rope_t));
|
||||
fread(&r->max_nodes, 4, 1, fp);
|
||||
fread(&r->block_len, 4, 1, fp);
|
||||
r->node = mp_init(sizeof(rpnode_t) * r->max_nodes);
|
||||
r->leaf = mp_init(r->block_len);
|
||||
r->root = rope_restore_node(r, fp, r->c);
|
||||
return r;
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
#ifndef ROPE_H_
|
||||
#define ROPE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define ROPE_MAX_DEPTH 80
|
||||
#define ROPE_DEF_MAX_NODES 64
|
||||
#define ROPE_DEF_BLOCK_LEN 512
|
||||
|
||||
/* One entry of a node bucket in the B+ tree. */
typedef struct rpnode_s {
	struct rpnode_s *p;    // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs)
	uint64_t l:54;         // number of symbols below this entry
	uint64_t n:9;          // number of entries in the bucket
	uint64_t is_bottom:1;  // $n and $is_bottom are only set for the first node in a bucket
	int64_t c[6];          // marginal counts
} rpnode_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t max_nodes, block_len; // both MUST BE even numbers
|
||||
int64_t c[6]; // marginal counts
|
||||
rpnode_t *root;
|
||||
void *node, *leaf; // memory pool
|
||||
} rope_t;
|
||||
|
||||
typedef struct {
|
||||
const rope_t *rope; // the rope
|
||||
const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes
|
||||
int ia[ROPE_MAX_DEPTH]; // index in the parent nodes
|
||||
int d; // the current depth in the B+-tree
|
||||
} rpitr_t;
|
||||
|
||||
/* Insertion cache passed to rope_insert_run(); an all-zero cache is empty. */
typedef struct {
	int beg;       // byte offset handed to rle_insert_cached() as its $beg
	int64_t bc[6]; // counts handed to rle_insert_cached() as its $bc
	uint8_t *p;    // the leaf block the two fields above refer to
} rpcache_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
rope_t *rope_init(int max_nodes, int block_len);
|
||||
void rope_destroy(rope_t *rope);
|
||||
int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache);
|
||||
void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy);
|
||||
#define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0)
|
||||
|
||||
void rope_itr_first(const rope_t *rope, rpitr_t *i);
|
||||
const uint8_t *rope_itr_next_block(rpitr_t *i);
|
||||
|
||||
void rope_print_node(const rpnode_t *p);
|
||||
void rope_dump(const rope_t *r, FILE *fp);
|
||||
rope_t *rope_restore(FILE *fp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
#ifndef SCALAR_SSE_H
|
||||
#define SCALAR_SSE_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Plain-C stand-in for the 128-bit SSE register type. */
typedef union m128i {
	uint8_t u8[16];  /* view as sixteen unsigned bytes */
	int16_t i16[8];  /* view as eight signed 16-bit lanes */
} __m128i;
|
||||
|
||||
/* Broadcast the 32-bit integer n to all four 32-bit lanes.
 * Each lane is written as a whole integer, so any n is handled; filling
 * byte-wise would only be right for values whose four bytes are equal, such as 0. */
static inline __m128i _mm_set1_epi32(int32_t n) {
	const int32_t lanes[4] = { n, n, n, n };
	__m128i r;
	memcpy(&r, lanes, sizeof r);
	return r;
}
|
||||
|
||||
/* Return a copy of the 16 bytes stored at ptr. */
static inline __m128i _mm_load_si128(const __m128i *ptr)
{
	__m128i loaded;
	memcpy(&loaded, ptr, sizeof loaded);
	return loaded;
}
|
||||
/* Copy the 16 bytes of a to the location ptr points to. */
static inline void _mm_store_si128(__m128i *ptr, __m128i a)
{
	memcpy(ptr, &a, sizeof a);
}
|
||||
|
||||
/* Return 1 when all sixteen bytes of a are zero, 0 otherwise. */
static inline int m128i_allzero(__m128i a) {
	int i;
	for (i = 0; i < 16; i++)
		if (a.u8[i] != 0) return 0;
	return 1;
}
|
||||
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int n) {
|
||||
int i;
|
||||
memmove(&a.u8[n], &a.u8[0], 16 - n);
|
||||
for (i = 0; i < n; i++) a.u8[i] = 0;
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 16; i++) {
|
||||
uint16_t aa = a.u8[i];
|
||||
aa += b.u8[i];
|
||||
a.u8[i] = (aa < 256)? aa : 255;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_max_epu8(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 16; i++)
|
||||
if (a.u8[i] < b.u8[i]) a.u8[i] = b.u8[i];
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline uint8_t m128i_max_u8(__m128i a) {
|
||||
uint8_t max = 0;
|
||||
int i;
|
||||
for (i = 0; i < 16; i++)
|
||||
if (max < a.u8[i]) max = a.u8[i];
|
||||
return max;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set1_epi8(int8_t n) { __m128i r; memset(&r, n, sizeof r); return r; }
|
||||
|
||||
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 16; i++) {
|
||||
int16_t aa = a.u8[i];
|
||||
aa -= b.u8[i];
|
||||
a.u8[i] = (aa >= 0)? aa : 0;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 8; i++) {
|
||||
int32_t aa = a.i16[i];
|
||||
aa += b.i16[i];
|
||||
a.i16[i] = (aa < 32768)? aa : 32767;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 8; i++)
|
||||
a.i16[i] = (a.i16[i] > b.i16[i])? 0xffff : 0x0000;
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_max_epi16(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 8; i++)
|
||||
if (a.i16[i] < b.i16[i]) a.i16[i] = b.i16[i];
|
||||
return a;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set1_epi16(int16_t n) {
|
||||
__m128i r;
|
||||
r.i16[0] = r.i16[1] = r.i16[2] = r.i16[3] =
|
||||
r.i16[4] = r.i16[5] = r.i16[6] = r.i16[7] = n;
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline int16_t m128i_max_s16(__m128i a) {
|
||||
int16_t max = -32768;
|
||||
int i;
|
||||
for (i = 0; i < 8; i++)
|
||||
if (max < a.i16[i]) max = a.i16[i];
|
||||
return max;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) {
|
||||
int i;
|
||||
for (i = 0; i < 8; i++) {
|
||||
int32_t aa = a.i16[i];
|
||||
aa -= b.i16[i];
|
||||
a.i16[i] = (aa >= 0)? aa : 0;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,306 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#define FSYNC_ON_FLUSH
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <errno.h>
|
||||
#ifdef FSYNC_ON_FLUSH
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <sys/resource.h>
|
||||
#include <sys/time.h>
|
||||
#include "utils.h"
|
||||
|
||||
#include "ksort.h"
|
||||
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
|
||||
KSORT_INIT(128, pair64_t, pair64_lt)
|
||||
KSORT_INIT(64, uint64_t, ks_lt_generic)
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT2(, gzFile, err_gzread)
|
||||
|
||||
/********************
|
||||
* System utilities *
|
||||
********************/
|
||||
|
||||
/*
 * fopen() wrapper that never returns NULL.
 * A file name of "-" selects stdin when mode contains 'r', stdout otherwise.
 * On failure the error is reported under the name `func` via err_fatal().
 */
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
	FILE *stream;

	if (strcmp(fn, "-") == 0) {
		if (strstr(mode, "r") != NULL) return stdin;
		return stdout;
	}
	stream = fopen(fn, mode);
	if (stream == NULL)
		err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
	return stream;
}
|
||||
|
||||
/*
 * freopen() wrapper: re-associates fp with file fn in the given mode and
 * reports a failure under the name `func` via err_fatal(). Returns fp.
 */
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
{
	FILE *reopened = freopen(fn, mode, fp);

	if (reopened == NULL)
		err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
	return fp;
}
|
||||
|
||||
/*
 * gzopen()/gzdopen() wrapper that never returns NULL.
 * A file name of "-" wraps stdin when mode contains 'r', stdout otherwise.
 * Any failure is reported under the name `func` via err_fatal().
 */
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
{
	gzFile fp;
	if (strcmp(fn, "-") == 0) {
		fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode);
		/* According to zlib.h, this is the only reason gzdopen can fail */
		if (!fp) err_fatal(func, "Out of memory");
		return fp;
	}
	/* The message below uses errno to tell "could not open" apart from an
	   allocation failure, so clear it first: otherwise a stale value left
	   by an earlier call would be reported as the reason. */
	errno = 0;
	if ((fp = gzopen(fn, mode)) == 0) {
		err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory");
	}
	return fp;
}
|
||||
|
||||
/*
 * Print "[header] <formatted message>\n" to stderr and terminate the
 * process with EXIT_FAILURE. fmt and the variadic arguments follow printf
 * conventions.
 */
void err_fatal(const char *header, const char *fmt, ...)
{
	va_list ap;

	fprintf(stderr, "[%s] ", header);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputs("\n", stderr);
	exit(EXIT_FAILURE);
}
|
||||
|
||||
/*
 * Print "[header] <formatted message> Abort!\n" to stderr and call abort().
 * fmt and the variadic arguments follow printf conventions.
 */
void err_fatal_core(const char *header, const char *fmt, ...)
{
	va_list ap;

	fprintf(stderr, "[%s] ", header);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputs(" Abort!\n", stderr);
	abort();
}
|
||||
|
||||
/* Print "[func] msg" on its own line to stderr, then exit with EXIT_FAILURE. */
void _err_fatal_simple(const char *func, const char *msg)
{
	FILE *out = stderr;

	fprintf(out, "[%s] %s\n", func, msg);
	exit(EXIT_FAILURE);
}
|
||||
|
||||
/* Print "[func] msg Abort!" on its own line to stderr, then call abort(). */
void _err_fatal_simple_core(const char *func, const char *msg)
{
	FILE *out = stderr;

	fprintf(out, "[%s] %s Abort!\n", func, msg);
	abort();
}
|
||||
|
||||
/*
 * fwrite() wrapper: a short write is fatal (reported as "fwrite" with the
 * current errno text). Returns the number of items written.
 */
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
	size_t written = fwrite(ptr, size, nmemb, stream);

	if (written == nmemb) return written;
	_err_fatal_simple("fwrite", strerror(errno));
	return written;
}
|
||||
|
||||
/*
 * fread() wrapper for callers that require all nmemb items: a short read is
 * fatal, reported with the errno text when the stream's error flag is set
 * and as "Unexpected end of file" otherwise. Returns the item count read.
 */
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
{
	size_t got = fread(ptr, size, nmemb, stream);

	if (got != nmemb) {
		const char *why;
		if (ferror(stream)) why = strerror(errno);
		else why = "Unexpected end of file";
		_err_fatal_simple("fread", why);
	}
	return got;
}
|
||||
|
||||
/*
 * gzread() wrapper: a negative return is fatal. The message is the errno
 * text when zlib reports Z_ERRNO, otherwise zlib's own error string.
 * Returns the value returned by gzread().
 */
int err_gzread(gzFile file, void *ptr, unsigned int len)
{
	int nread = gzread(file, ptr, len);

	if (nread < 0) {
		int zerr = 0;
		const char *zmsg = gzerror(file, &zerr);
		const char *why = (zerr == Z_ERRNO)? strerror(errno) : zmsg;
		_err_fatal_simple("gzread", why);
	}
	return nread;
}
|
||||
|
||||
/* fseek() wrapper: any non-zero result is fatal (reported as "fseek"). */
int err_fseek(FILE *stream, long offset, int whence)
{
	int status = fseek(stream, offset, whence);

	if (status == 0) return status;
	_err_fatal_simple("fseek", strerror(errno));
	return status;
}
|
||||
|
||||
/* ftell() wrapper: a result of -1 is fatal (reported as "ftell"). */
long err_ftell(FILE *stream)
{
	long pos = ftell(stream);

	if (pos != -1) return pos;
	_err_fatal_simple("ftell", strerror(errno));
	return pos;
}
|
||||
|
||||
/*
 * printf() to stdout with error checking: a negative result from vfprintf()
 * is fatal. Returns the value returned by vfprintf().
 */
int err_printf(const char *format, ...)
{
	va_list ap;
	int written, saved_errno;

	va_start(ap, format);
	written = vfprintf(stdout, format, ap);
	saved_errno = errno; /* captured before va_end() runs */
	va_end(ap);
	if (written < 0)
		_err_fatal_simple("vfprintf(stdout)", strerror(saved_errno));
	return written;
}
|
||||
|
||||
/*
 * fprintf() with error checking: a negative result from vfprintf() is
 * fatal. Returns the value returned by vfprintf().
 */
int err_fprintf(FILE *stream, const char *format, ...)
{
	va_list ap;
	int written, saved_errno;

	va_start(ap, format);
	written = vfprintf(stream, format, ap);
	saved_errno = errno; /* captured before va_end() runs */
	va_end(ap);
	if (written < 0)
		_err_fatal_simple("vfprintf", strerror(saved_errno));
	return written;
}
|
||||
|
||||
/* putc() wrapper: an EOF result is fatal (reported as "fputc"). */
int err_fputc(int c, FILE *stream)
{
	int status = putc(c, stream);

	if (status != EOF) return status;
	_err_fatal_simple("fputc", strerror(errno));
	return status;
}
|
||||
|
||||
/* fputs() wrapper: an EOF result is fatal (reported as "fputs"). */
int err_fputs(const char *s, FILE *stream)
{
	int status = fputs(s, stream);

	if (status != EOF) return status;
	_err_fatal_simple("fputs", strerror(errno));
	return status;
}
|
||||
|
||||
/* puts() wrapper: an EOF result is fatal (reported as "puts"). */
int err_puts(const char *s)
{
	int status = puts(s);

	if (status != EOF) return status;
	_err_fatal_simple("puts", strerror(errno));
	return status;
}
|
||||
|
||||
/*
 * fflush() wrapper: a flush failure is fatal. When FSYNC_ON_FLUSH is
 * defined, a regular file is additionally fsync()ed, and a failure of
 * fstat() or fsync() is fatal as well. Returns the fflush() result.
 */
int err_fflush(FILE *stream)
{
	int status = fflush(stream);

	if (status != 0) _err_fatal_simple("fflush", strerror(errno));

#ifdef FSYNC_ON_FLUSH
	/* fflush() only pushes the data into the kernel buffers. On remote
	   filesystems (e.g. NFS, lustre) an error may still occur while the
	   kernel copies the buffered data to the file server, so fsync() the
	   descriptor to catch that -- but only for regular files. */
	{
		struct stat info;
		int fd = fileno(stream);

		if (fstat(fd, &info) != 0)
			_err_fatal_simple("fstat", strerror(errno));
		if (S_ISREG(info.st_mode) && fsync(fd) != 0)
			_err_fatal_simple("fsync", strerror(errno));
	}
#endif
	return status;
}
|
||||
|
||||
/* fclose() wrapper: a non-zero result is fatal (reported as "fclose"). */
int err_fclose(FILE *stream)
{
	int status = fclose(stream);

	if (status == 0) return status;
	_err_fatal_simple("fclose", strerror(errno));
	return status;
}
|
||||
|
||||
/*
 * gzclose() wrapper: any result other than Z_OK is fatal. The message is
 * the errno text for Z_ERRNO and zError(ret) for every other code.
 */
int err_gzclose(gzFile file)
{
	int status = gzclose(file);

	if (status != Z_OK) {
		const char *why = (status == Z_ERRNO)? strerror(errno) : zError(status);
		_err_fatal_simple("gzclose", why);
	}
	return status;
}
|
||||
|
||||
/*********
|
||||
* Timer *
|
||||
*********/
|
||||
|
||||
/*
 * CPU time consumed by this process, in seconds: user plus system time as
 * reported by getrusage(RUSAGE_SELF).
 */
double cputime(void)
{
	struct rusage usage;
	const struct timeval *user = &usage.ru_utime;
	const struct timeval *sys = &usage.ru_stime;

	getrusage(RUSAGE_SELF, &usage);
	return user->tv_sec + sys->tv_sec + 1e-6 * (user->tv_usec + sys->tv_usec);
}
|
||||
|
||||
/*
 * Wall-clock time in seconds (with microsecond resolution) as reported by
 * gettimeofday().
 */
double realtime(void)
{
	struct timeval tp;
	/* The timezone argument is obsolete and its value was never used here;
	   POSIX leaves the behaviour unspecified when it is non-NULL. */
	gettimeofday(&tp, NULL);
	return tp.tv_sec + tp.tv_usec * 1e-6;
}
|
||||
|
||||
/*
 * Peak resident set size of this process, taken from ru_maxrss of
 * getrusage(RUSAGE_SELF). On Linux the value is multiplied by 1024
 * (presumably because ru_maxrss is in kilobytes there -- verify against
 * the platform's getrusage documentation); elsewhere it is returned as-is.
 */
long peakrss(void)
{
	struct rusage usage;
	long scale = 1;

	getrusage(RUSAGE_SELF, &usage);
#ifdef __linux__
	scale = 1024;
#endif
	return usage.ru_maxrss * scale;
}
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||
2009-2018 Broad Institute, Inc.
|
||||
2008-2009 Genome Research Ltd. (GRL)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#ifndef LH3_UTILS_H
|
||||
#define LH3_UTILS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
// Tell GCC to validate printf format string and args
|
||||
#define ATTRIBUTE(list) __attribute__ (list)
|
||||
#else
|
||||
#define ATTRIBUTE(list)
|
||||
#endif
|
||||
|
||||
#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg)
|
||||
#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg)
|
||||
|
||||
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
|
||||
#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
|
||||
#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
|
||||
|
||||
/* Abort with msg (via _err_fatal_simple_core) when cond evaluates to 0.
   Wrapped in do/while(0) so the macro is a single statement and cannot
   capture a following `else`. */
#define xassert(cond, msg) do { if ((cond) == 0) _err_fatal_simple_core(__func__, msg); } while (0)
|
||||
|
||||
/* A pair of unsigned 64-bit integers. */
typedef struct {
	uint64_t x;
	uint64_t y;
} pair64_t;

/* Array-with-counters containers for uint64_t and pair64_t elements.
   NOTE(review): n and m look like the usual "used" / "allocated" counters
   of a growable vector -- confirm against the code that fills them. */
typedef struct {
	size_t n, m;
	uint64_t *a;
} uint64_v;

typedef struct {
	size_t n, m;
	pair64_t *a;
} pair64_v;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
|
||||
void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
|
||||
void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn));
|
||||
void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn));
|
||||
FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
|
||||
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp);
|
||||
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode);
|
||||
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
|
||||
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
|
||||
|
||||
int err_gzread(gzFile file, void *ptr, unsigned int len);
|
||||
int err_fseek(FILE *stream, long offset, int whence);
|
||||
#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET)
|
||||
long err_ftell(FILE *stream);
|
||||
int err_fprintf(FILE *stream, const char *format, ...)
|
||||
ATTRIBUTE((format(printf, 2, 3)));
|
||||
int err_printf(const char *format, ...)
|
||||
ATTRIBUTE((format(printf, 1, 2)));
|
||||
int err_fputc(int c, FILE *stream);
|
||||
#define err_putchar(C) err_fputc((C), stdout)
|
||||
int err_fputs(const char *s, FILE *stream);
|
||||
int err_puts(const char *s);
|
||||
int err_fflush(FILE *stream);
|
||||
int err_fclose(FILE *stream);
|
||||
int err_gzclose(gzFile file);
|
||||
|
||||
double cputime(void);
|
||||
double realtime(void);
|
||||
long peakrss(void);
|
||||
|
||||
void ks_introsort_64 (size_t n, uint64_t *a);
|
||||
void ks_introsort_128(size_t n, pair64_t *a);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Mix the bits of a 64-bit key through a fixed sequence of shift, add and
 * xor steps and return the resulting 64-bit value. All arithmetic is
 * unsigned and wraps modulo 2^64.
 */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;

	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env perl

# Reads tab-separated alignment records and copies every line to the output.
# For a line carrying an XA:Z: tag, one extra record is printed per tag entry
# of the form "chr,<+|->pos,CIGAR,NM;", with flag bit 0x100 set and the
# strand bit 0x10 taken from the sign of the entry's position.

use strict;
use warnings;

while (<>) {
	if (/\tXA:Z:(\S+)/) {
		my $l = $1;
		print;
		my @t = split("\t");
		# In the primary record '=' in column 7 means "same reference as
		# column 3", so resolve it to a real name before comparing it with
		# the reference of each alternative hit.
		my $mate = ($t[6] eq '=')? $t[2] : $t[6];
		while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) {
			# '=' only when the mate really lies on this hit's reference
			my $mchr = ($mate eq $1)? '=' : $mate; # FIXME: TLEN/ISIZE is not calculated!
			my $seq = $t[9];
			my $phred = $t[10];
			# if alternative alignment has other orientation than primary,
			# then print the reverse (complement) of sequence and phred string
			if ((($t[1]&0x10)>0) xor ($2<0)) {
				$seq = reverse $seq;
				$seq =~ tr/ACGTacgt/TGCAtgca/;
				$phred = reverse $phred;
			}
			print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, $mchr, $t[7], 0, $seq, $phred, "NM:i:$4"), "\n");
		}
	} else { print; }
}
|
||||
Loading…
Reference in New Issue