基于bwa Release bwa-0.7.19 (r1273) 开始实现hybrid-index

This commit is contained in:
zzh 2025-10-23 10:33:55 +08:00
parent e10ac34984
commit b5ee622884
83 changed files with 25911 additions and 2 deletions

8
.gitignore vendored
View File

@ -1,3 +1,11 @@
*.[oa]
bwa
test
test64
.*.swp
Makefile.bak
bwamem-lite
# ---> C
# Prerequisites
*.d

122
.vscode/launch.json vendored 100644
View File

@ -0,0 +1,122 @@
{
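// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387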
"version": "0.2.0",
"configurations": [
{
"name": "bwa-mem",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"mem",
"-t",
"1",
"-M",
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/data/fmt_ref/human_g1k_v37_decoy.fasta",
"./b1.fq",
"./b2.fq",
//"./b1.fq",
//"~/data/dataset/real/D1/n1.fq",
//"~/data/dataset/real/D1/n2.fq",
//"~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq",
//"~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq",
//"~/data/dataset/real/D3/n1.fq",
//"~/data/dataset/real/D3/n2.fq",
//"~/data/dataset/real/D1/n1.fq.gz",
//"~/data/dataset/real/D1/n2.fq.gz",
//"~/data/dataset/real/D3/1w1.fq",
//"~/data/dataset/real/D3/1w2.fq",
"-o",
"/dev/null",
//"-Z",
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
{
"name": "index",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"index",
"~/data/reference/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
{
"name": "buildkmer",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"buildkmer",
"~/data/reference/human_g1k_v37_decoy.fasta.256.64.fmt",
"~/data/reference/human_g1k_v37_decoy.fasta.kmer"
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
{
"name": "share mem",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"shm",
"-Z",
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
{
"name": "pac to bref",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"pac2bref",
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
{
"name": "build hybrid index",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"bwt2hybrid",
"-e",
"-t",
"1",
"~/data/fmt_ref/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
{
"name": "train hybrid index",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"args": [
"trainhybrid",
"-t",
"1",
"~/data/fmt_ref/human_g1k_v37_decoy.fasta",
"~/data/dataset/real/D1/n1.fq.gz",
"~/data/dataset/real/D1/n2.fq.gz"
],
"cwd": "${workspaceFolder}", // working directory for the debugged program
},
]
}

75
.vscode/settings.json vendored 100644
View File

@ -0,0 +1,75 @@
{
"files.associations": {
"random": "c",
"bwt.h": "c",
"bwa.h": "c",
"*.tcc": "c",
"functional": "c",
"string_view": "c",
"istream": "c",
"limits": "c",
"bit": "c",
"numeric": "c",
"typeinfo": "c",
"yarn.h": "c",
"malloc_wrap.h": "c",
"emmintrin.h": "c",
"bwamem.h": "c",
"utils.h": "c",
"stdio.h": "c",
"kvec.h": "c",
"string.h": "c",
"stdlib.h": "c",
"array": "c",
"initializer_list": "c",
"utility": "c",
"fmt_idx.h": "c",
"profiling.h": "c",
"neon_sse.h": "c",
"scalar_sse.h": "c",
"immintrin.h": "c",
"ksw.h": "c",
"debug.h": "c",
"type_traits": "c",
"cstdint": "c",
"bitset": "c",
"iterator": "c",
"memory": "c",
"__locale": "c",
"stdint.h": "c",
"bntseq.h": "c",
"inttypes.h": "c",
"ertindex.h": "c",
"ertseeding.h": "c",
"algorithm": "c",
"filesystem": "c",
"chrono": "c",
"queue": "c",
"limits.h": "c",
"deque": "c",
"string": "c",
"unordered_map": "c",
"vector": "c",
"__bit_reference": "c",
"__hash_table": "c",
"__split_buffer": "c",
"compare": "c",
"ratio": "c",
"tuple": "c",
"__functional_base": "c",
"__functional_base_03": "c",
"__tuple": "c",
"cassert": "c",
"climits": "c",
"__threading_support": "c",
"optional": "c",
"semaphore": "c",
"ios": "c",
"hybrid_idx.h": "c",
"assert.h": "c",
"share_mem.h": "c",
"kseq.h": "c",
"ostream": "c",
"streambuf": "c"
}
}

17
.vscode/tasks.json vendored 100644
View File

@ -0,0 +1,17 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
"version": "2.0.0",
"tasks": [
{
"label": "Build",
"type": "shell",
"command": "make clean; make -j 16",
"problemMatcher": [],
"group": {
"kind": "build",
"isDefault": true
}
}
]
}

674
COPYING 100644
View File

@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

3864
ChangeLog 100644

File diff suppressed because it is too large Load Diff

96
Makefile 100644
View File

@ -0,0 +1,96 @@
# Toolchain and flags. The commented-out CC line switches to clang's static analyzer.
CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -Wno-unused-function -O3
# Defines USE_MALLOC_WRAPPERS on every compile (it is folded into DFLAGS below).
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
# LOBJS: objects archived into libbwa.a.
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
# AOBJS: objects linked directly into the bwa executable, next to main.o and libbwa.a.
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwape.o kopen.o pemerge.o maxk.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
# PROG is what the default target `all` builds.
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
# Link against librt on Linux and GNU/kFreeBSD.
ifeq ($(shell uname -s),Linux)
LIBS += -lrt
endif
ifeq ($(shell uname -s),GNU/kFreeBSD)
LIBS += -lrt
endif
# Setting the make variable `asan` to any non-empty value (e.g. `make asan=1`)
# adds AddressSanitizer to both the compile and the link flags.
ifneq ($(asan),)
CFLAGS+=-fsanitize=address
LIBS+=-fsanitize=address -ldl
endif
# Suffix rule: compile each .c file into the matching .o file.
.SUFFIXES:.c .o .cc
.c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $(CPPFLAGS) $< -o $@
all:$(PROG)
# Main executable: AOBJS + main.o linked against libbwa.a.
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
# Optional program built from example.o and libbwa.a; not part of `all`.
bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
# Static library holding LOBJS.
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
# Remove build products.
clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a
# Regenerate the header dependency list that follows the "DO NOT DELETE" marker line.
depend:
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c )
# DO NOT DELETE THIS LINE -- make depend depends on it.
QSufSort.o: QSufSort.h
bamlite.o: bamlite.h malloc_wrap.h
bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kvec.h
bwa.o: kseq.h
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h utils.h kbtree.h
bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
bwamem_pair.o: utils.h ksw.h
bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
bwape.o: ksw.h khash.h
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
bwase.o: bwa.h ksw.h
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
bwashm.o: bwa.h bntseq.h bwt.h
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
bwtindex.o: bntseq.h bwa.h bwt.h utils.h rle.h rope.h malloc_wrap.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
bwtsw2_core.o: khash.h ksort.h
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
bwtsw2_pair.o: malloc_wrap.h ksw.h
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
is.o: malloc_wrap.h
kopen.o: malloc_wrap.h
kstring.o: kstring.h malloc_wrap.h
ksw.o: scalar_sse.h ksw.h malloc_wrap.h
main.o: kstring.h malloc_wrap.h utils.h
malloc_wrap.o: malloc_wrap.h
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
rle.o: rle.h
rope.o: rle.h rope.h
utils.o: utils.h ksort.h malloc_wrap.h kseq.h

1252
NEWS.md 100644

File diff suppressed because it is too large Load Diff

402
QSufSort.c 100644
View File

@ -0,0 +1,402 @@
/* QSufSort.c
Original source from qsufsort.c
Copyright 1999, N. Jesper Larsson, all rights reserved.
This file contains an implementation of the algorithm presented in "Faster
Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
Sadakane (sada@is.s.u-tokyo.ac.jp).
This software may be used freely for any purpose. However, when distributed,
the original source must be clearly stated, and, when the source code is
distributed, the copyright notice must be retained and any alterations in
the code must be clearly marked. No warranty is given regarding the quality
of this software.
Modified by Wong Chi-Kwong, 2004
Changes summary: - Used long variable and function names
- Removed global variables
- Replace pointer references with array references
- Used insertion sort in place of selection sort and increased insertion sort threshold
- Reconstructing suffix array from inverse becomes an option
- Add handling where end-of-text symbol is not necessary < all characters
- Removed codes for supporting alphabet size > number of characters
No warrenty is given regarding the quality of the modifications.
*/
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "QSufSort.h"
/* Smaller of two values. Function-like macro: each argument may be evaluated
   more than once, so do not pass expressions with side effects. */
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
/* Median of three values. Every argument is parenthesized so that expression
   arguments (e.g. x + 1 or x | y) keep their intended grouping. Arguments may
   be evaluated more than once. */
#define med3(a, b, c) ( (a) < (b) ? ((b) < (c) ? (b) : (a) < (c) ? (c) : (a)) \
                                  : ((b) > (c) ? (b) : (a) > (c) ? (c) : (a)) )
/* Exchange a and b using t as scratch storage. Wrapped in do { } while (0) so
   that `swap(x, y, t);` is exactly one statement and is safe as the body of an
   unbraced if/else or loop. */
#define swap(a, b, t) do { (t) = (a); (a) = (b); (b) = (t); } while (0)
// Forward declarations of the file-local helpers defined further down.
// In all of them V and I are the two working arrays of the suffix sort.

// Ternary-split quicksort of one unsorted group I[lowestPos..highestPos].
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
// Returns the pivot key used by QSufSortSortSplit.
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
// Insertion-sort variant used for groups of at most INSERT_SORT_NUM_ITEM entries.
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
// Initial bucket sort on the transformed alphabet.
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
// Packs/compacts the input alphabet; returns the new alphabet size.
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
/* Suffix-sorts the text held in V, using I as the working array. V and I are
both of size numChar+1. On entry V[0...numChar-1] holds symbols in the range
smallestInputSymbol...largestInputSymbol. The original content of V[numChar] is
disregarded, position numChar being regarded as an end-of-string symbol smaller
than all other symbols. On return V is the inverse of the suffix array.
I is used to track sorted/unsorted groups while sorting: a negative entry -k at
the start of a run marks k consecutive already-sorted positions.

skipTransform: when zero, the alphabet is first transformed and bucket-sorted
here. When non-zero both steps are skipped and refinement starts at depth 1.
NOTE(review): in that case the caller presumably supplies V and I already in
the grouped form the main loop expects -- confirm against callers. */
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const int skipTransform)
{
qsint_t i, j;
qsint_t s, negatedSortedGroupLength;
qsint_t numSymbolAggregated;
/* Number of leading symbols by which groups are currently known to be ordered. */
qsint_t numSortedPos = 1;
qsint_t newAlphabetSize;
if (!skipTransform) {
/* bucketing possible*/
newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
numChar, &numSymbolAggregated);
QSufSortBucketSort(V, I, numChar, newAlphabetSize);
/* Slot 0 is marked as a sorted group of length 1 and the terminator gets group number 0. */
I[0] = -1;
V[numChar] = 0;
/* Each transformed symbol already covers numSymbolAggregated input symbols. */
numSortedPos = numSymbolAggregated;
}
/* Repeat passes until I[0] drops below -numChar, i.e. until the combined
sorted run starting at slot 0 spans all numChar+1 slots. */
while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
i = 0;
negatedSortedGroupLength = 0;
do {
s = I[i];
if (s < 0) {
i -= s; /* skip over sorted group.*/
negatedSortedGroupLength += s;
} else {
if (negatedSortedGroupLength) {
I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */
negatedSortedGroupLength = 0;
}
/* V[s] is the group number of suffix s, which is the last slot of its group;
the unsorted group therefore spans I[i..j-1]. */
j = V[s] + 1;
QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
i = j;
}
} while (i <= numChar);
if (negatedSortedGroupLength) {
/* array ends with a sorted group.*/
I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/
}
numSortedPos *= 2; /* double sorted-depth.*/
}
}
/* Fills I from the inverse array V: for every text position 0..numChar
   (inclusive), the slot of I selected by V[position] receives position + 1.
   Positions are processed in increasing order. */
void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
{
    qsint_t textPos = 0;

    while (textPos <= numChar) {
        const qsint_t slot = V[textPos];

        textPos++;
        I[slot] = textPos;   /* stored value is the 1-based text position */
    }
}
/* Sorting routine called for each unsorted group. Sorts the slots
I[lowestPos..highestPos] (suffix numbers) by the key
KEY(V, I, slot, numSortedChar) = V[I[slot] + numSortedChar], then records the
resulting sub-groups: every member of a group of equal keys gets the group's
last slot as its number in V, and a group of size one is additionally marked
sorted by storing -1 in its slot of I. The algorithm is a ternary-split
quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
function is based on Program 7. Groups of at most INSERT_SORT_NUM_ITEM entries
are handed to QSufSortInsertSortSplit instead. */
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar) {
qsint_t a, b, c, d;
qsint_t l, m;
qsint_t f, v, s, t;
qsint_t tmp;
qsint_t numItem;
numItem = highestPos - lowestPos + 1;
if (numItem <= INSERT_SORT_NUM_ITEM) {
QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
return;
}
/* v is the pivot key. */
v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
/* Partition: b scans upward and c scans downward. Keys equal to the pivot are
parked at the two ends, in [lowestPos, a) and (d, highestPos]. */
a = b = lowestPos;
c = d = highestPos;
while (1) {
while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
if (f == v) {
swap(I[a], I[b], tmp);
a++;
}
b++;
}
while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
if (f == v) {
swap(I[c], I[d], tmp);
d--;
}
c--;
}
if (b > c)
break;
swap(I[b], I[c], tmp);
b++;
c--;
}
/* Move the equal keys parked at the low end next to the smaller-than-pivot part. */
s = a - lowestPos;
t = b - a;
s = min(s, t);
for (l = lowestPos, m = b - s; m < b; l++, m++) {
swap(I[l], I[m], tmp);
}
/* Likewise move the equal keys parked at the high end down to the middle. */
s = d - c;
t = highestPos - d;
s = min(s, t);
for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
swap(I[l], I[m], tmp);
}
/* s = number of keys smaller than the pivot, t = number of keys larger. */
s = b - a;
t = d - c;
if (s > 0)
QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
// Update group number for equal portion
// (done after the left recursion and before the right recursion)
a = lowestPos + s;
b = highestPos - t;
if (a == b) {
// Sorted group
V[I[a]] = a;
I[a] = -1;
} else {
// Unsorted group
for (c=a; c<=b; c++)
V[I[c]] = b;
}
if (t > 0)
QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
}
/* Pivot selection after Bentley & McIlroy. Three trios of keys are sampled
   from I[lowestPos..highestPos] -- one anchored at the low end, one centred
   on the middle slot and one anchored at the high end -- with the samples of
   a trio spaced numItem/8 slots apart. The result is the median of the three
   trio medians. Keys are read through KEY() at depth numSortedChar. */
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
                                   const qsint_t highestPos, const qsint_t numSortedChar) {
    const qsint_t span   = highestPos - lowestPos + 1;
    const qsint_t centre = lowestPos + span / 2;
    const qsint_t stride = span / 8;
    qsint_t x, y, z;
    qsint_t lowMedian, midMedian, highMedian;

    /* trio anchored at the low end */
    x = KEY(V, I, lowestPos, numSortedChar);
    y = KEY(V, I, lowestPos + stride, numSortedChar);
    z = KEY(V, I, lowestPos + 2 * stride, numSortedChar);
    lowMedian = med3(x, y, z);

    /* trio centred on the middle slot */
    x = KEY(V, I, centre - stride, numSortedChar);
    y = KEY(V, I, centre, numSortedChar);
    z = KEY(V, I, centre + stride, numSortedChar);
    midMedian = med3(x, y, z);

    /* trio anchored at the high end */
    x = KEY(V, I, highestPos - 2 * stride, numSortedChar);
    y = KEY(V, I, highestPos - stride, numSortedChar);
    z = KEY(V, I, highestPos, numSortedChar);
    highMedian = med3(x, y, z);

    return med3(lowMedian, midMedian, highMedian);
}
/* Quadratic sort for small groups. Sorts the slots I[lowestPos..highestPos]
   by the key V[suffix + numSortedChar] using a stable insertion sort on local
   copies, then writes the result back while splitting it into sub-groups:
   every suffix gets the last slot of its run of equal keys as group number
   in V, and each run of consecutive singleton groups is marked in I by the
   negated run length stored at the run's first slot.
   The local buffers hold INSERT_SORT_NUM_ITEM entries, so the caller must not
   pass a larger range. */
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
                                    const qsint_t highestPos, const qsint_t numSortedChar)
{
    const qsint_t count = highestPos - lowestPos + 1;
    qsint_t sortedKey[INSERT_SORT_NUM_ITEM];
    qsint_t sortedPos[INSERT_SORT_NUM_ITEM];
    qsint_t k, hole;
    qsint_t runMarker;  /* < 0: minus (length of the singleton run to the right + 1); 0: inside a tie */
    qsint_t group;

    /* Snapshot the suffix numbers and their keys before V is modified. */
    for (k = 0; k < count; k++) {
        const qsint_t suffix = I[lowestPos + k];

        sortedPos[k] = suffix;
        sortedKey[k] = V[suffix + numSortedChar];
    }

    /* Stable insertion sort on (key, suffix) pairs. */
    for (k = 1; k < count; k++) {
        const qsint_t movingKey = sortedKey[k];
        const qsint_t movingPos = sortedPos[k];

        hole = k;
        while (hole > 0 && sortedKey[hole - 1] > movingKey) {
            sortedKey[hole] = sortedKey[hole - 1];
            sortedPos[hole] = sortedPos[hole - 1];
            hole--;
        }
        sortedKey[hole] = movingKey;
        sortedPos[hole] = movingPos;
    }

    /* Write back from the highest slot downwards, assigning group numbers. */
    runMarker = -1;
    group = highestPos;
    for (k = count - 1; k > 0; k--) {
        const qsint_t slot = lowestPos + k;

        I[slot] = sortedPos[k];
        V[sortedPos[k]] = group;
        if (sortedKey[k - 1] != sortedKey[k]) {
            /* A group boundary lies just below this slot. */
            if (runMarker < 0)
                I[slot] = runMarker;    /* slot is a singleton: mark it sorted */
            group = slot - 1;
            runMarker--;
        } else {
            runMarker = 0;              /* slot shares its key with the one below */
        }
    }

    /* The lowest slot is handled separately because it has no lower neighbour. */
    I[lowestPos] = sortedPos[0];
    V[sortedPos[0]] = group;
    if (runMarker < 0)
        I[lowestPos] = runMarker;
}
/* Bucketsort for first iteration.
Input: V[0...numChar-1] holds integers in the range 1...alphabetSize-1, all of
which appear at least once. V[numChar] is 0. (This is the corresponding output
of QSufSortTransform.) alphabetSize must be at most numChar+1. I is an array of
size numChar+1 whose contents are disregarded.
Output: V and I are in the state expected after the initial sorting stage of
the refined suffix sorting algorithm: I lists the positions bucket by bucket,
V[pos] is the group number of position pos (the last slot of its bucket), and
a bucket with a single member is marked sorted by -1 in its slot of I. */
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
{
qsint_t i, c;
qsint_t d;
qsint_t groupNum;
qsint_t currentIndex;
// mark linked list empty
// (I[symbol] is the head of that symbol's list, -1 meaning "no entry")
for (i=0; i<alphabetSize; i++)
I[i] = -1;
// insert to linked list
// (V[i] is reused as the "next" link: it receives the previous head of the list)
for (i=0; i<=numChar; i++) {
c = V[i];
V[i] = (qsint_t)(I[c]);
I[c] = i;
}
// Walk the symbols from largest to smallest, filling I from slot numChar downwards.
currentIndex = numChar;
for (i=alphabetSize; i>0; i--) {
c = I[i-1];
d = (qsint_t)(V[c]);
// The group number is the highest slot occupied by this symbol's bucket.
groupNum = currentIndex;
V[c] = groupNum;
if (d >= 0) {
// Bucket has more than one member: follow the links until the -1 end marker.
I[currentIndex] = c;
while (d >= 0) {
c = d;
d = V[c];
V[c] = groupNum;
currentIndex--;
I[currentIndex] = c;
}
} else {
// sorted group
I[currentIndex] = -1;
}
currentIndex--;
}
}
/* Transforms the alphabet of the text in V by aggregating several consecutive
   input symbols into one new symbol where possible, while preserving the
   suffix order of the text. The alphabet is also compacted, so that on output
   V comprises all integers of the new alphabet with no skipped numbers.

   Input:
     V                    array of numChar+1 entries; V[0...numChar-1] hold the
                          input symbols in the range
                          smallestInputSymbol...largestInputSymbol. V[numChar]
                          is overwritten.
     I                    array of numChar+1 entries used as temporary storage
                          (the old-chunk -> new-symbol translation table).
     maxNewAlphabetSize   upper bound on the value of an aggregated chunk while
                          deciding how many symbols to aggregate. The table is
                          kept in I, so this must not exceed numChar.
     numSymbolAggregated  out-parameter, set to the number of input symbols
                          packed into each new symbol.

   Output: returns the size of the new alphabet. New symbols are numbered
   1...size-1 in order of increasing chunk value, and 0 is reserved for the
   end-of-string symbol. Only V[numChar] is 0. */
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
                                 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
{
    qsint_t c, i, j;
    qsint_t a; // numSymbolAggregated
    qsint_t mask;
    qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
    qsint_t newAlphabetSize;
    qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;

    /* Input symbols are shifted to 1...maxNumInputSymbol so that 0 can stand
       for the terminator. maxNumBit is the number of bits one such symbol needs. */
    maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
    for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
    maxSymbol = QSINT_MAX >> maxNumBit;

    /* Grow the chunk one symbol at a time while its largest possible value
       still fits in qsint_t and stays within maxNewAlphabetSize. On exit
       minSymbolInChunk holds the chunk formed by the first a symbols. */
    c = maxNumInputSymbol;
    for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
        minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
        maxSymbolInChunk = c;
        c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
    }

    /* mask masks off top old symbol from chunk. The shift is done in qsint_t:
       a plain int `1 << n` is undefined once n reaches the width of int,
       which (a-1)*maxNumBit can do because qsint_t is wider than int. When no
       symbol was aggregated (a == 0) the shift count would be negative, so
       the mask is simply 0. */
    mask = (a > 0) ? (((qsint_t)1 << ((a - 1) * maxNumBit)) - 1) : 0;
    V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/

    /* bucketing possible, compact alphabet.*/
    for (i = 0; i <= maxSymbolInChunk; i++)
        I[i] = 0; /* zero transformation table.*/
    c = minSymbolInChunk;
    for (i = a; i <= numChar; i++) {
        I[c] = 1; /* mark used chunk symbol.*/
        c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
    }
    for (i = 1; i < a; i++) { /* handle last a-1 positions.*/
        I[c] = 1; /* mark used chunk symbol.*/
        c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
    }

    /* Number the used chunk values consecutively, starting at 1. */
    newAlphabetSize = 1;
    for (i = 0; i <= maxSymbolInChunk; i++) {
        if (I[i]) {
            I[i] = newAlphabetSize;
            newAlphabetSize++;
        }
    }

    /* Second pass over the text: replace every position by the new symbol of
       the chunk that starts there. V[j] is read before V[i] for i >= j is
       written because i trails j by a positions. */
    c = minSymbolInChunk;
    for (i = 0, j = a; j <= numChar; i++, j++) {
        V[i] = I[c]; /* transform to new alphabet.*/
        c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
    }
    for (; i < numChar; i++) { /* handle last a-1 positions.*/
        V[i] = I[c]; /* transform to new alphabet.*/
        c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
    }
    V[numChar] = 0; /* end-of-string symbol is zero.*/
    *numSymbolAggregated = a;
    return newAlphabetSize;
}

45
QSufSort.h 100644
View File

@ -0,0 +1,45 @@
/* QSufSort.h
Header file for QSufSort.c
This file contains an implementation of the algorithm presented in "Faster
Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
Sadakane (sada@is.s.u-tokyo.ac.jp).
This software may be used freely for any purpose. However, when distributed,
the original source must be clearly stated, and, when the source code is
distributed, the copyright notice must be retained and any alterations in
the code must be clearly marked. No warranty is given regarding the quality
of this software.
Modified by Wong Chi-Kwong, 2004
Changes summary: - Used long variable and function names
- Removed global variables
- Replace pointer references with array references
- Used insertion sort in place of selection sort and increased insertion sort threshold
- Reconstructing suffix array from inverse becomes an option
- Add handling where end-of-text symbol is not necessary < all characters
- Removed codes for supporting alphabet size > number of characters
No warrenty is given regarding the quality of the modifications.
*/
#ifndef __QSUFSORT_H__
#define __QSUFSORT_H__
#include <stdint.h>
/* Sort key of the suffix stored at slot p of I when comparing at depth h:
   the entry of V found h positions after that suffix. All arguments are
   parenthesized so that expression arguments (e.g. V + 1, or a conditional
   for h) expand with the intended grouping. */
#define KEY(V, I, p, h) ( (V)[ (I)[(p)] + (h) ] )
/* Groups of at most this many entries are sorted by insertion sort; it is also
   the size of the local buffers used by that sort. */
#define INSERT_SORT_NUM_ITEM 16
/* Integer type used for text positions, ranks and symbols, and its maximum value. */
typedef int64_t qsint_t;
#define QSINT_MAX INT64_MAX
/* Suffix-sorts the text in V (numChar symbols plus one terminator slot), using
   I as the working array; both arrays have numChar+1 entries. */
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
                        const qsint_t smallestInputSymbol, const int skipTransform);
/* Fills I from the inverse array V: I[V[i]] = i + 1 for i in 0..numChar. */
void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar);
#endif

178
README-alt.md 100644
View File

@ -0,0 +1,178 @@
## For the Impatient
```sh
# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
| bzip2 -dc | tar xf -
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
bwa.kit/bwa index hs38DH.fa # create BWA index
# mapping
bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh # skip "|sh" to show command lines
```
This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
genotypes on each gene and `out.hla.all` for other possible HLA genotypes.
Please check out [bwa/bwakit/README.md][kithelp] for details.
## Background
GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
(chromosome known but location unknown), unplaced contigs (chromosome unknown)
and ALT contigs (long clustered variations). The combination of the first three
components is called the *primary assembly*. It is recommended to use the
complete primary assembly for all analyses. Using ALT contigs in read mapping is
tricky.
GRCh38 ALT contigs total 109Mbp in length, spanning 60Mbp of the primary
assembly. However, sequences that are highly diverged from the primary assembly
only contribute a few million bp. Most subsequences of ALT contigs are nearly
identical to the primary assembly. If we align sequence reads to GRCh38+ALT
blindly, we will get many additional reads with zero mapping quality and miss
variants on them. It is crucial to make mappers aware of ALTs.
BWA-MEM is ALT-aware. It essentially computes mapping quality across the
non-redundant content of the primary assembly plus the ALT contigs and is free
of the problem above.
## Methods
### Sequence alignment
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT
contigs are present. The following picture shows an example of how BWA-MEM
infers mapping quality and reports alignment after step 2:
![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png)
#### Step 1: BWA-MEM mapping
At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
alignments and assigns mapQ following these two rules:
1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of
an ALT hit is computed across all hits.
2. If there are no non-ALT hits, the best ALT hit is output as the primary
alignment. If there are both ALT and non-ALT hits, non-ALT hits will be
primary and ALT hits will be supplementary (SAM flag 0x800).
In theory, non-ALT alignments from step 1 should be identical to alignments
against the reference genome without ALT contigs. In practice, the two types of
alignments may differ in rare cases due to seeding heuristics. When an ALT hit
is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
non-ALT hits.
If we don't care about ALT hits, we may skip postprocessing (step 2).
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
information about ALT hits.
#### Step 2: Postprocessing
Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
potential hits reported in the XA tag, lifts ALT hits to the chromosomal
positions using the ALT-to-ref alignment, groups them based on overlaps between
their lifted positions, and then re-estimates mapQ across the best scoring hit
in each group. Being aware of the ALT-to-ref alignment, this script can greatly
improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also
writes each hit overlapping the reported hit into a separate SAM line. This
enables variant calling on each ALT contig independent of others.
### On the completeness of GRCh38+ALT
While GRCh38 is much more complete than GRCh37, it is still missing some true
human sequences. To make sure every piece of sequence in the reference assembly
is correct, the [Genome Reference Consortium][grc] (GRC) requires each ALT contig
to have enough support from multiple sources before considering adding it to the
reference assembly. This careful and sophisticated procedure has left out some
sequences, one of which is [this example][novel], a 10kb contig assembled from
CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
[BLAST][blast] to see where it maps.
For a more complete reference genome, we compiled a new set of decoy sequences
from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
The sequences are included in `hs38DH-extra.fa` from the [BWA binary
package][res].
In addition to decoy, we also put multiple alleles of HLA genes in
`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
version 3.18.0 and are used to collect reads sequenced from these genes.
### HLA typing
HLA genes are known to be associated with many autoimmune diseases, infectious
diseases and drug responses. They are among the most important genes but are
rarely studied by WGS projects due to the high sequence divergence between
HLA genes and the reference genome in these regions.
By including the HLA gene regions in the reference assembly as ALT contigs, we
are able to effectively identify reads coming from these genes. We also provide
a pipeline, which is included in the [BWA binary package][res], to type the
several classic HLA genes. The pipeline is conceptually simple. It de novo
assembles sequence reads mapped to each gene, aligns exon sequences of each
allele to the assembled contigs and then finds the pairs of alleles that best
explain the contigs. In practice, however, the completeness of IMGT/HLA and
copy-number changes related to these genes are not so straightforward to
resolve. HLA typing may not always be successful. Users may also consider using
other programs for typing such as [Warren et al (2012)][hla4], [Liu et al
(2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though
most of them are distributed under restrictive licenses.
## Preliminary Evaluation
To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
(hs38DH), and called small variants from the alignment. CHM1 is haploid.
Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
true positive (TP) heterozygous calls from NA12878 are approximately equal
to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
should yield higher TP and lower FP. The following table shows the numbers for
these assemblies:
|Assembly|hs37 |hs38 |hs38DH|CHM1_1.1| huref|
|:------:|------:|------:|------:|------:|------:|
|FP | 255706| 168068| 142516|307172 | 575634|
|TP |2142260|2163113|2150844|2167235|2137053|
With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces
FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
from hs38 only and found the majority of them are associated with excessive read
depth, clustered variants or weak alignment. We believe most hs38-only calls are
problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
with nearly identical library construction, the difference is ~140k, an order
of magnitude higher than the difference between hs38 and hs38DH. ALT contigs,
decoy and HLA genes in hs38DH improve variant calling and enable the analyses of
ALT contigs and HLA typing at little cost.
## Problems and Future Development
There are some uncertainties about ALT mappings - we are not sure whether they
help biological discovery and don't know the best way to analyze them. Without
clear demand from downstream analyses, it is very difficult to design the
optimal mapping strategy. The current BWA-MEM method is just a start. If it
turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
for performance; if not, we may make changes. It is also possible that we might
make a breakthrough in the representation of multiple genomes, in which case we
can even get rid of ALT contigs for good.
[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
[sb]: https://github.com/GregoryFaust/samblaster
[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml
[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
[hlalink]: http://www.hladiseaseassociations.com
[hlatools]: https://www.biostars.org/p/93245/
[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
[hla3]: http://www.biomedcentral.com/1471-2164/15/325
[hla4]: http://genomemedicine.com/content/4/12/95
[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit

196
README.md
View File

@ -1,3 +1,195 @@
# hyb-align
[![Build Status](https://github.com/lh3/bwa/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/bwa/actions)
[![SourceForge Downloads](https://img.shields.io/sourceforge/dt/bio-bwa.svg?label=SF%20downloads)](https://sourceforge.net/projects/bio-bwa/files/?source=navbar)
[![GitHub Downloads](https://img.shields.io/github/downloads/lh3/bwa/total.svg?style=flat&label=GitHub%20downloads)](https://github.com/lh3/bwa/releases)
[![BioConda Install](https://img.shields.io/conda/dn/bioconda/bwa.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/bwa)
提出基于hybrid-index的seeding算法提升bwa-mem的seeding阶段的性能
**Note: [minimap2][minimap2] has replaced BWA-MEM for __PacBio and Nanopore__ read
alignment.** It retains all major BWA-MEM features, but is ~50 times as fast,
more versatile, more accurate and produces better base-level alignment.
[BWA-MEM2][bwa-mem2] is 50-100% faster than BWA-MEM and outputs identical alignments.
[minimap2]: https://github.com/lh3/minimap2
[bwa-mem2]: https://github.com/bwa-mem2/bwa-mem2
## Getting started
git clone https://github.com/lh3/bwa.git
cd bwa; make
./bwa index ref.fa
./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz
## Introduction
BWA is a software package for mapping DNA sequences against a large reference
genome, such as the human genome. It consists of three algorithms:
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
sequence reads up to 100bp, while the other two are for longer sequences ranging from
70bp to a few megabases. BWA-MEM and BWA-SW share similar features such as the
support of long reads and chimeric alignment, but BWA-MEM, which is the latest,
is generally recommended as it is faster and more accurate. BWA-MEM also has
better performance than BWA-backtrack for 70-100bp Illumina reads.
For all the algorithms, BWA first needs to construct the FM-index for the
reference genome (the **index** command). Alignment algorithms are invoked with
different sub-commands: **aln/samse/sampe** for BWA-backtrack,
**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm.
## Availability
BWA is released under [GPLv3][1]. The latest source code is [freely
available at github][2]. Released packages can [be downloaded][3] at
SourceForge. After you acquire the source code, simply use `make` to compile
and copy the single executable `bwa` to the destination you want. The only
dependency required to build BWA is [zlib][14].
Since 0.7.11, precompiled binary for x86\_64-linux is available in [bwakit][17].
In addition to BWA, this self-consistent package also comes with bwa-associated
and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs,
adapter trimming, duplicate marking, HLA typing and associated data files.
## Seeking help
The detailed usage is described in the man page available together with the
source code. You can use `man ./bwa.1` to view the man page in a terminal. The
[HTML version][4] of the man page can be found at the [BWA website][5]. If you
have questions about BWA, you may [sign up for the mailing list][6] and then send
the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions
in forums such as [BioStar][8] and [SEQanswers][9].
## Citing BWA
* Li H. and Durbin R. (2009) Fast and accurate short read alignment with
Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID:
[19451168][10]]. (if you use the BWA-backtrack algorithm)
* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID:
[20080505][11]]. (if you use the BWA-SW algorithm)
* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs
with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM
algorithm or the **fastmap** command, or want to cite the whole BWA package)
Please note that the last reference is a preprint hosted at [arXiv.org][13]. I
do not plan to submit it to a peer-reviewed journal in the near future.
## Frequently asked questions (FAQs)
1. [What types of data does BWA work with?](#type)
2. [Why does a read appear multiple times in the output SAM?](#multihit)
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
4. [Why can one read in a pair have a high mapping quality but the other has zero?](#pe0)
5. [How can a BWA-backtrack alignment stand out of the end of a chromosome?](#endref)
6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)
8. [Why does BWA use a lot of memory?](#largemem)
#### <a name="type"></a>1. What types of data does BWA work with?
BWA works with a variety of types of DNA sequence data, though the optimal
algorithm and setting may vary. The following list gives the recommended
settings:
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
contigs up to a few megabases mapped to a closely related reference genome:
bwa mem ref.fa reads.fq > aln.sam
* Illumina single-end reads shorter than ~70bp:
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
* Illumina/454/IonTorrent paired-end reads longer than ~70bp:
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
* Illumina paired-end reads shorter than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
* PacBio subreads or Oxford Nanopore reads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam
bwa mem -x ont2d ref.fa reads.fq > aln.sam
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small.
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
reads with a sequencing error rate over 20%.
#### <a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene
fusion or a long deletion, a read bridging the break point may have two hits,
occupying two lines in the SAM output. With the default setting of BWA-MEM, one
and only one line is primary and is soft clipped; other lines are tagged with
0x800 SAM flag (supplementary alignment) and are hard clipped.
#### <a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?
Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over
4GB. However, an individual chromosome should not be longer than 2GB.
#### <a name="pe0"></a>4. Why can one read in a pair have a high mapping quality but the other has zero?
This is correct. Mapping quality is assigned for individual read, not for a read
pair. It is possible that one read can be mapped unambiguously, but its mate
falls in a tandem repeat and thus its accurate position cannot be determined.
#### <a name="endref"></a>5. How can a BWA-backtrack alignment stand out of the end of a chromosome?
Internally BWA concatenates all reference sequences into one long sequence. A
read may be mapped to the junction of two adjacent reference sequences. In this
case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
as well. BWA-MEM does not have this problem.
#### <a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
see [README-alt.md][18] for details. Briefly, it is recommended to use
[bwakit][17], the binary release of BWA, for generating the reference genome
and for mapping.
#### <a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
without post-processing. The alignments produced this way are very close to
alignments against GRCh38 without ALT contigs. Nonetheless, applying
post-processing helps to reduce false mappings caused by reads from the
diverged part of ALT contigs and also enables HLA typing. It is recommended to
run the post-processing script.
#### <a name="largemem"></a>8. Why does BWA use a lot of memory?
This is typically caused by FASTQ generated from a coordinate-sorted BAM.
BWA uses a lot more memory for centromeric reads than for unique reads.
In a FASTQ file generated from a sequencing run, centromeric reads are rare in each batch and rarely cause troubles.
However, in a coordinate-sorted FASTQ file, a whole batch could consist of centromeric reads.
Such a batch will take a lot more memory and time to map; the insert size estimate will be distorted as well.
General rule: ***NEVER*** use Picard SamToFastq on coordinate-sorted BAM;
use samtools [collate+fastq][remap] instead.
[remap]: https://lh3.github.io/2021/07/06/remapping-an-aligned-bam
[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
[2]: https://github.com/lh3/bwa
[3]: http://sourceforge.net/projects/bio-bwa/files/
[4]: http://bio-bwa.sourceforge.net/bwa.shtml
[5]: http://bio-bwa.sourceforge.net/
[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help
[7]: mailto:bio-bwa-help@sourceforge.net
[8]: http://biostars.org
[9]: http://seqanswers.com/
[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168
[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505
[12]: http://arxiv.org/abs/1303.3997
[13]: http://arxiv.org/
[14]: http://zlib.net/
[15]: https://github.com/lh3/bwa/tree/mem
[16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/
[17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/
[18]: https://github.com/lh3/bwa/blob/master/README-alt.md

210
bamlite.c 100644
View File

@ -0,0 +1,210 @@
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "bamlite.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/*********************
* from bam_endian.c *
*********************/
/* Returns 1 when the first byte of a multi-byte integer holding the value 1
 * is zero (big-endian host), and 0 otherwise. */
static inline int bam_is_big_endian()
{
	const long probe = 1;
	const char *first_byte = (const char *)&probe;
	return first_byte[0] == 0;
}
/* Returns v with its two bytes exchanged. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
	const uint16_t low_to_high = (uint16_t)((v & 0xFFU) << 8);
	const uint16_t high_to_low = (uint16_t)(v >> 8);
	return (uint16_t)(low_to_high | high_to_low);
}
/*
 * Reverses, in place, the two bytes that x points to and returns x.
 *
 * The bytes are exchanged through an unsigned char pointer instead of being
 * loaded as a uint16_t, so x may sit at any byte offset (values inside a
 * packed record are not necessarily 2-byte aligned) and no object is
 * accessed through an incompatible pointer type.
 */
static inline void *bam_swap_endian_2p(void *x)
{
	unsigned char *bytes = (unsigned char *)x;
	const unsigned char first = bytes[0];
	bytes[0] = bytes[1];
	bytes[1] = first;
	return x;
}
/* Returns v with its four bytes in reverse order. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
	const uint32_t b0 = (v & 0x000000FFU) << 24;
	const uint32_t b1 = (v & 0x0000FF00U) << 8;
	const uint32_t b2 = (v & 0x00FF0000U) >> 8;
	const uint32_t b3 = (v & 0xFF000000U) >> 24;
	return b0 | b1 | b2 | b3;
}
/*
 * Reverses, in place, the four bytes that x points to and returns x.
 *
 * Works byte by byte so that x may be unaligned (CIGAR words and auxiliary
 * values live at arbitrary offsets inside a record) and may point at an
 * object of any type without violating aliasing rules.
 */
static inline void *bam_swap_endian_4p(void *x)
{
	unsigned char *bytes = (unsigned char *)x;
	unsigned char held;
	held = bytes[0]; bytes[0] = bytes[3]; bytes[3] = held;
	held = bytes[1]; bytes[1] = bytes[2]; bytes[2] = held;
	return x;
}
/* Returns v with its eight bytes in reverse order. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
	uint64_t reversed = 0;
	int byte_no;
	/* Peel bytes off the low end of v and push them onto the low end of
	 * the result, which reverses their order. */
	for (byte_no = 0; byte_no < 8; ++byte_no) {
		reversed = (reversed << 8) | (v & 0xFFU);
		v >>= 8;
	}
	return reversed;
}
/*
 * Reverses, in place, the eight bytes that x points to and returns x.
 *
 * The swap is done through an unsigned char pointer so that x does not have
 * to be 8-byte aligned and no uint64_t access is made to memory of another
 * type.
 */
static inline void *bam_swap_endian_8p(void *x)
{
	unsigned char *bytes = (unsigned char *)x;
	int lo, hi;
	for (lo = 0, hi = 7; lo < hi; ++lo, --hi) {
		const unsigned char held = bytes[lo];
		bytes[lo] = bytes[hi];
		bytes[hi] = held;
	}
	return x;
}
/**************
* from bam.c *
**************/
/* Host endianness flag (non-zero on a big-endian host); refreshed on every
 * call to bam_header_init(). */
int bam_is_be;

/* Allocates a zero-filled header and records the host endianness in
 * bam_is_be. Returns whatever calloc() returned. */
bam_header_t *bam_header_init()
{
	bam_header_t *fresh = (bam_header_t*)calloc(1, sizeof(bam_header_t));
	bam_is_be = bam_is_big_endian();
	return fresh;
}
/*
 * Releases a header and everything it owns: each target name, the name and
 * length tables, the header text and the header itself. A NULL header is
 * accepted and ignored.
 */
void bam_header_destroy(bam_header_t *header)
{
	int32_t i;
	if (header == 0) return;
	if (header->target_name) {
		for (i = 0; i < header->n_targets; ++i)
			free(header->target_name[i]);
		free(header->target_name);
	}
	/* Freed independently of target_name: a header whose name table is
	 * NULL must not leak its length table. */
	free(header->target_len);
	free(header->text);
	free(header);
}
/*
 * Reads a BAM header from fp: the 4-byte magic "BAM\1", the header text,
 * and the table of reference names and lengths.
 *
 * Returns a newly allocated header (release it with bam_header_destroy()),
 * or NULL when the magic does not match, the stream is truncated, a length
 * field is invalid, or an allocation fails.
 */
bam_header_t *bam_header_read(bamFile fp)
{
	bam_header_t *header;
	char buf[4];
	int magic_len;
	int32_t i, name_len, l_text;
	// read "BAM1"
	magic_len = bam_read(fp, buf, 4);
	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
		return NULL;
	}
	header = bam_header_init();
	if (header == NULL) return NULL;
	// read plain text and the number of reference sequences
	// The on-disk length is 4 bytes wide while header->l_text is a size_t,
	// so it is read into a 32-bit temporary instead of the wider field.
	if (bam_read(fp, &l_text, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&l_text);
	if (l_text < 0) goto fail;
	header->l_text = (size_t)l_text;
	header->text = (char*)calloc(header->l_text + 1, 1); // +1 keeps the text NUL-terminated
	if (header->text == NULL) goto fail;
	if (bam_read(fp, header->text, l_text) != l_text) goto fail;
	if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
	if (header->n_targets < 0) goto fail;
	// read reference sequence names and lengths
	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
	if (header->n_targets > 0 && (header->target_name == NULL || header->target_len == NULL)) goto fail;
	for (i = 0; i != header->n_targets; ++i) {
		if (bam_read(fp, &name_len, 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&name_len);
		if (name_len <= 0) goto fail;
		// one spare zero byte so the name is terminated even if the file omits the NUL
		header->target_name[i] = (char*)calloc((size_t)name_len + 1, 1);
		if (header->target_name[i] == NULL) goto fail;
		if (bam_read(fp, header->target_name[i], name_len) != name_len) {
			goto fail;
		}
		if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
	}
	return header;
fail:
	bam_header_destroy(header);
	return NULL;
}
/*
 * Byte-swaps, in place, the multi-byte values stored in one record's
 * variable-length data buffer.
 *
 * c        - core of the record; supplies l_qname, n_cigar and l_qseq
 * data_len - number of valid bytes in data
 * data     - the record's data buffer
 *
 * The n_cigar 32-bit CIGAR words that start l_qname bytes into data are
 * swapped first. The walk then starts after the sequence and quality bytes
 * and visits each auxiliary field (2-byte key, 1-byte type code, value)
 * until data + data_len is reached, swapping 2-, 4- and 8-byte values.
 *
 * NOTE(review): there is no branch for a 'B' type code and the walk does not
 * re-check the bound after skipping the key and type - confirm that inputs
 * cannot contain array-typed or truncated auxiliary fields.
 */
static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
{
uint8_t *s;
uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
// first byte after qname, cigar, packed sequence and qualities
s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
while (s < data + data_len) {
uint8_t type;
s += 2; // skip key
// toupper() folds the lower-case and upper-case codes of the same width together
type = toupper(*s); ++s; // skip type
if (type == 'C' || type == 'A') ++s;
else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
// NUL-terminated value: skip to the terminator and step over it
else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
}
}
/*
 * Reads the next alignment record from fp into b, growing b->data when the
 * record does not fit in the current buffer.
 *
 * Returns 4 + block_len (the number of bytes consumed) on success, or
 *   -1 at a clean end of file,
 *   -2 when the 4-byte block length is truncated,
 *   -3 when the fixed-size core cannot be read,
 *   -4 when the block length is smaller than the core, the data buffer
 *      cannot be grown, or the variable-length data cannot be read in full.
 */
int bam_read1(bamFile fp, bam1_t *b)
{
	bam1_core_t *c = &b->core;
	int32_t block_len, ret, i;
	uint32_t x[8];
	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
		if (ret == 0) return -1; // normal end-of-file
		else return -2; // truncated
	}
	if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3;
	if (bam_is_be) {
		bam_swap_endian_4p(&block_len);
		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
	}
	// A block shorter than the core would make data_len negative, which the
	// read below would reinterpret as a huge unsigned length.
	if (block_len < (int32_t)sizeof(bam1_core_t)) return -4;
	c->tid = x[0]; c->pos = x[1];
	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
	c->l_qseq = x[4];
	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
	b->data_len = block_len - (int32_t)sizeof(bam1_core_t);
	if (b->m_data < b->data_len) {
		int32_t new_cap = b->data_len;
		uint8_t *new_data;
		// round up to a power of two unless that would overflow a signed int
		if (new_cap <= 0x40000000) kroundup32(new_cap);
		new_data = (uint8_t*)realloc(b->data, new_cap);
		if (new_data == NULL) return -4; // b->data is left untouched and still owned by b
		b->data = new_data;
		b->m_data = new_cap;
	}
	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
	return 4 + block_len;
}
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
// Versions of gzopen, gzread and gzclose that print error messages on failure
/*
 * Opens fn with zlib and reports failures on stderr.
 *
 * The name "-" selects standard input when mode contains 'r' and standard
 * output otherwise. Returns the gzFile, or a null handle on failure (after
 * printing a message).
 */
gzFile bamlite_gzopen(const char *fn, const char *mode) {
	gzFile fp;
	if (strcmp(fn, "-") == 0) {
		const int reading = strstr(mode, "r") != 0;
		fp = gzdopen(fileno(reading ? stdin : stdout), mode);
		if (!fp) {
			/* newline added so this message ends its line like the one below */
			fprintf(stderr, "Couldn't open %s : %s\n",
					reading ? "stdin" : "stdout",
					strerror(errno));
		}
		return fp;
	}
	if ((fp = gzopen(fn, mode)) == 0) {
		/* errno is reported when it is set; otherwise the failure is described as out of memory */
		fprintf(stderr, "Couldn't open %s : %s\n", fn,
				errno ? strerror(errno) : "Out of memory");
	}
	return fp;
}
/*
 * gzread() wrapper: reads up to len bytes into ptr and returns gzread()'s
 * result unchanged. When that result is negative, a description of the
 * failure is written to stderr first.
 */
int bamlite_gzread(gzFile file, void *ptr, unsigned int len) {
	const int n_read = gzread(file, ptr, len);
	if (n_read >= 0) return n_read;
	{
		int zerr = 0;
		const char *zmsg = gzerror(file, &zerr);
		/* Z_ERRNO means the cause is in errno rather than in zlib's own message */
		fprintf(stderr, "gzread error: %s\n",
				zerr == Z_ERRNO ? strerror(errno) : zmsg);
	}
	return n_read;
}
/*
 * gzclose() wrapper: closes file and returns gzclose()'s status unchanged,
 * printing a message to stderr when the status is not Z_OK.
 */
int bamlite_gzclose(gzFile file) {
	const int status = gzclose(file);
	if (status == Z_OK) return status;
	fprintf(stderr, "gzclose error: %s\n",
			status == Z_ERRNO ? strerror(errno) : zError(status));
	return status;
}
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */

114
bamlite.h 100644
View File

@ -0,0 +1,114 @@
#ifndef BAMLITE_H_
#define BAMLITE_H_
#include <stdint.h>
#include <zlib.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
#define USE_VERBOSE_ZLIB_WRAPPERS
/* A BAM stream is a zlib gzFile handle. */
typedef gzFile bamFile;
/*
 * I/O entry points used by the BAM reader. With USE_VERBOSE_ZLIB_WRAPPERS
 * defined, open/close/read go through the bamlite_gz* wrappers; otherwise
 * they map directly onto the zlib functions. bam_dopen is plain gzdopen in
 * both configurations.
 */
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
/* These print error messages on failure */
# define bam_open(fn, mode) bamlite_gzopen(fn, mode)
# define bam_dopen(fd, mode) gzdopen(fd, mode)
# define bam_close(fp) bamlite_gzclose(fp)
# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size)
#else
# define bam_open(fn, mode) gzopen(fn, mode)
# define bam_dopen(fd, mode) gzdopen(fd, mode)
# define bam_close(fp) gzclose(fp)
# define bam_read(fp, buf, size) gzread(fp, buf, size)
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
/* In-memory BAM header: the reference sequence table plus the header text. */
typedef struct {
int32_t n_targets; /* number of entries in target_name and target_len */
char **target_name; /* one heap-allocated name per reference sequence */
uint32_t *target_len; /* one length per reference sequence */
/* l_text: number of text bytes read from the file (the text buffer holds
 * one extra zero byte). n_text: not referenced by the reader in bamlite.c -
 * TODO confirm its purpose. */
size_t l_text, n_text;
char *text; /* header text */
} bam_header_t;
/* Bit masks tested against bam1_core_t.flag (see bam1_strand/bam1_mstrand).
 * NOTE(review): the names follow the SAM flag bits - confirm the meaning of
 * each bit against the SAM specification. */
#define BAM_FPAIRED 1
#define BAM_FPROPER_PAIR 2
#define BAM_FUNMAP 4
#define BAM_FMUNMAP 8
#define BAM_FREVERSE 16
#define BAM_FMREVERSE 32
#define BAM_FREAD1 64
#define BAM_FREAD2 128
#define BAM_FSECONDARY 256
#define BAM_FQCFAIL 512
#define BAM_FDUP 1024
/* CIGAR word layout: BAM_CIGAR_MASK selects the low BAM_CIGAR_SHIFT bits.
 * Presumably a word is (length << BAM_CIGAR_SHIFT) | operation, with the
 * BAM_C* values below as operation codes - confirm against callers. */
#define BAM_CIGAR_SHIFT 4
#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
#define BAM_CMATCH 0
#define BAM_CINS 1
#define BAM_CDEL 2
#define BAM_CREF_SKIP 3
#define BAM_CSOFT_CLIP 4
#define BAM_CHARD_CLIP 5
#define BAM_CPAD 6
/*
 * Fixed-size part of an alignment record. bam_read1() reads
 * sizeof(bam1_core_t) bytes as eight 32-bit words and unpacks them into
 * these fields in declaration order.
 * NOTE(review): field meanings beyond what the accessor macros show follow
 * the BAM record layout - confirm against the SAM/BAM specification.
 */
typedef struct {
int32_t tid; /* word 0 */
int32_t pos; /* word 1 */
/* word 2: bin in the high 16 bits, qual in the next 8, l_qname (number of
 * bytes the read name occupies at the start of the data buffer) in the low 8 */
uint32_t bin:16, qual:8, l_qname:8;
/* word 3: flag in the high 16 bits, n_cigar (number of 32-bit CIGAR words)
 * in the low 16 */
uint32_t flag:16, n_cigar:16;
int32_t l_qseq; /* word 4: the record stores (l_qseq+1)/2 packed sequence bytes and l_qseq quality bytes */
int32_t mtid; /* word 5 */
int32_t mpos; /* word 6 */
int32_t isize; /* word 7 */
} bam1_core_t;
/* One alignment record: the fixed core plus a reusable variable-length buffer. */
typedef struct {
bam1_core_t core;
/* l_aux: bytes of data left after qname, cigar, sequence and qualities;
 * data_len: bytes of data holding the current record;
 * m_data: allocated capacity of data */
int l_aux, data_len, m_data;
uint8_t *data; /* heap buffer, grown by bam_read1() and freed by bam_destroy1() */
} bam1_t;
/* Rounds the 32-bit lvalue x up to the next power of two, in place.
 * The argument is evaluated several times. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/*
 * Accessors into a record's data buffer. As the offsets below show, the
 * buffer is laid out as: read name (l_qname bytes), CIGAR (n_cigar 32-bit
 * words), packed sequence ((l_qseq+1)/2 bytes), qualities (l_qseq bytes),
 * then auxiliary data.
 */
/* non-zero when the BAM_FREVERSE / BAM_FMREVERSE bit is set in the flag */
#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
#define bam1_qname(b) ((char*)((b)->data))
#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
/* 4-bit code of base i in packed sequence s: even i is the high nibble of
 * byte i/2, odd i the low nibble */
#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
/* allocate a zero-filled record / free a record together with its data buffer */
#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
#define bam_destroy1(b) do { \
if (b) { free((b)->data); free(b); } \
} while (0)
/* Non-zero on a big-endian host; assigned by bam_header_init(). */
extern int bam_is_be;
#ifdef __cplusplus
extern "C" {
#endif
/* Allocates a zero-filled header and records host endianness in bam_is_be. */
bam_header_t *bam_header_init(void);
/* Frees a header and the memory it owns; NULL is ignored. */
void bam_header_destroy(bam_header_t *header);
/* Reads the BAM magic, header text and reference table from fp.
 * Returns NULL on failure. */
bam_header_t *bam_header_read(bamFile fp);
/* Reads one alignment record into b. Returns the number of bytes consumed,
 * or a negative value at end of file (-1) or on error. */
int bam_read1(bamFile fp, bam1_t *b);
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
/* zlib wrappers that print a message to stderr when the call fails */
gzFile bamlite_gzopen(const char *fn, const char *mode);
int bamlite_gzread(gzFile file, void *ptr, unsigned int len);
int bamlite_gzclose(gzFile file);
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
#ifdef __cplusplus
}
#endif
#endif

451
bntseq.c 100644
View File

@ -0,0 +1,451 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <unistd.h>
#include <errno.h>
#include "bntseq.h"
#include "utils.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
#include "khash.h"
KHASH_MAP_INIT_STR(str, int)
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/*
 * Lookup table from a byte value to a nucleotide code:
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3, '-' -> 5,
 * and every other byte -> 4. Index it with an unsigned byte value (0..255).
 */
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
/*
 * Writes the sequence annotations to "<prefix>.ann" and the table of
 * ambiguous-base holes to "<prefix>.amb".
 *
 * .ann: a header line "l_pac n_seqs seed", then two lines per sequence:
 *       "gi name[ anno]" and "offset len n_ambs".
 * .amb: a header line "l_pac n_seqs n_holes", then one "offset len amb"
 *       line per hole.
 *
 * The file-name buffer is sized from prefix, so a prefix of any length is
 * handled. The process exits if that buffer cannot be allocated.
 */
void bns_dump(const bntseq_t *bns, const char *prefix)
{
	char *str;
	FILE *fp;
	int i;
	str = (char*)malloc(strlen(prefix) + 5); // prefix + ".ann"/".amb" + NUL
	if (str == 0) {
		fprintf(stderr, "[E::%s] failed to allocate the output file name\n", __func__);
		exit(1);
	}
	{ // dump .ann
		strcpy(str, prefix); strcat(str, ".ann");
		fp = xopen(str, "w");
		err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
		for (i = 0; i != bns->n_seqs; ++i) {
			bntann1_t *p = bns->anns + i;
			err_fprintf(fp, "%d %s", p->gi, p->name);
			if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno);
			else err_fprintf(fp, "\n");
			err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
		}
		err_fflush(fp);
		err_fclose(fp);
	}
	{ // dump .amb
		strcpy(str, prefix); strcat(str, ".amb");
		fp = xopen(str, "w");
		err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
		for (i = 0; i != bns->n_holes; ++i) {
			bntamb1_t *p = bns->ambs + i;
			err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
		}
		err_fflush(fp);
		err_fclose(fp);
	}
	free(str);
}
/*
 * Rebuilds a bntseq_t from its three index files.
 *
 * ann_filename - annotation file: header "l_pac n_seqs seed", then per
 *                sequence "gi name[ comment]" and "offset len n_ambs"
 * amb_filename - hole file: header "l_pac n_seqs n_holes", then one
 *                "offset len amb" line per hole
 * pac_filename - packed sequence file; opened for binary reading and left
 *                open in bns->fp_pac
 *
 * Returns the new structure. A read or parse error is reported through
 * err_fatal(). String fields are read with an explicit width so that an
 * over-long token in a damaged index cannot overrun the local buffer, and
 * negative sequence or hole counts are treated as parse errors.
 */
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{
	char str[8192]; // the "%8191s" conversions below must stay one less than this size
	FILE *fp;
	const char *fname;
	bntseq_t *bns;
	long long xx;
	int i;
	int scanres;
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	{ // read .ann
		fp = xopen(fname = ann_filename, "r");
		scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
		if (scanres != 3 || bns->n_seqs < 0) goto badread;
		bns->l_pac = xx;
		bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
		for (i = 0; i < bns->n_seqs; ++i) {
			bntann1_t *p = bns->anns + i;
			char *q = str;
			int c;
			// read gi and sequence name
			scanres = fscanf(fp, "%u%8191s", &p->gi, str);
			if (scanres != 2) goto badread;
			p->name = strdup(str);
			// read fasta comments
			while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
			while (c != '\n' && c != EOF) c = fgetc(fp); // discard what did not fit
			if (c == EOF) {
				scanres = EOF;
				goto badread;
			}
			*q = 0;
			if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
			else p->anno = strdup("");
			// read the rest
			scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
			if (scanres != 3) goto badread;
			p->offset = xx;
		}
		err_fclose(fp);
	}
	{ // read .amb
		int64_t l_pac;
		int32_t n_seqs;
		fp = xopen(fname = amb_filename, "r");
		scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
		if (scanres != 3 || bns->n_holes < 0) goto badread;
		l_pac = xx;
		xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
		bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0;
		for (i = 0; i < bns->n_holes; ++i) {
			bntamb1_t *p = bns->ambs + i;
			scanres = fscanf(fp, "%lld%d%8191s", &xx, &p->len, str);
			if (scanres != 3) goto badread;
			p->offset = xx;
			p->amb = str[0]; // only the first character of the token is kept
		}
		err_fclose(fp);
	}
	{ // open .pac
		bns->fp_pac = xopen(pac_filename, "rb");
	}
	return bns;
badread:
	if (EOF == scanres) {
		err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file");
	}
	err_fatal(__func__, "Parse error reading %s\n", fname);
}
/*
 * Loads "<prefix>.ann", "<prefix>.amb" and "<prefix>.pac" through
 * bns_restore_core() and, when "<prefix>.alt" exists, sets is_alt on every
 * sequence whose name appears as the first field of a line in that file.
 * Lines starting with '@' are ignored.
 *
 * File names are built in a buffer sized from prefix, so a prefix of any
 * length is handled. A final .alt line without a terminating newline is
 * processed like any other line.
 */
bntseq_t *bns_restore(const char *prefix)
{
	const size_t fn_size = strlen(prefix) + 5; // prefix + 4-character extension + NUL
	char *fn_buf, *ann_filename, *amb_filename, *pac_filename, *alt_filename;
	FILE *fp;
	bntseq_t *bns;
	fn_buf = (char*)malloc(4 * fn_size);
	if (fn_buf == 0) {
		fprintf(stderr, "[E::%s] failed to allocate the index file names. Abort!\n", __func__);
		exit(1);
	}
	ann_filename = fn_buf;
	amb_filename = ann_filename + fn_size;
	pac_filename = amb_filename + fn_size;
	alt_filename = pac_filename + fn_size;
	strcat(strcpy(ann_filename, prefix), ".ann");
	strcat(strcpy(amb_filename, prefix), ".amb");
	strcat(strcpy(pac_filename, prefix), ".pac");
	strcat(strcpy(alt_filename, prefix), ".alt");
	bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
	if (bns == 0) {
		free(fn_buf);
		return 0;
	}
	if ((fp = fopen(alt_filename, "r")) != 0) { // read .alt file if present
		char str[1024];
		khash_t(str) *h;
		int c, i, absent;
		khint_t k;
		// map each sequence name to its index in bns->anns
		h = kh_init(str);
		for (i = 0; i < bns->n_seqs; ++i) {
			k = kh_put(str, h, bns->anns[i].name, &absent);
			kh_val(h, k) = i;
		}
		i = 0;
		do {
			c = fgetc(fp);
			if (c == '\t' || c == '\n' || c == '\r' || c == EOF) {
				// end of the first field; at EOF only a pending name is looked up
				if (c != EOF || i > 0) {
					str[i] = 0;
					if (str[0] != '@') {
						k = kh_get(str, h, str);
						if (k != kh_end(h))
							bns->anns[kh_val(h, k)].is_alt = 1;
					}
				}
				while (c != '\n' && c != EOF) c = fgetc(fp); // skip the rest of the line
				i = 0;
			} else {
				if (i >= (int)sizeof(str) - 1) { // keep room for the terminating NUL
					fprintf(stderr, "[E::%s] sequence name longer than 1023 characters. Abort!\n", __func__);
					exit(1);
				}
				str[i++] = c;
			}
		} while (c != EOF);
		kh_destroy(str, h);
		fclose(fp);
	}
	free(fn_buf);
	return bns;
}
/*
 * Releases a bntseq_t: closes the .pac stream if one is open, then frees the
 * hole table, every sequence's name and annotation, the annotation table and
 * the structure itself. A null pointer is ignored.
 */
void bns_destroy(bntseq_t *bns)
{
	int idx;
	if (bns == 0) return;
	if (bns->fp_pac) err_fclose(bns->fp_pac);
	free(bns->ambs);
	idx = 0;
	while (idx < bns->n_seqs) {
		bntann1_t *ann = &bns->anns[idx++];
		free(ann->name);
		free(ann->anno);
	}
	free(bns->anns);
	free(bns);
}
/*
 * 2-bit packed sequence access: base l lives in byte l>>2, with l%4 == 0 in
 * the two most significant bits and l%4 == 3 in the two least significant.
 * _set_pac ORs the code c in, so the target bits must be zero beforehand;
 * _get_pac extracts the 2-bit code. Both evaluate l more than once.
 */
#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
/*
 * Appends one FASTA record to the collection being built.
 *
 * seq     - the record to add
 * bns     - receives a new annotation entry and any new holes; l_pac,
 *           n_seqs and n_holes are advanced
 * pac     - 2-bit packed buffer; grown (doubled) when full
 * m_pac   - in/out capacity of pac, in bases
 * m_seqs  - in/out capacity of bns->anns, in entries
 * m_holes - in/out capacity of bns->ambs, in entries
 * q       - in/out pointer to the most recently opened hole
 *
 * Returns the (possibly reallocated) pac buffer. Characters that are not
 * A/C/G/T are recorded as holes (a run of the same character extends the
 * current hole) and stored in pac as a random base.
 */
static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
{
	bntann1_t *p;
	int i, lasts;
	if (bns->n_seqs == *m_seqs) {
		*m_seqs <<= 1;
		bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
	}
	p = bns->anns + bns->n_seqs;
	p->name = strdup((char*)seq->name.s);
	p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
	p->gi = 0; p->len = seq->seq.l;
	p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
	p->n_ambs = 0;
	for (i = lasts = 0; i < seq->seq.l; ++i) {
		// Convert through unsigned char: where plain char is signed, a byte
		// >= 0x80 would otherwise become a negative table index.
		const int base = (unsigned char)seq->seq.s[i];
		int c = nst_nt4_table[base];
		if (c >= 4) { // N
			if (lasts == base) { // contiguous N
				++(*q)->len;
			} else {
				if (bns->n_holes == *m_holes) {
					(*m_holes) <<= 1;
					bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
				}
				*q = bns->ambs + bns->n_holes;
				(*q)->len = 1;
				(*q)->offset = p->offset + i;
				(*q)->amb = seq->seq.s[i];
				++p->n_ambs;
				++bns->n_holes;
			}
		}
		lasts = base;
		{ // fill buffer
			if (c >= 4) c = lrand48()&3; // ambiguous bases are stored as a random nucleotide
			if (bns->l_pac == *m_pac) { // double the pac size
				*m_pac <<= 1;
				pac = realloc(pac, *m_pac/4);
				// _set_pac ORs bits in, so the newly added half must start zeroed
				memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
			}
			_set_pac(pac, bns->l_pac, c);
			++bns->l_pac;
		}
	}
	++bns->n_seqs;
	return pac;
}
/*
 * Converts a FASTA stream into the packed representation.
 *
 * fp_fa    - input FASTA stream
 * prefix   - output prefix; "<prefix>.pac" is written here, and the
 *            annotation and hole files are written through bns_dump()
 * for_only - when zero, the reverse complement of the whole sequence is
 *            appended after the forward strand
 *
 * Returns the number of bases written to the .pac file.
 *
 * The .pac file name is built in a buffer sized from prefix, so a prefix of
 * any length is handled. The process exits if that buffer cannot be
 * allocated.
 */
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
	kseq_t *seq;
	char *name;
	bntseq_t *bns;
	uint8_t *pac = 0;
	int32_t m_seqs, m_holes;
	int64_t ret = -1, m_pac, l;
	bntamb1_t *q;
	FILE *fp;
	// initialization
	seq = kseq_init(fp_fa);
	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
	bns->seed = 11; // fixed seed for random generator
	srand48(bns->seed);
	m_seqs = m_holes = 8; m_pac = 0x10000;
	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
	pac = calloc(m_pac/4, 1);
	q = bns->ambs;
	name = (char*)malloc(strlen(prefix) + 5); // prefix + ".pac" + NUL
	if (name == 0) {
		fprintf(stderr, "[E::%s] failed to allocate the output file name\n", __func__);
		exit(1);
	}
	strcpy(name, prefix); strcat(name, ".pac");
	fp = xopen(name, "wb");
	free(name);
	// read sequences
	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
	if (!for_only) { // add the reverse complemented sequence
		int64_t ll_pac = (bns->l_pac * 2 + 3) / 4 * 4;
		if (ll_pac > m_pac) pac = realloc(pac, ll_pac/4);
		// zero the bytes past the forward strand before bits are ORed in
		memset(pac + (bns->l_pac+3)/4, 0, (ll_pac - (bns->l_pac+3)/4*4) / 4);
		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
	}
	ret = bns->l_pac;
	{ // finalize .pac file
		ubyte_t ct;
		err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
		// the following codes make the pac file size always (l_pac/4+1+1)
		if (bns->l_pac % 4 == 0) {
			ct = 0;
			err_fwrite(&ct, 1, 1, fp);
		}
		ct = bns->l_pac % 4;
		err_fwrite(&ct, 1, 1, fp);
		// close .pac file
		err_fflush(fp);
		err_fclose(fp);
	}
	bns_dump(bns, prefix);
	bns_destroy(bns);
	kseq_destroy(seq);
	free(pac);
	return ret;
}
/*
 * Command-line entry point for "fa2pac": packs <in.fasta> into index files
 * named after <out.prefix>, or after the input file when no prefix is given.
 * Option -f keeps the forward strand only.
 * Returns 1 after printing usage when no input is given, 0 otherwise.
 */
int bwa_fa2pac(int argc, char *argv[])
{
	int opt, forward_only = 0;
	const char *out_prefix;
	gzFile fa;
	while ((opt = getopt(argc, argv, "f")) >= 0) {
		if (opt == 'f') forward_only = 1;
	}
	if (optind == argc) {
		fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
		return 1;
	}
	fa = xzopen(argv[optind], "r");
	out_prefix = (optind + 1 < argc) ? argv[optind + 1] : argv[optind];
	bns_fasta2bntseq(fa, out_prefix, forward_only);
	err_gzclose(fa);
	return 0;
}
/*
 * Maps a forward-strand coordinate to the index of the sequence that
 * contains it, by binary search over the sequence start offsets.
 * Returns -1 when pos_f is at or beyond l_pac.
 */
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
{
	int lo = 0, hi = bns->n_seqs, rid = 0;
	if (pos_f >= bns->l_pac) return -1;
	while (lo < hi) {
		rid = (lo + hi) >> 1;
		if (pos_f < bns->anns[rid].offset) { // target lies in an earlier sequence
			hi = rid;
			continue;
		}
		// the last sequence, or pos_f falls before the next sequence's start
		if (rid == bns->n_seqs - 1 || pos_f < bns->anns[rid + 1].offset) break;
		lo = rid + 1;
	}
	return rid;
}
/*
 * Returns the index of the sequence containing the interval [rb, re).
 * Returns -2 when the interval starts before l_pac and ends after it, and
 * -1 when its two ends fall in different sequences.
 */
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
{
	int strand, first_rid, last_rid;
	if (rb < bns->l_pac && re > bns->l_pac) return -2;
	assert(rb <= re);
	first_rid = bns_pos2rid(bns, bns_depos(bns, rb, &strand));
	last_rid = first_rid;
	// an empty interval has only one end to look up
	if (rb < re) last_rid = bns_pos2rid(bns, bns_depos(bns, re - 1, &strand));
	if (first_rid != last_rid) return -1;
	return first_rid;
}
/*
 * Counts ambiguous bases in [pos_f, pos_f + len) using the hole found by a
 * binary search over the hole table; the search stops at the first hole
 * that overlaps the interval. When ref_id is non-null it receives the
 * sequence index of pos_f.
 */
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
{
	int lo = 0, hi = bns->n_holes, probe, n_amb = 0;
	if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
	while (lo < hi) {
		int64_t hole_beg, hole_end;
		probe = (lo + hi) >> 1;
		hole_beg = bns->ambs[probe].offset;
		hole_end = bns->ambs[probe].offset + bns->ambs[probe].len;
		if (pos_f >= hole_end) {
			lo = probe + 1;
			continue;
		}
		if (pos_f + len <= hole_beg) {
			hi = probe;
			continue;
		}
		// overlap: take the part of the hole that lies inside the interval
		if (pos_f >= hole_beg) {
			n_amb += hole_end < pos_f + len? hole_end - pos_f : len;
		} else {
			n_amb += hole_end < pos_f + len?
				bns->ambs[probe].len : len - (hole_beg - pos_f);
		}
		break;
	}
	return n_amb;
}
/*
 * Extracts bases [beg, end) from the packed sequence into a newly
 * malloc'ed array of 2-bit codes, one per byte.
 *
 * Coordinates at or above l_pac address the reverse strand and yield the
 * complement read backwards from the forward strand. beg and end are
 * swapped if reversed, and clamped to [0, 2*l_pac]. An interval that spans
 * l_pac yields NULL with *len set to 0; otherwise *len is end - beg.
 */
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
	const int64_t two_l = l_pac<<1;
	uint8_t *out;
	int64_t pos, n = 0;
	if (end < beg) { // put the ends in order
		const int64_t held = beg;
		beg = end;
		end = held;
	}
	if (end > two_l) end = two_l;
	if (beg < 0) beg = 0;
	if (beg < l_pac && end > l_pac) { // bridging the forward-reverse boundary
		*len = 0;
		return 0;
	}
	*len = end - beg;
	out = malloc(end - beg);
	if (beg >= l_pac) { // reverse strand
		const int64_t stop_f = two_l - 1 - end;
		for (pos = two_l - 1 - beg; pos > stop_f; --pos)
			out[n++] = 3 - _get_pac(pac, pos);
	} else { // forward strand
		for (pos = beg; pos < end; ++pos)
			out[n++] = _get_pac(pac, pos);
	}
	return out;
}
/*
 * Fetches the bases of [*beg, *end) after clipping the interval to the
 * sequence that contains mid.
 *
 * On return *beg and *end hold the clipped interval, *rid the index of that
 * sequence, and the result is a newly allocated array from bns_get_seq().
 * Asserts that *beg <= mid < *end and that the fetch returned the expected
 * number of bases.
 */
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid)
{
	int64_t ctg_beg, ctg_end, got;
	int on_rev;
	uint8_t *seq;
	if (*end < *beg) { // put the ends in order
		const int64_t held = *beg;
		*beg = *end;
		*end = held;
	}
	assert(*beg <= mid && mid < *end);
	*rid = bns_pos2rid(bns, bns_depos(bns, mid, &on_rev));
	ctg_beg = bns->anns[*rid].offset;
	ctg_end = ctg_beg + bns->anns[*rid].len;
	if (on_rev) { // mirror the sequence bounds onto the reverse strand
		const int64_t fwd_beg = ctg_beg;
		ctg_beg = (bns->l_pac<<1) - ctg_end;
		ctg_end = (bns->l_pac<<1) - fwd_beg;
	}
	if (*beg < ctg_beg) *beg = ctg_beg;
	if (*end > ctg_end) *end = ctg_end;
	seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &got);
	if (seq == 0 || *end - *beg != got) {
		fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n",
				__func__, (long)*beg, (long)mid, (long)*end, (long)got, seq, *rid, (long)ctg_beg, (long)ctg_end);
	}
	assert(seq && *end - *beg == got); // assertion failure should never happen
	return seq;
}

92
bntseq.h 100644
View File

@ -0,0 +1,92 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef BWT_BNTSEQ_H
#define BWT_BNTSEQ_H
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
#ifndef BWA_UBYTE
#define BWA_UBYTE
typedef uint8_t ubyte_t;
#endif
/* Annotation record for one reference sequence. */
typedef struct {
int64_t offset; // start of this sequence in forward-strand coordinates (bns_fetch_seq uses [offset, offset+len))
int32_t len; // length of this sequence
int32_t n_ambs; // presumably the number of ambiguous-base entries in this sequence -- TODO confirm
uint32_t gi; // NOTE(review): meaning not visible here; looks like a numeric sequence identifier -- confirm
int32_t is_alt; // presumably non-zero for ALT contigs -- confirm against the code that sets it
char *name, *anno; // sequence name and annotation strings; ownership not visible here
} bntann1_t;
/* One interval [offset, offset+len) that bns_cnt_ambi() counts as ambiguous. */
typedef struct {
int64_t offset; // start of the interval in forward-strand coordinates
int32_t len; // length of the interval
char amb; // presumably the ambiguous character found in the input -- TODO confirm
} bntamb1_t;
/* Description of a reference: its sequences, ambiguous intervals and total length. */
typedef struct {
int64_t l_pac; // forward-strand length; positions >= l_pac denote the reverse strand (see bns_depos)
int32_t n_seqs; // number of entries in anns
uint32_t seed; // NOTE(review): use not visible here; presumably a random seed -- confirm
bntann1_t *anns; // n_seqs elements
int32_t n_holes; // number of entries in ambs
bntamb1_t *ambs; // n_holes elements
FILE *fp_pac; // presumably the open handle of the packed-sequence file -- confirm
} bntseq_t;
// 256-entry lookup table indexed by byte value; presumably maps nucleotide characters to codes -- confirm at its definition
extern unsigned char nst_nt4_table[256];
#ifdef __cplusplus
extern "C" {
#endif
// The definitions of the next seven functions are not part of this header; see bntseq.c for their contracts.
void bns_dump(const bntseq_t *bns, const char *prefix);
bntseq_t *bns_restore(const char *prefix);
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
// Return a malloc'ed array with the bases of [beg, end), one byte per base, and store its length in *len.
// Returns NULL with *len == 0 when the interval bridges the forward/reverse boundary at l_pac.
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
// Like bns_get_seq(), but first clips [*beg, *end) to the reference sequence containing mid;
// updates *beg and *end to the clipped interval and stores that sequence's index in *rid.
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
#ifdef __cplusplus
}
#endif
/*
 * Convert a position in the combined forward+reverse coordinate system to a
 * forward-strand position.  *is_rev is set to 1 when pos lies at or beyond
 * bns->l_pac (reverse strand) and to 0 otherwise.
 */
static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev)
{
	if (pos >= bns->l_pac) {
		*is_rev = 1;
		return (bns->l_pac<<1) - 1 - pos; // mirror around the strand boundary
	}
	*is_rev = 0;
	return pos;
}
#endif

871
bwa.1 100644
View File

@ -0,0 +1,871 @@
.TH bwa 1 "22 March 2025" "bwa-0.7.19-r1273" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
.SH SYNOPSIS
.PP
bwa index ref.fa
.PP
bwa mem ref.fa reads.fq > aln-se.sam
.PP
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
.PP
bwa aln ref.fa short_read.fq > aln_sa.sai
.PP
bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
.PP
bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
.PP
bwa bwasw ref.fa long_read.fq > aln.sam
.SH DESCRIPTION
.PP
BWA is a software package for mapping low-divergent sequences against a large
reference genome, such as the human genome. It consists of three algorithms:
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
sequence reads up to 100bp, while the other two are for longer sequences ranging from
70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
support and split alignment, but BWA-MEM, which is the latest, is generally
recommended for high-quality queries as it is faster and more accurate.
BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
reads.
For all the algorithms, BWA first needs to construct the FM-index for
the reference genome (the
.B index
command). Alignment algorithms are invoked with different sub-commands:
.BR aln / samse / sampe
for BWA-backtrack,
.B bwasw
for BWA-SW and
.B mem
for the BWA-MEM algorithm.
.SH COMMANDS AND OPTIONS
.TP
.B index
.B bwa index
.RB [ -p
.IR prefix ]
.RB [ -a
.IR algoType ]
.I db.fa
Index database sequences in the FASTA format.
.B OPTIONS:
.RS
.TP 10
.BI -p \ STR
Prefix of the output database [same as db filename]
.TP
.BI -a \ STR
Algorithm for constructing BWT index. BWA implements three algorithms for BWT
construction:
.BR is ,
.B bwtsw
and
.BR rb2 .
The first algorithm is a little faster for small database but requires large
RAM and does not work for databases with total length longer than 2GB. The
second algorithm is adapted from the BWT-SW source code. It in theory works
with database with trillions of bases. When this option is not specified, the
appropriate algorithm will be chosen automatically.
.RE
.TP
.B mem
.B bwa mem
.RB [ -aCHjMpP ]
.RB [ -t
.IR nThreads ]
.RB [ -k
.IR minSeedLen ]
.RB [ -w
.IR bandWidth ]
.RB [ -d
.IR zDropoff ]
.RB [ -r
.IR seedSplitRatio ]
.RB [ -c
.IR maxOcc ]
.RB [ -D
.IR chainShadow ]
.RB [ -m
.IR maxMateSW ]
.RB [ -W
.IR minSeedMatch ]
.RB [ -A
.IR matchScore ]
.RB [ -B
.IR mmPenalty ]
.RB [ -O
.IR gapOpenPen ]
.RB [ -E
.IR gapExtPen ]
.RB [ -L
.IR clipPen ]
.RB [ -U
.IR unpairPen ]
.RB [ -x
.IR readType ]
.RB [ -R
.IR RGline ]
.RB [ -H
.IR HDlines ]
.RB [ -v
.IR verboseLevel ]
.I db.prefix
.I reads.fq
.RI [ mates.fq ]
Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
algorithm works by seeding alignments with maximal exact matches (MEMs) and
then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
If
.I mates.fq
file is absent and option
.B -p
is not set, this command regards input reads as single-end. If
.I mates.fq
is present, this command assumes the
.IR i -th
read in
.I reads.fq
and the
.IR i -th
read in
.I mates.fq
constitute a read pair. If
.B -p
is used, the command assumes the
.RI 2 i -th
and the
.RI (2 i +1)-th
read in
.I reads.fq
constitute a read pair (such input file is said to be interleaved). In this case,
.I mates.fq
is ignored. In the paired-end mode, the
.B mem
command will infer the read orientation and the insert size distribution from a
batch of reads.
The BWA-MEM algorithm performs local alignment. It may produce multiple primary
alignments for different part of a query sequence. This is a crucial feature
for long sequences. However, some tools may not work with split alignments.
One may consider using option
.B -M
to flag shorter split hits as secondary.
.RS
.TP 10
.B ALGORITHM OPTIONS:
.TP
.BI -t \ INT
Number of threads [1]
.TP
.BI -k \ INT
Minimum seed length. Matches shorter than
.I INT
will be missed. The alignment speed is usually insensitive to this value unless
it significantly deviates from 20. [19]
.TP
.BI -w \ INT
Band width. Essentially, gaps longer than
.I INT
will not be found. Note that the maximum gap length is also affected by the
scoring matrix and the hit length, not solely determined by this option. [100]
.TP
.BI -d \ INT
Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between
the best and the current extension score is above
.RI | i - j |* A + INT ,
where
.I i
and
.I j
are the current positions of the query and reference, respectively, and
.I A
is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it
doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not
only avoids unnecessary extension, but also reduces poor alignments inside a
long good alignment. [100]
.TP
.BI -r \ FLOAT
Trigger re-seeding for a MEM longer than
.IR minSeedLen * FLOAT .
This is a key heuristic parameter for tuning the performance. Larger value
yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
.TP
.BI -c \ INT
Discard a MEM if it has more than
.I INT
occurrence in the genome. This is an insensitive parameter. [500]
.TP
.BI -D \ FLOAT
Drop chains shorter than
.I FLOAT
fraction of the longest overlapping chain [0.5]
.TP
.BI -m \ INT
Perform at most
.I INT
rounds of mate-SW [50]
.TP
.BI -W \ INT
Drop a chain if the number of bases in seeds is smaller than
.IR INT .
This option is primarily used for longer contigs/reads. When positive, it also
affects seed filtering. [0]
.TP
.B -P
In the paired-end mode, perform SW to rescue missing hits only but do not try to find
hits that fit a proper pair.
.TP
.B SCORING OPTIONS:
.TP
.BI -A \ INT
Matching score. [1]
.TP
.BI -B \ INT
Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
.TP
.BI -O \ INT[,INT]
Gap open penalty. If two numbers are specified, the first is the penalty of
opening a deletion and the second for opening an insertion. [6]
.TP
.BI -E \ INT[,INT]
Gap extension penalty. If two numbers are specified, the first is the penalty
of extending a deletion and second for extending an insertion. A gap of length
k costs O + k*E (i.e.
.B -O
is for opening a zero-length gap). [1]
.TP
.BI -L \ INT[,INT]
Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
score reaching the end of query. If this score is larger than the best SW score
minus the clipping penalty, clipping will not be applied. Note that in this
case, the SAM AS tag reports the best SW score; clipping penalty is not
deducted. If two numbers are provided, the first is for 5'-end clipping and
second for 3'-end clipping. [5]
.TP
.BI -U \ INT
Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
.RI scoreRead1+scoreRead2- INT
and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
two scores to determine whether we should force pairing. A larger value leads to
more aggressive read pair. [17]
.TP
.BI -x \ STR
Read type. Changes multiple parameters unless overridden [null]
.RS
.TP 10
.BR pacbio :
.B -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0
(PacBio reads to ref)
.TP
.BR ont2d :
.B -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0
(Oxford Nanopore 2D-reads to ref)
.TP
.BR intractg :
.B -B9 -O16 -L5
(intra-species contigs to ref)
.RE
.TP
.B INPUT/OUTPUT OPTIONS:
.TP
.B -p
Smart pairing. If two adjacent reads have the same name, they are considered
to form a read pair. This way, paired-end and single-end reads can be mixed
in a single FASTA/Q stream.
.TP
.BI -R \ STR
Complete read group header line. '\\t' can be used in
.I STR
and will be converted to a TAB in the output SAM. The read group ID will be
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
[null]
.TP
.BI -H \ ARG
If ARG starts with @, it is interpreted as a string and gets inserted into the
output SAM header; otherwise, ARG is interpreted as a file with all lines
starting with @ in the file inserted into the SAM header. [null]
.TP
.BI -o \ FILE
Write the output SAM file to
.IR FILE .
For compatibility with other BWA commands, this option may also be given as
.B -f
.IR FILE .
[standard output]
.TP
.B -q
Don't reduce the mapping quality of split alignment of lower alignment score.
.TP
.B -5
For split alignment, mark the segment with the smallest coordinate as the
primary. It automatically applies option
.B -q
as well. This option may help some Hi-C pipelines. By default, BWA-MEM marks
highest scoring segment as primary.
.TP
.BI -K \ INT
Process
.I INT
input bases in each batch regardless of the number of threads in use
.RI [10000000* nThreads ].
By default, the batch size is proportional to the number of threads in use.
Because the inferred insert size distribution slightly depends on the batch
size, using different number of threads may produce different output.
Specifying this option helps reproducibility.
.TP
.BI -T \ INT
Don't output alignment with score lower than
.IR INT .
This option affects output and occasionally SAM flag 2. [30]
.TP
.BI -j
Treat ALT contigs as part of the primary assembly (i.e. ignore the
.I db.prefix.alt
file).
.TP
.BI -h \ INT[,INT2]
If a query has not more than
.I INT
hits with score higher than 80% of the best hit, output them all in the XA tag.
If
.I INT2
is specified, BWA-MEM outputs up to
.I INT2
hits if the list contains a hit to an ALT contig. [5,200]
.TP
.B -a
Output all found alignments for single-end or unpaired paired-end reads. These
alignments will be flagged as secondary alignments.
.TP
.B -C
Append FASTA/Q comment to SAM output. This option can be used to
transfer read meta information (e.g. barcode) to the SAM output. Note that the
FASTA/Q comment (the string after a space in the header line) must conform to the SAM
spec (e.g. BC:Z:CGTAC). Malformed comments lead to incorrect SAM output.
.TP
.B -Y
Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM
uses soft clipping for the primary alignment and hard clipping for
supplementary alignments.
.TP
.B -M
Mark shorter split hits as secondary
.TP
.BI -v \ INT
Control the verbosity level of the output. This option has not been fully
supported throughout BWA. Ideally, a value 0 for disabling all the output to
stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
all normal messages; 4 or higher for debugging. When this option takes value
4, the output is not SAM. [3]
.TP
.BI -I \ FLOAT[,FLOAT[,INT[,INT]]]
Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma
from the mean if absent) and min (4 sigma if absent) of the insert size
distribution. Only applicable to the FR orientation. By default, BWA-MEM infers
these numbers and the pair orientations given enough reads. [inferred]
.RE
.TP
.B aln
bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
[-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> >
<out.sai>
Find the SA coordinates of the input reads. Maximum
.I maxSeedDiff
differences are allowed in the first
.I seedLen
subsequence and maximum
.I maxDiff
differences are allowed in the whole sequence.
.B OPTIONS:
.RS
.TP 10
.BI -n \ NUM
Maximum edit distance if the value is INT, or the fraction of missing
alignments given 2% uniform base error rate if FLOAT. In the latter
case, the maximum edit distance is automatically chosen for different
read lengths. [0.04]
.TP
.BI -o \ INT
Maximum number of gap opens [1]
.TP
.BI -e \ INT
Maximum number of gap extensions, -1 for k-difference mode (disallowing
long gaps) [-1]
.TP
.BI -d \ INT
Disallow a long deletion within INT bp towards the 3'-end [16]
.TP
.BI -i \ INT
Disallow an indel within INT bp towards the ends [5]
.TP
.BI -l \ INT
Take the first INT subsequence as seed. If INT is larger than the query
sequence, seeding will be disabled. For long reads, this option is
typically ranged from 25 to 35 for `-k 2'. [inf]
.TP
.BI -k \ INT
Maximum edit distance in the seed [2]
.TP
.BI -t \ INT
Number of threads (multi-threading mode) [1]
.TP
.BI -M \ INT
Mismatch penalty. BWA will not search for suboptimal hits with a score
lower than (bestScore-misMsc). [3]
.TP
.BI -O \ INT
Gap open penalty [11]
.TP
.BI -E \ INT
Gap extension penalty [4]
.TP
.BI -R \ INT
Proceed with suboptimal alignments if there are no more than INT equally
best hits. This option only affects paired-end mapping. Increasing this
threshold helps to improve the pairing accuracy at the cost of speed,
especially for short reads (~32bp).
.TP
.B -c
Reverse query but not complement it, which is required for alignment in
the color space. (Disabled since 0.6.x)
.TP
.B -N
Disable iterative search. All hits with no more than
.I maxDiff
differences will be found. This mode is much slower than the default.
.TP
.BI -q \ INT
Parameter for read trimming. BWA trims a read down to
argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original
read length. [0]
.TP
.B -I
The input is in the Illumina 1.3+ read format (quality equals ASCII-64).
.TP
.BI -B \ INT
Length of barcode starting from the 5'-end. When
.I INT
is positive, the barcode of each read will be trimmed before mapping and will
be written at the
.B BC
SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]
.TP
.B -b
Specify the input read sequence file is the BAM format. For paired-end
data, two ends in a pair must be grouped together and options
.B -1
or
.B -2
are usually applied to specify which end should be mapped. Typical
command lines for mapping pair-end data in the BAM format are:
bwa aln ref.fa -b1 reads.bam > 1.sai
bwa aln ref.fa -b2 reads.bam > 2.sai
bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam
.TP
.B -0
When
.B -b
is specified, only use single-end reads in mapping.
.TP
.B -1
When
.B -b
is specified, only use the first read in a read pair in mapping (skip
single-end reads and the second reads).
.TP
.B -2
When
.B -b
is specified, only use the second read in a read pair in mapping.
.B
.RE
.TP
.B samse
bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
Generate alignments in the SAM format given single-end reads. Repetitive
hits will be randomly chosen.
.B OPTIONS:
.RS
.TP 10
.BI -n \ INT
Maximum number of alignments to output in the XA tag for reads paired
properly. If a read has more than INT hits, the XA tag will not be
written. [3]
.TP
.BI -r \ STR
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
.RE
.TP
.B sampe
bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis]
[-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
Generate alignments in the SAM format given paired-end reads. Repetitive
read pairs will be placed randomly.
.B OPTIONS:
.RS
.TP 8
.BI -a \ INT
Maximum insert size for a read pair to be considered being mapped
properly. Since 0.4.5, this option is only used when there are not
enough good alignment to infer the distribution of insert sizes. [500]
.TP
.BI -o \ INT
Maximum occurrences of a read for pairing. A read with more occurrences
will be treated as a single-end read. Reducing this parameter helps
faster pairing. [100000]
.TP
.B -P
Load the entire FM-index into memory to reduce disk operations
(base-space reads only). With this option, at least 1.25N bytes of
memory are required, where N is the length of the genome.
.TP
.BI -n \ INT
Maximum number of alignments to output in the XA tag for reads paired
properly. If a read has more than INT hits, the XA tag will not be
written. [3]
.TP
.BI -N \ INT
Maximum number of alignments to output in the XA tag for discordant
read pairs (excluding singletons). If a read has more than INT hits, the
XA tag will not be written. [10]
.TP
.BI -r \ STR
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
.RE
.TP
.B bwasw
bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N
nHspRev] [-c thresCoef] <in.db.fasta> <in.fq> [mate.fq]
Align query sequences in the
.I in.fq
file. When
.I mate.fq
is present, perform paired-end alignment. The paired-end mode only works
for reads from Illumina short-insert libraries. In the paired-end mode, BWA-SW
may still output split alignments but they are all marked as not properly
paired; the mate positions will not be written if the mate has multiple
local hits.
.B OPTIONS:
.RS
.TP 10
.BI -a \ INT
Score of a match [1]
.TP
.BI -b \ INT
Mismatch penalty [3]
.TP
.BI -q \ INT
Gap open penalty [5]
.TP
.BI -r \ INT
Gap extension penalty. The penalty for a contiguous gap of size k is
q+k*r. [2]
.TP
.BI -t \ INT
Number of threads in the multi-threading mode [1]
.TP
.BI -w \ INT
Band width in the banded alignment [33]
.TP
.BI -T \ INT
Minimum score threshold divided by a [37]
.TP
.BI -c \ FLOAT
Coefficient for threshold adjustment according to query length. Given an
l-long query, the threshold for a hit to be retained is
a*max{T,c*log(l)}. [5.5]
.TP
.BI -z \ INT
Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]
.TP
.BI -s \ INT
Maximum SA interval size for initiating a seed. Higher -s increases
accuracy at the cost of speed. [3]
.TP
.BI -N \ INT
Minimum number of seeds supporting the resultant alignment to skip
reverse alignment. [5]
.RE
.SH SAM ALIGNMENT FORMAT
.PP
The output of the
.B `aln'
command is binary and designed for BWA use only. BWA outputs the final
alignment in the SAM (Sequence Alignment/Map) format. Each line consists
of:
.TS
center box;
cb | cb | cb
n | l | l .
Col Field Description
_
1 QNAME Query (pair) NAME
2 FLAG bitwise FLAG
3 RNAME Reference sequence NAME
4 POS 1-based leftmost POSition/coordinate of clipped sequence
5 MAPQ MAPping Quality (Phred-scaled)
6 CIGAR extended CIGAR string
7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
8 MPOS 1-based Mate POSition
9 ISIZE Inferred insert SIZE
10 SEQ query SEQuence on the same strand as the reference
11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
.TE
.PP
Each bit in the FLAG field is defined as:
.TS
center box;
cb | cb | cb
c | l | l .
Chr Flag Description
_
p 0x0001 the read is paired in sequencing
P 0x0002 the read is mapped in a proper pair
u 0x0004 the query sequence itself is unmapped
U 0x0008 the mate is unmapped
r 0x0010 strand of the query (1 for reverse)
R 0x0020 strand of the mate
1 0x0040 the read is the first read in a pair
2 0x0080 the read is the second read in a pair
s 0x0100 the alignment is not primary
f 0x0200 QC failure
d 0x0400 optical or PCR duplicate
S 0x0800 supplementary alignment
.TE
.PP
Please check <http://samtools.sourceforge.net> for the format
specification and the tools for post-processing the alignment.
BWA generates the following optional fields. Tags starting with `X' are
specific to BWA.
.TS
center box;
cb | cb
cB | l .
Tag Meaning
_
NM Edit distance
MD Mismatching positions/bases
AS Alignment score
BC Barcode sequence
SA Supplementary alignments
_
X0 Number of best hits
X1 Number of suboptimal hits found by BWA
XN Number of ambiguous bases in the reference
XM Number of mismatches in the alignment
XO Number of gap opens
XG Number of gap extensions
XT Type: Unique/Repeat/N/Mate-sw
XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/
_
XS Suboptimal alignment score
XF Support from forward/reverse alignment
XE Number of supporting seeds
.TE
.PP
Note that XO and XG are generated by BWT search while the CIGAR string
by Smith-Waterman alignment. These two tags may be inconsistent with the
CIGAR string. This is not a bug.
.SH NOTES ON SHORT-READ ALIGNMENT
.SS Alignment Accuracy
.PP
When seeding is disabled, BWA guarantees to find an alignment
containing maximum
.I maxDiff
differences including
.I maxGapO
gap opens which do not occur within
.I nIndelEnd
bp towards either end of the query. Longer gaps may be found if
.I maxGapE
is positive, but it is not guaranteed to find all hits. When seeding is
enabled, BWA further requires that the first
.I seedLen
subsequence contains no more than
.I maxSeedDiff
differences.
.PP
When gapped alignment is disabled, BWA is expected to generate the same
alignment as Eland version 1, the Illumina alignment program. However, as BWA
changes `N' in the database sequence to random nucleotides, hits to these
random sequences will also be counted. As a consequence, BWA may mark a
unique hit as a repeat, if the random sequences happen to be identical
to the sequences which should be unique in the database.
.PP
By default, if the best hit is not highly repetitive (controlled by -R), BWA
also finds all hits containing one more mismatch; otherwise, BWA finds all
equally best hits only. Base quality is NOT considered in evaluating
hits. In the paired-end mode, BWA pairs all hits it found. It further
performs Smith-Waterman alignment for unmapped reads to rescue reads with a
high error rate, and for high-quality anomalous pairs to fix potential alignment
errors.
.SS Estimating Insert Size Distribution
.PP
BWA estimates the insert size distribution per 256*1024 read pairs. It
first collects pairs of reads with both ends mapped with a single-end
quality 20 or higher and then calculates median (Q2), lower and higher
quartile (Q1 and Q3). It estimates the mean and the variance of the
insert size distribution from pairs whose insert sizes are within
interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
considered to be properly paired (SAM flag 0x2) is calculated by solving
equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the
standard error of the insert size distribution, L is the length of the
genome, p0 is prior of anomalous pair and Phi() is the standard
cumulative distribution function. For mapping Illumina short-insert
reads to the human genome, x is about 6-7 sigma away from the
mean. Quartiles, mean, variance and x will be printed to the standard
error output.
.SS Memory Requirement
.PP
With bwtsw algorithm, 5GB memory is required for indexing the complete
human genome sequences. For short reads, the
.B aln
command uses ~3.2GB memory and the
.B sampe
command uses ~5.4GB.
.SS Speed
.PP
Indexing the human genome sequences takes 3 hours with bwtsw
algorithm. Indexing smaller genomes with IS algorithms is
faster, but requires more memory.
.PP
The speed of alignment is largely determined by the error rate of the query
sequences (r). Firstly, BWA runs much faster for near perfect hits than
for hits with many differences, and it stops searching for a hit with
l+2 differences if a l-difference hit is found. This means BWA will be
very slow if r is high because in this case BWA has to visit hits with
many differences and looking for these hits is expensive. Secondly, the
alignment algorithm behind makes the speed sensitive to [k log(N)/m],
where k is the maximum allowed differences, N the size of database and m
the length of a query. In practice, we choose k w.r.t. r and therefore r
is the leading factor. I would not recommend using BWA on data with
r>0.02.
.PP
Pairing is slower for shorter reads. This is mainly because shorter
reads have more spurious hits and converting SA coordinates to
chromosomal coordinates are very costly.
.SH CHANGES IN BWA-0.6
.PP
Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
This feature makes it possible to integrate the forward and reverse complemented
genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff,
BWA uses more memory because it has to keep all positions and ranks in 64-bit
integers, twice larger than 32-bit integers used in the previous versions.
The latest BWA-SW also works for paired-end reads longer than 100bp. In
comparison to BWA-short, BWA-SW tends to be more accurate for highly unique
reads and more robust to relative long INDELs and structural variants.
Nonetheless, BWA-short usually has higher power to distinguish the optimal hit
from many suboptimal hits. The choice of the mapping algorithm may depend on
the application.
.SH SEE ALSO
BWA website <http://bio-bwa.sourceforge.net>, Samtools website
<http://samtools.sourceforge.net>
.SH AUTHOR
Heng Li at the Sanger Institute wrote the key source codes and
integrated the following codes for BWT construction: bwtsw
<http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong at
the University of Hong Kong and IS
<http://yuta.256.googlepages.com/sais> originally proposed by Nong Ge
<http://www.cs.sysu.edu.cn/nong/> at the Sun Yat-Sen University and
implemented by Yuta Mori.
.SH LICENSE AND CITATION
.PP
The full BWA package is distributed under GPLv3 as it uses source codes
from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
libraries are distributed under the MIT license.
.PP
If you use the BWA-backtrack algorithm, please cite the following
paper:
.PP
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
.PP
If you use the BWA-SW algorithm, please cite:
.PP
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
.PP
If you use BWA-MEM or the fastmap component of BWA, please cite:
.PP
Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with
BWA-MEM. arXiv:1303.3997v1 [q-bio.GN].
.PP
It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed
journal.
.SH HISTORY
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
and mimics its binary file formats; BWA-SW resembles BWT-SW in several
ways. The initial idea about BWT-based alignment also came from the
group who developed BWT-SW. At the same time, BWA is different enough
from BWT-SW. The short-read alignment algorithm bears no similarity to
Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it
introduces heuristics that can hardly be applied to the original
algorithm. In all, BWA does not guarantee to find all local hits as what
BWT-SW is designed to do, but it is much faster than BWT-SW on both
short and long query sequences.
I started to write the first piece of codes on 24 May 2008 and got the
initial stable version on 02 June 2008. During this period, I was
acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper,
was collaborating with Beijing Genomics Institute on SOAP2, the successor
to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in
November 2008. According to the SourceForge download page, the third
BWT-based short read aligner, bowtie, was first released in August
2008. At the time of writing this manual, at least three more BWT-based
short-read aligners are being implemented.
The BWA-SW algorithm is a new component of BWA. It was conceived in
November 2008 and implemented ten months later.
The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
matches (SMEMs), which was first published with the fermi assembler paper
in 2012. I first implemented the basic SMEM algorithm in the
.B fastmap
command for an experiment and then extended the basic algorithm and added the
extension part in February 2013 to make BWA-MEM a fully featured mapper.

502
bwa.c 100644
View File

@ -0,0 +1,502 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <string.h>
#include <stdio.h>
#include <zlib.h>
#include <assert.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
#include "utils.h"
#include "kstring.h"
#include "kvec.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
// Global verbosity level, initialised to 3; presumably the value controlled by the -v option -- confirm
int bwa_verbose = 3;
// Global debug setting, initialised to 0; how its value is interpreted is not visible here
int bwa_dbg = 0;
// Fixed 256-byte buffer; presumably holds the read group ID taken from the -R header line -- confirm
char bwa_rg_id[256];
// NOTE(review): presumably the @PG header text; allocation and ownership are not visible here
char *bwa_pg;
/************************
* Batch FASTA/Q reader *
************************/
#include "kseq.h"
KSEQ_DECLARE(gzFile)
/*
 * Strip a trailing "/<digit>" read-number suffix (e.g. "/1", "/2") from a
 * name in place.  Nothing is changed unless the string is longer than two
 * characters and ends in '/' followed by a decimal digit.
 */
static inline void trim_readno(kstring_t *s)
{
	// <ctype.h> functions require a value representable as unsigned char (or EOF);
	// passing a plain char that is negative (bytes >= 0x80) is undefined behaviour.
	if (s->l > 2 && s->s[s->l-2] == '/' && isdigit((unsigned char)s->s[s->l-1]))
		s->l -= 2, s->s[s->l] = 0;
}
/*
 * Return a NUL-terminated malloc'ed copy of the first str->l bytes of str->s.
 * An empty string is copied only when dupempty is non-zero; otherwise NULL
 * is returned.  NULL is also returned when the allocation fails.
 */
static inline char *dupkstring(const kstring_t *str, int dupempty)
{
	char *copy;

	if (str->l == 0 && !dupempty)
		return NULL; // caller does not want an empty string duplicated
	copy = malloc(str->l + 1);
	if (copy == NULL)
		return NULL;
	memcpy(copy, str->s, str->l);
	copy[str->l] = '\0';
	return copy;
}
// Copy one kseq record into a bseq1_t. Name and sequence are always duplicated (even
// when empty); comment and quality are duplicated only when non-empty, otherwise NULL.
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
	s->l_seq = ks->seq.l;
	// mandatory fields
	s->name = dupkstring(&ks->name, 1);
	s->seq = dupkstring(&ks->seq, 1);
	// optional fields
	s->comment = dupkstring(&ks->comment, 0);
	s->qual = dupkstring(&ks->qual, 0);
}
// Read one batch of sequences from one or two kseq streams.
//   chunk_size  stop once the summed sequence length of the batch reaches this value
//   n_          output: number of records stored in the returned array
//   ks1_, ks2_  kseq_t* handles passed as void*; ks2_ may be NULL (single input stream)
// Returns a realloc-grown array of *n_ records, or 0 when nothing could be read.
// With two streams the records are interleaved: one from ks1_, then one from ks2_.
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
{
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
int size = 0, m, n;
bseq1_t *seqs;
m = n = 0; seqs = 0;
while (kseq_read(ks) >= 0) {
// the record just read from ks is not stored if its mate is missing
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
break;
}
// grow the array geometrically, starting at 256 entries
if (n >= m) {
m = m? m<<1 : 256;
seqs = realloc(seqs, m * sizeof(bseq1_t));
}
trim_readno(&ks->name);
kseq2bseq1(ks, &seqs[n]);
seqs[n].id = n; // id is the index within this batch
size += seqs[n++].l_seq;
if (ks2) {
trim_readno(&ks2->name);
kseq2bseq1(ks2, &seqs[n]);
seqs[n].id = n;
size += seqs[n++].l_seq;
}
// only stop on an even record count so that a batch never ends between two consecutive records
if (size >= chunk_size && (n&1) == 0) break;
}
if (size == 0) { // test if the 2nd file is finished
if (ks2 && kseq_read(ks2) >= 0)
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
}
*n_ = n;
return seqs;
}
// Split a batch into unpaired and paired records by comparing adjacent read names.
//   n, seqs  the input batch
//   m        output: m[0] = number of unpaired records, m[1] = number of paired records
//   sep      output: sep[0] = unpaired records, sep[1] = paired records (kvec-allocated
//            shallow copies; either may be NULL when the corresponding count is 0)
// Two consecutive records with identical names are treated as a pair; everything else
// is unpaired.
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
{
	int i, has_last;
	kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
	if (n <= 0) { // empty batch: without this guard the trailing push below would read seqs[0]
		sep[0] = sep[1] = 0;
		m[0] = m[1] = 0;
		return;
	}
	// has_last is non-zero while seqs[i-1] has not yet been assigned to either list
	for (i = 1, has_last = 1; i < n; ++i) {
		if (has_last) {
			if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
				kv_push(bseq1_t, a[1], seqs[i-1]);
				kv_push(bseq1_t, a[1], seqs[i]);
				has_last = 0;
			} else kv_push(bseq1_t, a[0], seqs[i-1]);
		} else has_last = 1;
	}
	if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]); // the final record had no partner
	sep[0] = a[0].a, m[0] = a[0].n;
	sep[1] = a[1].a, m[1] = a[1].n;
}
/*****************
* CIGAR related *
*****************/
// Fill a 5x5 scoring matrix stored row-major in mat[25].
// For the first four symbols a match scores a and a mismatch scores -b; any cell
// whose row or column is the fifth (ambiguous) symbol scores -1.
void bwa_fill_scmat(int a, int b, int8_t mat[25])
{
	int row, col;
	for (row = 0; row < 5; ++row) {
		for (col = 0; col < 5; ++col) {
			int8_t *cell = &mat[row * 5 + col];
			if (row == 4 || col == 4) *cell = -1; // ambiguous base
			else if (row == col) *cell = a;
			else *cell = -b;
		}
	}
}
// Generate CIGAR when the alignment end points are known
//   mat                        5x5 scoring matrix (see bwa_fill_scmat())
//   o_del, e_del, o_ins, e_ins gap open/extension penalties for deletions and insertions
//   w_                         upper bound on the band width (0 enables the gap-free shortcut)
//   l_pac, pac                 length of the packed reference and the packed reference itself
//   l_query, query             query length and sequence; temporarily reversed in place for
//                              coordinates >= l_pac and restored before a normal return
//   rb, re                     reference interval [rb, re)
//   score, n_cigar, NM         outputs: alignment score, number of CIGAR operations, edit distance
// Returns a heap-allocated CIGAR array (length<<4 | op; op 0 = match, 1 = insertion,
// 2 = deletion), or 0 on invalid input. When both NM and n_cigar are given, a
// NUL-terminated MD string is appended directly after the *n_cigar CIGAR words.
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
uint32_t *cigar = 0;
uint8_t tmp, *rseq;
int i;
int64_t rlen;
kstring_t str;
const char *int2base;
if (n_cigar) *n_cigar = 0;
if (NM) *NM = -1;
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
for (i = 0; i < l_query>>1; ++i)
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
for (i = 0; i < rlen>>1; ++i)
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
}
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
// UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
if (n_cigar) {
// a single match operation covering the whole query
cigar = malloc(4);
cigar[0] = l_query<<4 | 0;
*n_cigar = 1;
}
// sum the per-base scores from the matrix; note that score is dereferenced unconditionally here
for (i = 0, *score = 0; i < l_query; ++i)
*score += mat[rseq[i]*5 + query[i]];
} else {
int w, max_gap, max_ins, max_del, min_w;
// set the band-width
max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
max_gap = max_ins > max_del? max_ins : max_del;
max_gap = max_gap > 1? max_gap : 1;
w = (max_gap + abs((int)rlen - l_query) + 1) >> 1;
w = w < w_? w : w_; // never exceed the caller's limit ...
min_w = abs((int)rlen - l_query) + 3;
w = w > min_w? w : min_w; // ... unless the length difference requires a wider band
// NW alignment
if (bwa_verbose >= 4) {
printf("* Global bandwidth: %d\n", w);
printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
}
*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
}
if (NM && n_cigar) {// compute NM and MD
// x: offset in the query; y: offset in rseq; u: length of the current run of matching bases
int k, x, y, u, n_mm = 0, n_gap = 0;
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
// on the reverse strand both sequences were reversed above, so complemented letters are emitted
int2base = rb < l_pac? "ACGTN" : "TGCAN";
for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
int op, len;
// re-read the base pointer on every iteration: the kput*() calls may reallocate str.s
cigar = (uint32_t*)str.s;
op = cigar[k]&0xf, len = cigar[k]>>4;
if (op == 0) { // match
for (i = 0; i < len; ++i) {
if (query[x + i] != rseq[y + i]) {
kputw(u, &str);
kputc(int2base[rseq[y+i]], &str);
++n_mm; u = 0;
} else ++u;
}
x += len; y += len;
} else if (op == 2) { // deletion
if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR
kputw(u, &str); kputc('^', &str);
for (i = 0; i < len; ++i)
kputc(int2base[rseq[y+i]], &str);
u = 0; n_gap += len;
}
y += len;
} else if (op == 1) x += len, n_gap += len; // insertion
}
kputw(u, &str); kputc(0, &str);
*NM = n_mm + n_gap;
cigar = (uint32_t*)str.s;
}
if (rb >= l_pac) // reverse back query
for (i = 0; i < l_query>>1; ++i)
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
ret_gen_cigar:
free(rseq);
return cigar;
}
// Convenience wrapper around bwa_gen_cigar2() for callers that use one gap-open
// penalty (q) and one gap-extension penalty (r) for both insertions and deletions.
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
	const int o_del = q, e_del = r;
	const int o_ins = q, e_ins = r;
	return bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w_, l_pac, pac,
						  l_query, query, rb, re, score, n_cigar, NM);
}
/*********************
* Full index reader *
*********************/
// Work out the index prefix that belongs to hint by probing for index files.
// If "<hint>.64.bwt" can be opened the result is "<hint>.64"; otherwise, if
// "<hint>.bwt" can be opened, the result is "<hint>" itself. The returned string is
// malloc'ed and must be freed by the caller. Returns 0 when neither file exists.
char *bwa_idx_infer_prefix(const char *hint)
{
	int hint_len = strlen(hint);
	// room for the longest candidate: hint + ".64" + ".bwt" + NUL
	char *path = malloc(hint_len + 3 + 4 + 1);
	FILE *probe;

	strcpy(path, hint);
	strcpy(path + hint_len, ".64.bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[hint_len + 3] = 0; // keep ".64", drop ".bwt"
		return path;
	}

	strcpy(path + hint_len, ".bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[hint_len] = 0; // drop ".bwt"
		return path;
	}

	free(path);
	return 0;
}
// Load the FM-index ("<prefix>.bwt") and the partial suffix array ("<prefix>.sa")
// for the index identified by hint. Returns 0 (after printing an error when
// bwa_verbose >= 1) if no index prefix can be inferred from hint.
bwt_t *bwa_idx_load_bwt(const char *hint)
{
	char *prefix, *path;
	bwt_t *bwt;

	prefix = bwa_idx_infer_prefix(hint);
	if (prefix == 0) {
		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
		return 0;
	}

	// scratch buffer large enough for the prefix plus the longest suffix (".bwt") and NUL
	path = calloc(strlen(prefix) + 5, 1);

	strcpy(path, prefix);
	strcat(path, ".bwt"); // FM-index
	bwt = bwt_restore_bwt(path);

	strcpy(path, prefix);
	strcat(path, ".sa"); // partial suffix array (SA)
	bwt_restore_sa(path, bwt);

	free(path);
	free(prefix);
	return bwt;
}
// Load the parts of an index selected by the BWA_IDX_* bits in which.
//   BWA_IDX_BWT  load the FM-index and suffix array via bwa_idx_load_bwt()
//   BWA_IDX_BNS  load the sequence annotations via bns_restore()
//   BWA_IDX_PAC  additionally read the packed reference (only honoured together with BWA_IDX_BNS)
// Returns a calloc'ed bwaidx_t, or 0 if the index files cannot be located.
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
{
	bwaidx_t *idx;
	char *prefix = bwa_idx_infer_prefix(hint);

	if (prefix == 0) {
		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
		return 0;
	}

	idx = calloc(1, sizeof(bwaidx_t));

	if (which & BWA_IDX_BWT)
		idx->bwt = bwa_idx_load_bwt(hint);

	if (which & BWA_IDX_BNS) {
		int k, n_alt = 0;
		idx->bns = bns_restore(prefix);
		for (k = 0; k < idx->bns->n_seqs; ++k) {
			if (idx->bns->anns[k].is_alt) n_alt++;
		}
		if (bwa_verbose >= 3)
			fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, n_alt);

		if (which & BWA_IDX_PAC) {
			// four bases per byte, plus one trailing byte
			idx->pac = calloc(idx->bns->l_pac/4+1, 1);
			err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
			err_fclose(idx->bns->fp_pac);
			idx->bns->fp_pac = 0; // the stream is closed; do not leave a dangling handle
		}
	}

	free(prefix);
	return idx;
}
// Load an index. In this file the index is always read from disk; see
// bwa_idx_load_from_disk() for the meaning of hint and which.
bwaidx_t *bwa_idx_load(const char *hint, int which)
{
	bwaidx_t *loaded = bwa_idx_load_from_disk(hint, which);
	return loaded;
}
// Release an index. Accepts NULL. Two storage layouts are handled:
//  - idx->mem set: the index was packed into one block (see bwa_mem2idx()); only the
//    separately malloc'ed structs are freed, plus the block itself unless is_shm is set;
//  - idx->mem not set: every component owns its memory and is destroyed individually.
void bwa_idx_destroy(bwaidx_t *idx)
{
	if (idx == 0) return;
	if (idx->mem != 0) {
		free(idx->bwt);
		free(idx->bns->anns);
		free(idx->bns);
		if (!idx->is_shm) free(idx->mem);
	} else {
		if (idx->bwt) bwt_destroy(idx->bwt);
		if (idx->bns) bns_destroy(idx->bns);
		if (idx->pac) free(idx->pac);
	}
	free(idx);
}
// Set up idx so that it refers to an index packed into one contiguous block.
//   l_mem, mem  size of the block and the block itself
//   idx         index descriptor to fill in
// Expected layout of mem, in order:
//   bwt_t | bwt array (bwt_size*4 bytes) | SA (n_sa entries) | bntseq_t |
//   ambs (n_holes entries) | anns (n_seqs entries) | per sequence: name\0 anno\0 | pac
// The bwt_t, bntseq_t and anns array are copied into fresh malloc'ed memory; all other
// pointers are set to point into mem. Asserts that exactly l_mem bytes were consumed.
// Always returns 0.
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
{
// k: running byte offset into mem; x: size of the piece currently being consumed
int64_t k = 0, x;
int i;
// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
// the name/anno pointers stored in the copied anns are replaced by pointers to the strings packed in mem
for (i = 0; i < idx->bns->n_seqs; ++i) {
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
}
idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
assert(k == l_mem);
idx->l_mem = k; idx->mem = mem;
return 0;
}
// Pack a fully loaded index into one contiguous heap block and re-point idx at it.
// The individually allocated components (bwt, sa, ambs, anns, names, annotations, pac)
// are copied into the block in the layout read by bwa_mem2idx() and then freed.
// Dereferences idx->bwt, idx->bns and idx->pac without checking them.
// Returns the result of bwa_mem2idx().
int bwa_idx2mem(bwaidx_t *idx)
{
int i;
// k: bytes written so far; x: size of the current piece; tmp: size of the bns payload
int64_t k, x, tmp;
uint8_t *mem;
// copy idx->bwt
x = idx->bwt->bwt_size * 4;
// grow the existing bwt array in place, then shift it right to make room for the bwt_t header
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
memmove(mem + sizeof(bwt_t), mem, x);
memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
free(idx->bwt->sa);
free(idx->bwt); idx->bwt = 0;
// copy idx->bns
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
free(idx->bns->ambs);
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
// the NUL-terminated name and annotation strings follow the anns array, in sequence order
for (i = 0; i < idx->bns->n_seqs; ++i) {
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
}
free(idx->bns->anns);
// copy idx->pac
x = idx->bns->l_pac/4+1;
mem = realloc(mem, k + x);
memcpy(mem + k, idx->pac, x); k += x;
free(idx->bns); idx->bns = 0;
free(idx->pac); idx->pac = 0;
// rebuild idx->bwt, idx->bns and idx->pac on top of the packed block
return bwa_mem2idx(k, mem, idx);
}
/***********************
* SAM header routines *
***********************/
// Print the SAM header.
//   bns       reference sequence annotations used to generate @SQ lines
//   hdr_line  optional user-supplied header text (may be NULL), printed after the
//             generated lines
// A default @HD line is printed unless hdr_line already contains one, and @SQ lines
// are generated from bns unless hdr_line already contains at least one. The global
// bwa_pg, when set, is printed last.
void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
{
	static const char *const tag[2] = { "@HD\t", "@SQ\t" };
	int found[2] = { 0, 0 }; // found[0]: @HD lines, found[1]: @SQ lines
	int t, i;
	extern char *bwa_pg;

	if (hdr_line != 0) {
		for (t = 0; t < 2; ++t) {
			const char *hit;
			// only count a tag when it sits at the beginning of a line
			for (hit = strstr(hdr_line, tag[t]); hit != 0; hit = strstr(hit + 4, tag[t]))
				if (hit == hdr_line || hit[-1] == '\n') ++found[t];
		}
	}

	if (found[0] == 0) {
		err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
	} else if (bwa_verbose >= 2) {
		fprintf(stderr, "[W::%s] please don't include @HD with option -H. Continue anyway.\n", __func__);
	}

	if (found[1] == 0) {
		for (i = 0; i < bns->n_seqs; ++i) {
			err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len);
			if (bns->anns[i].is_alt) err_printf("\tAH:*\n");
			else err_fputc('\n', stdout);
		}
	} else if (found[1] != bns->n_seqs && bwa_verbose >= 2) {
		fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, found[1], bns->n_seqs);
	}

	if (hdr_line) err_printf("%s\n", hdr_line);
	if (bwa_pg) err_printf("%s\n", bwa_pg);
}
// Expand backslash escapes in s, in place, and return s.
// Recognised sequences are \t, \n, \r and \\; any other escaped character is dropped
// together with its backslash. A lone backslash at the very end of the string is
// dropped as well. The result is never longer than the input.
static char *bwa_escape(char *s)
{
	char *p, *q; // p: read position, q: write position
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p;
		// A trailing backslash has nothing to escape. Stop here so that the loop
		// increment cannot step over the terminator and read past the buffer.
		if (*p == '\0') break;
		if (*p == 't') *q++ = '\t';
		else if (*p == 'n') *q++ = '\n';
		else if (*p == 'r') *q++ = '\r';
		else if (*p == '\\') *q++ = '\\';
	}
	*q = '\0';
	return s;
}
// Validate a read-group line given with escaped tabs (the two characters '\' 't') and
// record its ID in the global bwa_rg_id.
// Returns a malloc'ed copy of s with escapes expanded, or 0 (after printing an error
// when bwa_verbose >= 1) if s does not start with "@RG", contains literal tab
// characters, has no ID field, or has an ID longer than 255 characters.
// bwa_rg_id is cleared on entry, so it is empty after any failure.
char *bwa_set_rg(const char *s)
{
	char *rg_line = 0;
	char *id;
	size_t id_len;

	memset(bwa_rg_id, 0, 256);

	if (strncmp(s, "@RG", 3) != 0) {
		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
		goto err_set_rg;
	}
	if (strchr(s, '\t') != NULL) {
		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
		goto err_set_rg;
	}

	rg_line = strdup(s);
	bwa_escape(rg_line);

	id = strstr(rg_line, "\tID:");
	if (id == 0) {
		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
		goto err_set_rg;
	}
	id += 4; // skip "\tID:"

	// the ID value runs up to the next tab, newline or the end of the string
	id_len = strcspn(id, "\t\n");
	if (id_len + 1 > 256) {
		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
		goto err_set_rg;
	}
	// bwa_rg_id was zero-filled above, so the copied ID is already NUL-terminated
	memcpy(bwa_rg_id, id, id_len);
	return rg_line;

err_set_rg:
	free(rg_line);
	return 0;
}
// Append one header line to an accumulated header string.
//   s    the line to add; ignored unless it is non-NULL and starts with '@'
//   hdr  the header collected so far (heap-allocated) or NULL
// Returns the possibly reallocated header. The new line is separated from existing
// text by '\n', and backslash escapes are expanded in the newly added part only.
char *bwa_insert_header(const char *s, char *hdr)
{
	int tail; // offset in hdr at which the newly added text starts

	if (s == 0 || s[0] != '@') return hdr;

	if (hdr == 0) {
		hdr = strdup(s);
		tail = 0;
	} else {
		int old_len = strlen(hdr);
		// old text + '\n' + new text + NUL
		hdr = realloc(hdr, old_len + strlen(s) + 2);
		hdr[old_len] = '\n';
		tail = old_len + 1;
		strcpy(hdr + tail, s);
	}
	bwa_escape(hdr + tail);
	return hdr;
}

97
bwa.h 100644
View File

@ -0,0 +1,97 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef BWA_H_
#define BWA_H_
#include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
// Bit flags for the `which` argument of bwa_idx_load() / bwa_idx_load_from_disk()
#define BWA_IDX_BWT 0x1 // load the FM-index and suffix array
#define BWA_IDX_BNS 0x2 // load the reference sequence annotations
#define BWA_IDX_PAC 0x4 // load the packed reference; only honoured together with BWA_IDX_BNS
#define BWA_IDX_ALL 0x7 // all three of the above
// NOTE(review): not used in bwa.c; presumably the size of a shared-memory control block -- confirm
#define BWA_CTL_SIZE 0x10000
// NOTE(review): presumably the accepted values for the algo_type argument of bwa_idx_build() -- confirm
#define BWTALGO_AUTO 0
#define BWTALGO_RB2 1
#define BWTALGO_BWTSW 2
#define BWTALGO_IS 3
// NOTE(review): presumably a bit tested against bwa_dbg -- confirm
#define BWA_DBG_QNAME 0x1
// A loaded BWA index. The components are either allocated individually (mem == 0) or
// packed into one contiguous block (mem != 0); bwa_idx_destroy() handles both cases.
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
int is_shm; // non-zero: bwa_idx_destroy() must not free `mem`
int64_t l_mem; // size of `mem` in bytes; set by bwa_mem2idx()
uint8_t *mem; // packed index block, or 0 when the components are allocated individually
} bwaidx_t;
// One input sequence record as produced by bseq_read().
typedef struct {
// l_seq: sequence length; id: index of the record within the batch it was read in
int l_seq, id;
// name and seq are always set by bseq_read(); comment and qual are NULL when empty in the input.
// sam is not touched by the reader (presumably holds the formatted SAM output -- TODO confirm).
char *name, *comment, *seq, *qual, *sam;
} bseq1_t;
extern int bwa_verbose, bwa_dbg;
extern char bwa_rg_id[256];
#ifdef __cplusplus
extern "C" {
#endif
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
char *bwa_set_rg(const char *s);
char *bwa_insert_header(const char *s, char *hdr);
#ifdef __cplusplus
}
#endif
#endif

0
bwa_hyb.c 100644
View File

115
bwakit/README.md 100644
View File

@ -0,0 +1,115 @@
## Introduction
Bwakit is a self-consistent installation-free package of scripts and precompiled
binaries, providing an end-to-end solution to read mapping. In addition to the
basic mapping functionality implemented in bwa, bwakit is able to generate
proper human reference genome and to take advantage of ALT contigs, if present,
to improve read mapping and to perform HLA typing for high-coverage human data.
It can remap name- or coordinate-sorted BAM with read group and barcode
information retained. Bwakit also *optionally* trims adapters (via
[trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final
alignment (via [samtools][smtl]).
Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human
reference genomes, and `run-bwamem` which prints mapping command lines on the
standard output that can be piped to `sh` to execute. The two scripts will call
other programs or use data in `bwa.kit`. The following shows an example about
how to use bwakit:
```sh
# Download the bwakit-0.7.12 binary package (download link may change)
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
| gzip -dc | tar xf -
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
bwa.kit/bwa index hs38DH.fa # create BWA index
# mapping
bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh
```
The last mapping command line will generate the following files:
* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this
file, one read may be placed on multiple overlapping ALT contigs at the same
time even if the read is mapped better to some contigs than others. This makes
it possible to analyze each contig independent of others.
* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes.
* `out.hla.all`: other possible genotypes on the six HLA genes.
* `out.log.*`: bwa-mem, samblaster and HLA typing log files.
Bwakit can be [downloaded here][res]. It is only available to x86_64-linux. The
scripts in the package are available in the [bwa/bwakit][kit] directory.
Packaging is done manually for now.
## Limitations
* HLA typing only works for high-coverage human data. The typing accuracy can
still be improved. We encourage researchers to develop better HLA typing tools
based on the intermediate output of bwakit (for each HLA gene included in the
index, bwakit writes all reads matching it in a separate file).
* Duplicate marking only works when all reads from a single paired-end library
are provided as the input. This limitation is the necessary tradeoff of fast
MarkDuplicate provided by samblaster.
* The adapter trimmer is chosen as it is fast, pipe friendly and does not
discard reads. However, it is conservative and suboptimal. If this is a
concern, it is recommended to preprocess input reads with a more sophisticated
adapter trimmer. We also hope existing trimmers can be modified to operate on
an interleaved FASTQ stream. We will replace trimadap once a better trimmer
meets our needs.
* Bwakit can be memory demanding, depending on the functionality invoked. For 30X
human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses
close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB.
In the current setting, sorting uses about 10GB.
## Package Contents
```
bwa.kit
|-- README.md This README file.
|-- run-bwamem *Entry script* for the entire mapping pipeline.
|-- bwa *BWA binary*
|-- k8 Interpreter for *.js scripts.
|-- bwa-postalt.js Post-process alignments to ALT contigs/decoys/HLA genes.
|-- htsbox Used by run-bwamem for shuffling BAMs and BAM=>FASTQ.
|-- samblaster MarkDuplicates for reads from the same library. v0.1.20
|-- samtools SAMtools for sorting and SAM=>BAM conversion. v1.1
|-- seqtk For FASTQ manipulation.
|-- trimadap Trim Illumina PE sequencing adapters.
|
|-- run-gen-ref *Entry script* for generating human reference genomes.
|-- resource-GRCh38 Resources for generating GRCh38
| |-- hs38DH-extra.fa Decoy and HLA gene sequences. Used by run-gen-ref.
| `-- hs38DH.fa.alt ALT-to-GRCh38 alignment. Used by run-gen-ref.
|
|-- run-HLA HLA typing for sequences extracted by bwa-postalt.js.
|-- typeHLA.sh Type one HLA-gene. Called by run-HLA.
|-- typeHLA.js HLA typing from exon-to-contig alignment. Used by typeHLA.sh.
|-- typeHLA-selctg.js Select contigs overlapping HLA exons. Used by typeHLA.sh.
|-- fermi2.pl Fermi2 wrapper. Used by typeHLA.sh for de novo assembly.
|-- fermi2 Fermi2 binary. Used by fermi2.pl.
|-- ropebwt2 RopeBWT2 binary. Used by fermi2.pl.
|-- resource-human-HLA Resources for HLA typing
| |-- HLA-ALT-exons.bed Exonic regions of HLA ALT contigs. Used by typeHLA.sh.
| |-- HLA-CDS.fa CDS of HLA-{A,B,C,DQA1,DQB1,DRB1} genes from IMGT/HLA-3.18.0.
| |-- HLA-ALT-type.txt HLA types for each HLA ALT contig. Not used.
| `-- HLA-ALT-idx BWA indices of each HLA ALT contig. Used by typeHLA.sh
| `-- (...)
|
`-- doc BWA documentations
|-- bwa.1 Manpage
|-- NEWS.md Release Notes
|-- README.md GitHub README page
`-- README-alt.md Documentation for ALT mapping
```
[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
[sb]: https://github.com/GregoryFaust/samblaster
[ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c
[smtl]: http://www.htslib.org
[kit]: https://github.com/lh3/bwa/tree/master/bwakit

View File

@ -0,0 +1,524 @@
/*****************************************************************
* The K8 Javascript interpreter is required to run this script. *
* *
* Source code: https://github.com/attractivechaos/k8 *
* Binary: http://sourceforge.net/projects/lh3/files/k8/ *
* *
* Data file used for generating GRCh38 ALT alignments: *
* *
* http://sourceforge.net/projects/bio-bwa/files/ *
*****************************************************************/
/******************
*** From k8.js ***
******************/
// Parse command-line options. A BSD getopt() clone in javascript.
//   args  array of command-line arguments
//   ostr  option specification; a letter followed by ':' takes an argument
// Returns the next option letter, '?' for an unknown option or a missing argument
// (':' instead for a missing argument when ostr starts with ':'), or null when option
// parsing is finished. Scanning state is kept on the function object itself:
//   getopt.ind    index of the next element of args to examine
//   getopt.arg    argument of the option just returned, or null
//   getopt.place  position inside the current args element, -1 when between elements
var getopt = function(args, ostr) {
var oli; // option letter list index
// first call: initialise the scanning state
if (typeof(getopt.place) == 'undefined')
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
if (getopt.place == -1) { // update scanning pointer
// stop at the end of args or at the first element that does not start with '-'
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
getopt.place = -1;
return null;
}
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
++getopt.ind;
getopt.place = -1;
return null;
}
}
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
if (getopt.place < 0) ++getopt.ind;
return '?';
}
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
getopt.arg = null;
// move on to the next element once every letter of this one has been consumed
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
} else { // need an argument
// the argument is either the rest of this element ("-pfoo") or the next element ("-p foo")
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
getopt.arg = args[getopt.ind].substr(getopt.place);
else if (args.length <= ++getopt.ind) { // no arg
getopt.place = -1;
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
return '?';
} else getopt.arg = args[getopt.ind]; // white space
getopt.place = -1;
++getopt.ind;
}
return optopt;
}
// reverse the contents of a Bytes object in place
Bytes.prototype.reverse = function()
{
	var lo = 0, hi = this.length - 1;
	while (lo < hi) {
		var held = this[lo];
		this[lo] = this[hi];
		this[hi] = held;
		++lo; --hi;
	}
}
// reverse complement a DNA string in place; the complement table is built on first
// use and cached on Bytes.rctab. Characters missing from the table become 0.
Bytes.prototype.revcomp = function()
{
	if (Bytes.rctab == null) {
		var from = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
		var to = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
		var built = [];
		for (var c = 0; c < 256; ++c) built[c] = 0;
		for (var k = 0; k < from.length; ++k)
			built[from.charCodeAt(k)] = to.charCodeAt(k);
		Bytes.rctab = built;
	}
	var lo = 0, hi = this.length - 1;
	while (lo < hi) {
		// swap the two ends, complementing both
		var left = this[lo], right = this[hi];
		this[hi] = Bytes.rctab[left];
		this[lo] = Bytes.rctab[right];
		++lo; --hi;
	}
	// odd length: the middle base stays in place and is only complemented
	if (lo == hi)
		this[lo] = Bytes.rctab[this[lo]];
}
// create index for a list of intervals for fast interval queries; ported from bedidx.c in samtools
//   intv  array of intervals, each an array whose [0] is the start and [1] the end;
//         it is sorted in place by start
//   bits  log2 of the bin size used by the index (default 13)
// Returns a function(start, end) giving the intervals that overlap [start, end).
function intv_ovlp(intv, bits)
{
	if (typeof bits == "undefined") bits = 13;
	intv.sort(function(a,b) {return a[0]-b[0];});
	// bin_first[bin] is the index of the first interval touching that bin
	var bin_first = [], last_bin = 0;
	for (var k = 0; k < intv.length; ++k) {
		var from_bin = intv[k][0]>>bits;
		var to_bin = (intv[k][1]-1)>>bits;
		for (var bin = from_bin; bin <= to_bin; ++bin)
			if (bin_first[bin] == null) bin_first[bin] = k;
		if (to_bin > last_bin) last_bin = to_bin;
	}
	// closure
	return function(_b, _e) {
		var qbin = _b >> bits;
		if (qbin > last_bin) return [];
		var first = bin_first[qbin];
		if (first == null) {
			// empty bin: fall back to the closest populated bin below the query end
			var back = ((_e - 1) >> bits) - 1;
			while (back >= 0 && bin_first[back] == null) --back;
			first = back < 0? 0 : bin_first[back];
		}
		var found = [];
		var at = first;
		while (at < intv.length && intv[at][0] < _e) {
			if (intv[at][1] > _b) found.push(intv[at]);
			++at;
		}
		return found;
	}
}
// Matches one CIGAR operation (length followed by one of M, I, D, S, H, N). The global
// flag makes exec() resume after the previous match, so each user loops until exec()
// returns null, which also leaves the shared regex ready for the next string.
var re_cigar = /(\d+)([MIDSHN])/g;
/******************************
*** Generate ALT alignment ***
******************************/
// given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
//   cigar  array of [op, len] pairs
//   pos    0-based offset on the ALT sequence
// Returns the offset on REF relative to the alignment start, or -1 if pos falls into a
// clipped region or beyond the alignment. A position inside an insertion maps to the
// REF offset at which the insertion occurs.
function cigar2pos(cigar, pos)
{
	var ref_off = 0, alt_off = 0;
	for (var k = 0; k < cigar.length; ++k) {
		var op = cigar[k][0], len = cigar[k][1];
		if (op == 'D') { // consumes REF only
			ref_off += len;
			continue;
		}
		if (op != 'M' && op != 'I' && op != 'S' && op != 'H') continue;
		// every remaining operation consumes ALT
		if (alt_off <= pos && pos < alt_off + len) {
			if (op == 'M') return ref_off + (pos - alt_off);
			return op == 'I'? ref_off : -1;
		}
		alt_off += len;
		if (op == 'M') ref_off += len;
	}
	return -1;
}
// Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5]
// ($opt supplies the scoring parameters a, b, o and e).
// Return an object keeping various information about the alignment:
//   ctg, start (0-based), end, rev, cigar, NM, hard (true if hard clipped),
//   score (estimated alignment score in units of opt.a) and l_query.
function parse_hit(s, opt)
{
	var hit = {
		ctg: s[0],
		start: parseInt(s[1].substr(1)) - 1,
		rev: (s[1].charAt(0) == '-'),
		cigar: s[2],
		NM: parseInt(s[3]),
		hard: false
	};
	var matched = 0, skipped = 0, clipped = 0;
	var ins_len = 0, ins_cnt = 0, del_len = 0, del_cnt = 0;
	var m;
	while ((m = re_cigar.exec(hit.cigar)) != null) {
		var len = parseInt(m[1]);
		switch (m[2]) {
			case 'M': matched += len; break;
			case 'D': ++del_cnt; del_len += len; break;
			case 'I': ++ins_cnt; ins_len += len; break;
			case 'N': skipped += len; break;
			case 'H': hit.hard = true; clipped += len; break;
			case 'S': clipped += len; break;
		}
	}
	var gap_len = del_len + ins_len;
	hit.end = hit.start + matched + del_len + skipped;
	// NM can never be smaller than the number of gapped bases
	hit.NM = hit.NM > gap_len? hit.NM : gap_len;
	hit.score = Math.floor((opt.a * matched - (opt.a + opt.b) * (hit.NM - del_len - ins_len) - opt.o * (del_cnt + ins_cnt) - opt.e * gap_len) / opt.a + .499);
	hit.l_query = matched + ins_len + clipped;
	return hit;
}
// Output the buffered alignments of one read and, when HLA output is enabled, append
// the read as a FASTQ record to the file of every HLA gene it hit.
//   buf2    array of SAM records, each already split into fields
//   fp_hla  map from HLA gene name to an open output file, or null to disable
//   hla     object whose keys are the HLA genes hit by this read
// The FASTQ record is built from the first buffered line: its name is the read name
// followed by '/', bits 0x40/0x80 of the flag as a number, and the strand sign.
function print_buffer(buf2, fp_hla, hla) // output alignments
{
	if (buf2.length == 0) return;
	for (var i = 0; i < buf2.length; ++i)
		print(buf2[i].join("\t"));
	if (fp_hla == null) return;
	var first = buf2[0];
	var name = first[0] + '/' + (first[1]>>6&3) + ((first[1]&16)? '-' : '+');
	var record = '@' + name + '\n' + first[9] + '\n+\n' + first[10] + '\n';
	for (var x in hla) {
		// skip genes without an output file instead of calling write() on a missing handle
		if (fp_hla[x] != null)
			fp_hla[x].write(record);
	}
}
// Collect the HLA genes hit by a read.
//   idx         map from contig name to an interval query function (see intv_ovlp)
//   ctg         contig the read is mapped to
//   start, end  mapped interval on that contig
//   hla         output: hla[gene] is set to true for every overlapping interval whose
//               third element is named like "HLA-<gene>*<digits>..."
function collect_hla_hits(idx, ctg, start, end, hla) // collect reads hit to HLA genes
{
	var query = idx[ctg];
	if (query == null) return;
	var overlapping = query(start, end);
	for (var k = 0; k < overlapping.length; ++k) {
		var m = /^(HLA-[^\s\*]+)\*\d+/.exec(overlapping[k][2]);
		if (m != null) hla[m[1]] = true;
	}
}
function bwa_postalt(args)
{
var version = "r985";
var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 };
while ((c = getopt(args, 'vp:r:')) != null) {
if (c == 'p') opt.pre = getopt.arg;
else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
else if (c == 'v') { print(version); exit(0); }
}
if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
if (args.length == getopt.ind) {
print("");
print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
print("Options: -p STR prefix of output files containting sequences matching HLA genes [null]");
print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
print(" -v show version number");
print("");
print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
print(" a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
print(" smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
print(" multiple ALT hits are lifted to the same position, they will yield new SAM");
print(" lines with the same mapQ.");
print("");
exit(1);
}
var aux = new Bytes(); // used for reverse and reverse complement
var buf = new Bytes(); // line reading buffer
// read ALT-to-REF alignment
var intv_alt = {}, intv_pri = {}, hla_ctg = {}, is_alt = {}, hla_chr = null;
var file = new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var line = buf.toString();
if (line.charAt(0) == '@') continue;
var t = line.split("\t");
if (t.length < 11) continue; // incomplete lines
is_alt[t[0]] = true;
var pos = parseInt(t[3]) - 1;
var flag = parseInt(t[1]);
if ((flag&4) || t[2] == '*') continue;
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
++hla_ctg[m[1]];
hla_chr = t[2];
}
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
if (m[2] == 'M') l_qaln += l, l_tlen += l;
else if (m[2] == 'I') l_qaln += l;
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
}
var j = flag&16? cigar.length-1 : 0;
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
}
file.close();
var idx_alt = {}, idx_pri = {};
for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
// initialize the list of HLA contigs
var fp_hla = null;
if (opt.pre) {
fp_hla = {};
for (var h in hla_ctg)
fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
}
// process SAM
var buf2 = [], hla = {};
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
while (file.readline(buf) > 0) {
var m, line = buf.toString();
if (line.charAt(0) == '@') { // print and then skip the header line
print(line);
continue;
}
var t = line.split("\t");
t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]);
// print bufferred reads
if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
print_buffer(buf2, fp_hla, hla);
buf2 = [], hla = {};
}
// skip unmapped lines
if (t[1]&4) {
buf2.push(t);
continue;
}
// parse the reported hit
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
var flag = t[1];
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla);
if (h.hard) { // the following does not work with hard clipped alignments
buf2.push(t);
continue;
}
var hits = [h];
// parse hits in the XA tag
if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
var XA_strs = m[1].split(";");
for (var i = 0; i < XA_strs.length; ++i)
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
hits.push(parse_hit(XA_strs[i].split(","), opt));
}
// check if there are ALT hits
var has_alt = false;
for (var i = 0; i < hits.length; ++i)
if (is_alt[hits[i].ctg] != null) {
has_alt = true;
break;
}
if (!has_alt) {
buf2.push(t);
continue;
}
// lift mapping positions to the primary assembly
var n_rpt_lifted = 0, rpt_lifted = null;
for (var i = 0; i < hits.length; ++i) {
var a, h = hits[i];
if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
continue;
// find the approximate position on the primary assembly
var lifted = [];
for (var j = 0; j < a.length; ++j) {
var s, e;
if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
s = cigar2pos(a[j][6], h.start);
e = cigar2pos(a[j][6], h.end - 1) + 1;
} else {
s = cigar2pos(a[j][6], a[j][2] - h.end);
e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
}
if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
s += a[j][5]; e += a[j][5];
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
if (i == 0) ++n_rpt_lifted;
}
if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
if (lifted.length) hits[i].lifted = lifted;
}
// prepare for hits grouping
for (var i = 0; i < hits.length; ++i) { // set keys for sorting
if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
hits[i].i = i; // keep the original index
}
// group hits based on the lifted positions on non-ALT sequences
if (hits.length > 1) {
hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
var last_chr = null, end = 0, g = -1;
for (var i = 0; i < hits.length; ++i) {
if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
else if (hits[i].pstart >= end) ++g;
hits[i].g = g;
end = end > hits[i].pend? end : hits[i].pend;
}
} else hits[0].g = 0;
// find the index and group id of the reported hit; find the size of the reported group
var reported_g = null, reported_i = null, n_group0 = 0;
if (hits.length > 1) {
for (var i = 0; i < hits.length; ++i)
if (hits[i].i == 0)
reported_g = hits[i].g, reported_i = i;
for (var i = 0; i < hits.length; ++i)
if (hits[i].g == reported_g)
++n_group0;
} else {
if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
buf2.push(t);
continue;
}
reported_g = reported_i = 0, n_group0 = 1;
}
// re-estimate mapping quality if necessary
var mapQ, ori_mapQ = t[4];
if (n_group0 > 1) {
var group_max = [];
for (var i = 0; i < hits.length; ++i) {
var g = hits[i].g;
if (group_max[g] == null || group_max[g][0] < hits[i].score)
group_max[g] = [hits[i].score, g];
}
if (group_max.length > 1)
group_max.sort(function(x,y) {return y[0]-x[0]});
if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
} else mapQ = 0;
mapQ = mapQ < 60? mapQ : 60;
if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
} else mapQ = t[4];
// find out whether the read is overlapping HLA genes
if (hits[reported_i].pctg == hla_chr) {
var rpt_start = 1<<30, rpt_end = 0;
for (var i = 0; i < hits.length; ++i) {
var h = hits[i];
if (h.g == reported_g) {
rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
}
}
collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla);
}
// adjust the mapQ of the primary hits
if (n_rpt_lifted <= 1) {
var l = n_rpt_lifted == 1? rpt_lifted : null;
for (var i = 0; i < buf2.length; ++i) {
var s = buf2[i], is_ovlp = true;
if (l != null) {
if (l[0] != s[2]) is_ovlp = false; // different chr
else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
else {
var start = s[3] - 1, end = start;
while ((m = re_cigar.exec(t[5])) != null)
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
end += parseInt(m[1]);
if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
}
} else is_ovlp = false;
// get the "pa" tag if present
var om = -1, pa = 10.;
for (var j = 11; j < s.length; ++j)
if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
om = parseInt(m[1]);
else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
pa = parseFloat(m[1]);
if (is_ovlp) { // overlapping the lifted hit
if (om > 0) s[4] = om;
s[4] = s[4] < mapQ? s[4] : mapQ;
} else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
if (om < 0) s.push("om:i:" + s[4]);
s[4] = 0;
}
}
}
// generate lifted_str
for (var i = 0; i < hits.length; ++i) {
if (hits[i].lifted && hits[i].lifted.length) {
var u = '', lifted = hits[i].lifted;
for (var j = 0; j < lifted.length; ++j)
u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
hits[i].lifted_str = u;
}
}
// stage the reported hit
t[4] = mapQ;
if (n_group0 > 1) t.push("om:i:"+ori_mapQ);
if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
buf2.push(t);
// stage the hits generated from the XA tag
var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence
var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null;
for (var i = 0; i < hits.length; ++i) {
if (hits[i].g != reported_g || i == reported_i) continue;
if (idx_alt[hits[i].ctg] == null) continue;
var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]];
if (t[6] == '=' && s[2] != t[2]) s[6] = t[2];
// print sequence/quality and set the rev flag
if (hits[i].rev == hits[reported_i].rev) {
s.push(t[9], t[10]);
s[1] = flag | 0x800;
} else { // we need to write the reverse sequence
if (rs == null || rq == null) {
aux.length = 0;
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
}
s.push(rs, rq);
s[1] = (flag ^ 0x10) | 0x800;
}
s.push("NM:i:" + hits[i].NM);
if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
if (rg != null) s.push(rg);
buf2.push(s);
}
}
print_buffer(buf2, fp_hla, hla);
file.close();
if (fp_hla != null)
for (var h in fp_hla)
fp_hla[h].close();
buf.destroy();
aux.destroy();
}
// Script entry point: invoke bwa_postalt() with the script's `arguments` array.
bwa_postalt(arguments);

20
bwakit/run-HLA 100755
View File

@ -0,0 +1,20 @@
#!/bin/bash
# Run typeHLA.sh on every per-gene FASTQ "<prefix>.HLA-*.fq", then print, for
# each "<prefix>.HLA-*.gt" result, its first line starting with "GT", with that
# leading "GT" replaced by <prefix>.
#
# Usage: run-HLA [-A] <prefix>
#   -A   forwarded unchanged to typeHLA.sh

ctg_opt=""
if [ $# -gt 1 ] && [ "$1" == '-A' ]; then
	ctg_opt="-A"
	shift
fi

if [ $# -eq 0 ]; then
	echo "Usage: $0 <prefix>"
	exit 1
fi

root=$(dirname "$0")

for f in "$1".HLA-*.fq; do
	# When nothing matches, bash leaves the pattern unexpanded; do not hand
	# that literal pattern to typeHLA.sh as if it were a gene.
	[ -e "$f" ] || continue
	gene=$(echo "$f" | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/')
	echo -e "\n*** Processing gene $gene...\n" >&2
	# $ctg_opt is intentionally unquoted: when empty it must expand to no argument.
	"$root"/typeHLA.sh $ctg_opt "$1" "$gene"
done

# Emit the first GT line of each gene. A plain loop replaces the former
# "ls | xargs -i echo ... | sh" construct: -i is deprecated in GNU xargs, and
# assembling shell commands from file names breaks on unusual characters.
for g in "$1".HLA-*.gt; do
	[ -e "$g" ] || continue
	grep '^GT' "$g" | head -1
done | sed "s,^GT,$1,"

187
bwakit/run-bwamem 100755
View File

@ -0,0 +1,187 @@
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Std;
my %opts = (t=>1);
getopts("MPSadskHo:R:x:t:", \%opts);
die('
Usage: run-bwamem [options] <idxbase> <file1> [file2]
Options: -o STR prefix for output files [inferred from input]
-R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
-x STR read type: pacbio, ont2d or intractg [default]
intractg: intra-species contig (kb query, highly similar)
pacbio: pacbio subreads (~10kb query, high error rate)
ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
-t INT number of threads [1]
-H apply HLA typing
-a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
-d mark duplicate (via samblaster)
-S for BAM input, don\'t shuffle
-s sort the output alignment (via samtools; requring more RAM)
-k keep temporary files generated by typeHLA
-M mark shorter split hits as secondary
Examples:
* Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
with the quality of input. It is only intended for research purpose, not for diagnostic.
* Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
the two types. Specifying -R stops read group transfer.
run-bwamem -sao prefix hs38DH.fa old-srt.bam
Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
mapping pipeline better. It is conservative and suboptimal. A more sophisticated
trimmer is recommended if this becomes a concern.
* Remap name-grouped BAM and mark duplicates:
run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
Note: streamed duplicate marking requires all reads from a single paired-end library
to be aligned at the same time.
Output files:
{-o}.aln.bam - final alignment
{-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
{-o}.hla.all - additional HLA genotypes consistent with data
{-o}.log.* - log files
') if @ARGV < 2;
# The first positional argument is the BWA index base name.
my $idx = $ARGV[0];
# Work out the directory holding this script: when $0 already contains a
# directory component it is used as is, otherwise the program name is looked
# up with which().
my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
# All five index files must exist as regular files before any command is generated.
die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
# SAM/BAM input carries all reads in one file, so any further input file is dropped.
if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
	warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
	# Keep only <idxbase> and the first input file. The previous "@ARGV = 2"
	# assigned the one-element list (2) to @ARGV, which wiped out both the
	# index base in $ARGV[0] and the input file in $ARGV[1] used further down.
	$#ARGV = 1;
}
# NOTE(review): this tests $opts{p}, but the getopts() string at the top of the
# script only declares an uppercase "P" and the warning text also says -P, so
# this branch looks unreachable as written -- confirm which letter is intended.
if (defined($opts{p}) && @ARGV >= 3) {
warn("WARNING: option -P is ignored as there are two input sequence files.\n");
delete $opts{p};
}
# Choose the output prefix: -o wins; with two input files use their longest
# common leading substring; with one input file strip a recognised extension.
my $prefix;
if (defined $opts{o}) {
$prefix = $opts{o};
} elsif (@ARGV >= 3) {
my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
my $i;
# advance $i to the first position where the two file names differ
for ($i = 0; $i < $len; ++$i) {
last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
}
$prefix = substr($ARGV[1], 0, $i) if $i > 0;
} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
$prefix = $1;
}
die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
# Estimate the total input size in bytes; files ending in .gz or .bam are
# scaled by $comp_ratio. $size is later used to pick the number of temporary
# files for bamshuf.
my $size = 0;
my $comp_ratio = 3.;
for my $f (@ARGV[1..$#ARGV]) {
my @a = stat($f);
my $s = $a[7]; # element 7 of stat() is the file size in bytes
die("ERROR: failed to read file $f\n") if !defined($s);
$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
$size += int($s) + 1;
}
# NOTE(review): $is_pe is not read anywhere in the visible part of the script.
my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
# Selecting a read type with -x switches off duplicate marking (-d) and adapter trimming (-a).
if (defined($opts{x})) {
delete($opts{d}); delete($opts{a}); delete $opts{p};
}
# for BAM input, find @RG header lines
# Each @RG line has its TABs rewritten as the two characters "\t" and is
# wrapped as a -H'...' option that is later appended to the bwa mem options.
my @RG_lines = ();
if ($is_bam && !defined($opts{R})) {
my $fh;
open($fh, "$root/samtools view -H $ARGV[1] |") || die;
while (<$fh>) {
chomp;
if (/^\@RG\t/) {
s/\t/\\t/g;
push(@RG_lines, "-H'$_'");
}
}
close($fh);
}
warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0;
# Assemble the shell pipeline as text in $cmd. The first stage produces the
# read stream: BAM -> (optional shuffle) -> FASTQ, two files merged by
# seqtk mergepe, or a single file passed through cat.
my $cmd = '';
if ($is_bam) {
my $cmd_sam2bam = "cat $ARGV[1] \\\n";
my $ntmps = int($size / 4e9) + 1; # one temporary file per 4e9 estimated bytes
my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
my $bam2fq_opt = @RG_lines > 0? " -t" : "";
my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n";
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
} elsif (@ARGV >= 3) {
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
} else {
$cmd = "cat $ARGV[1] \\\n";
}
# bwa mem always receives -p; -t, -x, -R and -M are added only when requested.
my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . (defined($opts{M})? "-M " : "");
$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
$cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
# When an .alt file sits next to the index (and -P was not given), pipe the
# alignments through bwa-postalt.js. $has_hla is set when the .alt file
# contains a line starting with an HLA allele name.
my $has_hla = 0;
if (-f "$ARGV[0].alt" && !defined($opts{P})) {
my $fh;
open($fh, "$ARGV[0].alt") || die;
while (<$fh>) {
$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
}
close($fh);
my $hla_pre = $has_hla? "-p $prefix.hla " : "";
$cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
}
# Final stage: sorted BAM with -s (at most 4 sorting threads), otherwise unsorted BAM.
my $t_sort = $opts{t} < 4? $opts{t} : 4;
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - -o $prefix.aln.bam;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";
# HLA typing commands are appended only with -H, when HLA contigs were seen,
# and when -x is absent or equals 'intractg'.
if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
$cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
}
# The command line is only printed to stdout; this script does not execute it.
print $cmd;
# Locate $name in a colon-separated directory list.
#
# The list is the optional second argument; $ENV{PATH} is used when it is
# omitted. One trailing slash is removed from each directory before joining.
# Returns "<dir>/<name>" for the first directory where that path passes the
# -x test, and an empty return (undef in scalar context) otherwise.
sub which
{
	my ($name, @rest) = @_;
	my $dirs = scalar(@rest) ? $rest[0] : $ENV{PATH};
	return unless defined($dirs);
	for my $dir (split(":", $dirs)) {
		(my $base = $dir) =~ s/\/$//;
		my $candidate = "$base/$name";
		return $candidate if -x $candidate;
	}
	return;
}

39
bwakit/run-gen-ref 100755
View File

@ -0,0 +1,39 @@
#!/bin/bash
# Download a human reference genome and write the requested analysis set to
# "<build>.fa"; for hs38a and hs38DH also create "<build>.fa.alt" from the
# resource files next to this script unless that file already exists.
#
# Exit status: 1 when called without arguments or with an unknown build,
# 0 otherwise.

root=$(dirname "$0")

url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"

if [ $# -eq 0 ]; then
	echo "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>"
	echo "Analysis sets:"
	echo " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
	echo " hs38a hs38 plus ALT contigs"
	echo " hs38DH hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)"
	echo " hs37 primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome"
	echo " hs37d5 hs37 plus decoy contigs (used by 1000g phase 3)"
	echo ""
	echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional"
	echo " sequences and ALT-to-REF mapping included in the bwa.kit package."
	exit 1;
fi

if [ "$1" == "hs38DH" ]; then
	(wget -O- "$url38" | gzip -dc; cat "$root/resource-GRCh38/hs38DH-extra.fa") > "$1.fa"
	[ ! -f "$1.fa.alt" ] && cp "$root/resource-GRCh38/hs38DH.fa.alt" "$1.fa.alt"
elif [ "$1" == "hs38a" ]; then
	wget -O- "$url38" | gzip -dc > "$1.fa"
	[ ! -f "$1.fa.alt" ] && grep _alt "$root/resource-GRCh38/hs38DH.fa.alt" > "$1.fa.alt"
elif [ "$1" == "hs38" ]; then
	# keep only records whose header line does not contain "_alt"
	wget -O- "$url38" | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > "$1.fa"
elif [ "$1" == "hs37d5" ]; then
	wget -O- "$url37d5" | gzip -dc > "$1.fa" 2>/dev/null
elif [ "$1" == "hs37" ]; then
	# keep only records whose header line does not start with ">hs37d5"
	wget -O- "$url37d5" | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > "$1.fa"
else
	# Stop here: previously the script fell through, printed the "bwa index"
	# hint for a file that was never created, and did not signal the failure.
	echo "ERROR: unknown genome build" >&2
	exit 1
fi

# Written as an if-statement so that an already-indexed reference does not
# turn the script's exit status into 1.
if [ ! -f "$1.fa.bwt" ]; then
	echo -e "\nPlease run 'bwa index $1.fa'...\n"
fi

View File

@ -0,0 +1,62 @@
/*
 * Print the names of the sequences in <ctg-to-ALT.sam> that are kept for one
 * HLA gene.
 *
 * Arguments: <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp]
 *
 * For every query name the alignments are ranked by AS. The name is printed
 * unless, among the alignments tied for the top AS, the first-ranked one
 * overlaps the gene's BED intervals by fewer than min_ovlp bases or any of
 * them has AS equal to XS.
 */
var min_ovlp = 30;

if (arguments.length < 3) {
	print("Usage: k8 selctg.js <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp="+min_ovlp+"]");
	exit(1);
}
if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]);

var gene = arguments[0];
var buf = new Bytes();
var re_cigar = /(\d+)([MIDSHN])/g;

// Reference end of an alignment starting at `start`: only M and D operations advance it.
function ref_end(start, cigar)
{
	var m, end = start;
	while ((m = re_cigar.exec(cigar)) != null)
		if (m[2] == 'M' || m[2] == 'D')
			end += parseInt(m[1]);
	return end;
}

// Largest overlap between [start,end) and any interval in `intervals`; never below 0.
function widest_overlap(intervals, start, end)
{
	var best = 0;
	for (var k = 0; k < intervals.length; ++k) {
		var lo = intervals[k][0] > start? intervals[k][0] : start;
		var hi = intervals[k][1] < end ? intervals[k][1] : end;
		best = best > hi - lo? best : hi - lo;
	}
	return best;
}

// Integer captured by `re` in `line`, or null when the tag is absent.
function int_tag(re, line)
{
	var m = re.exec(line);
	return m != null? parseInt(m[1]) : null;
}

// Pass 1: collect the BED intervals of the requested gene, keyed by column 1.
var exon_intv = {};
var file = new File(arguments[1]);
while (file.readline(buf) >= 0) {
	var bed = buf.toString().split("\t");
	if (bed[3] == gene) {
		if (exon_intv[bed[0]] == null) exon_intv[bed[0]] = [];
		exon_intv[bed[0]].push([parseInt(bed[1]), parseInt(bed[2])]);
	}
}
file.close();

// Pass 2: for each alignment to a reference that has intervals, record [AS, XS, overlap].
var hits = {};
file = new File(arguments[2]);
while (file.readline(buf) >= 0) {
	var line = buf.toString();
	var fields = line.split("\t");
	var intervals = exon_intv[fields[2]];
	if (intervals == null) continue;
	var start = parseInt(fields[3]) - 1;
	var end = ref_end(start, fields[5]);
	var ovlp = widest_overlap(intervals, start, end);
	var AS = int_tag(/AS:i:(\d+)/, line);
	var XS = int_tag(/XS:i:(\d+)/, line);
	if (hits[fields[0]] == null) hits[fields[0]] = [];
	hits[fields[0]].push([AS, XS, ovlp]);
}
file.close();
buf.destroy();

// Pass 3: apply the rejection rule to the alignments tied for the best AS.
for (var name in hits) {
	var ranked = hits[name];
	ranked.sort(function(a,b) {return b[0]-a[0]});
	var rejected = false;
	// NOTE(review): the overlap test reads ranked[0][2] for every tied hit, not ranked[i][2].
	for (var i = 0; !rejected && i < ranked.length && ranked[i][0] == ranked[0][0]; ++i)
		if (ranked[0][2] < min_ovlp || ranked[i][0] == ranked[i][1])
			rejected = true;
	if (!rejected) print(name);
}

496
bwakit/typeHLA.js 100644
View File

@ -0,0 +1,496 @@
/*****************************************************************
* The K8 Javascript interpreter is required to run this script. *
* *
* Source code: https://github.com/attractivechaos/k8 *
* Binary: http://sourceforge.net/projects/lh3/files/k8/ *
*****************************************************************/
/**
 * getopt-style command-line option scanner.
 *
 * args: array of argument strings.
 * ostr: accepted option letters; a letter followed by ':' takes an argument.
 *
 * Returns the option letter found; '?' for a letter not in ostr or for a
 * missing argument (':' instead when ostr itself starts with ':'); null when
 * the next element does not start with '-', when "--" is met, or when args
 * is exhausted.
 *
 * Scanner state is stored on the function object itself:
 *   getopt.ind   - index of the args element to examine next
 *   getopt.arg   - argument of the option just returned, or null
 *   getopt.place - position inside the current element; -1 means "start a new element"
 */
var getopt = function(args, ostr) {
var oli; // option letter list index
// first call: initialise the scanner state
if (typeof(getopt.place) == 'undefined')
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
if (getopt.place == -1) { // update scanning pointer
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
getopt.place = -1;
return null;
}
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
++getopt.ind;
getopt.place = -1;
return null;
}
}
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
if (getopt.place < 0) ++getopt.ind;
return '?';
}
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
getopt.arg = null;
// move on to the next element once the current one has been fully consumed
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
} else { // need an argument
// the argument is either the rest of the current element ("-l50") or the next element ("-l 50")
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
getopt.arg = args[getopt.ind].substr(getopt.place);
else if (args.length <= ++getopt.ind) { // no arg
getopt.place = -1;
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
return '?';
} else getopt.arg = args[getopt.ind]; // white space
getopt.place = -1;
++getopt.ind;
}
return optopt;
}
/************************
* Command line parsing *
************************/
var ver = "r19";
// Tunable thresholds:
//   thres_len   (-l) contigs whose longest match is shorter than this may be dropped
//   thres_ratio      no command-line switch; relative length threshold used together with thres_len
//   thres_nm    (-n) maximum edit distance to the closest gene
//   thres_frac  (-f) perfect mode is attempted only when the inconsistent length fraction is below this
var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false;

// parse command line options
while ((c = getopt(arguments, "vdl:n:f:")) != null) {
	if (c == 'l') thres_len = parseInt(getopt.arg);
	else if (c == 'n') thres_nm = parseInt(getopt.arg);
	else if (c == 'd') dbg = true;
	else if (c == 'f') thres_frac = parseFloat(getopt.arg);
	else if (c == 'v') { print(ver); exit(0); }
}

// no input file given: print the usage and quit
if (arguments.length == getopt.ind) {
	print("");
	print("Usage: k8 typeHLA.js [options] <exon-to-contig.sam>\n");
	print("Options: -n INT drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]");
	print(" -l INT drop a contig if its match too short ["+thres_len+"]");
	// -f sets thres_frac, so that is the default to display (thres_ratio was shown here by mistake)
	print(" -f FLOAT drop inconsistent contigs if their length <FLOAT fraction of total length ["+thres_frac.toFixed(2)+"]");
	print(" -d output extra info for debugging");
	print(" -v show version number");
	print("");
	print("Note: The output is TAB delimited with each GT line consisting of allele1, allele2,");
	print(" #mismatches/gaps on primary exons, #mismatches/gaps on other exons and #exons");
	print(" used in typing. If unusure, use the first GT line as the final genotype.\n");
	exit(1);
}
/*********************************
* Read gene-to-contig alignment *
*********************************/
// Read the exon-to-contig SAM file given as the first non-option argument.
// Results:
//   len[contig]     - contig length taken from the @SQ header lines
//   gcnt[exon][gene]- set to true for every gene/exon pair seen as a query name
//   list[]          - one record per mapped alignment:
//                     [contig, gene, exon, ts, te, nm, left+right, qs, qe, ql]
var file = new File(arguments[getopt.ind]);
var buf = new Bytes();
var re_cigar = /(\d+)([MIDSH])/g;
var len = {}, list = [], gcnt = [];
while (file.readline(buf) >= 0) {
var m, mm, line = buf.toString();
var t = line.split("\t");
var flag = parseInt(t[1]);
// SAM header
if (t[0].charAt(0) == '@') {
if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null)
len[mm[1]] = parseInt(m[1]);
continue;
}
// parse gene name and exon number
// query names are expected to look like "<HLA-gene>_<exon number>"; exon becomes 0-based
var gene = null, exon = null;
if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) {
gene = m[1], exon = parseInt(m[2]) - 1;
if (gcnt[exon] == null) gcnt[exon] = {};
gcnt[exon][gene] = true;
}
// skip names that do not match the pattern and alignments whose RNAME is '*'
if (gene == null || exon == null || t[2] == '*') continue;
// parse clipping and aligned length
// x: number of query bases in M/I operations; ts/te: 0-based start/end on the contig;
// clip[0]/clip[1]: clipping seen before/after the first aligned query base
var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0];
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
if (m[2] == 'M') x += l, te += l;
else if (m[2] == 'I') x += l;
else if (m[2] == 'D') te += l;
else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l;
}
var tl = len[t[2]];
// left/right: clipped bases capped by the contig sequence available on that side
var left = ts < clip[0]? ts : clip[0];
var right = tl - te < clip[1]? tl - te : clip[1];
// qs/qe: aligned interval on the query; the two clip lengths are swapped for flag 16 (reverse strand)
var qs, qe, ql = clip[0] + x + clip[1];
if (flag & 16) qs = clip[1], qe = ql - clip[0];
else qs = clip[0], qe = ql - clip[1];
// a missing NM tag is treated as 0
var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment
}
buf.destroy();
file.close();
/**************************************
* Prepare data structures for typing *
**************************************/
// identify the primary exons, the exons associated with most genes
// pri_exon[e] is 1 when exon e is a primary exon and 0 otherwise;
// n_pri_exons is the number of primary exons.
var pri_exon = [], n_pri_exons;
{
var cnt = [], max = 0;
// count the number of genes per exon and track the max
for (var e = 0; e < gcnt.length; ++e) {
if (gcnt[e] != null) {
var c = 0, h = gcnt[e];
for (var x in h) ++c;
cnt[e] = c;
max = max > c? max : c;
} else cnt[e] = 0;
}
warn("- Number of genes for each exon: [" +cnt.join(",") + "]");
// find primary exons
// every exon whose gene count equals the maximum is primary; pri_list holds 1-based exon numbers for the log
var pri_list = [];
for (var e = 0; e < cnt.length; ++e) {
if (cnt[e] == max) pri_list.push(e + 1);
pri_exon[e] = cnt[e] == max? 1 : 0;
}
warn("- List of primary exon(s): ["+pri_list.join(",")+"]");
n_pri_exons = pri_list.length;
}
// convert strings to integers (for performance)
// ghash/chash map a gene/contig name to its integer id; glist/clist map the id back to the name;
// elist[g][e] is true when gene id g has at least one record on exon e
var ghash = {}, glist = [], chash = {}, clist = [], elist = [];
for (var i = 0; i < list.length; ++i) {
if (ghash[list[i][1]] == null) {
ghash[list[i][1]] = glist.length;
glist.push(list[i][1]);
}
if (chash[list[i][0]] == null) {
chash[list[i][0]] = clist.length;
clist.push(list[i][0]);
}
var g = ghash[list[i][1]];
if (elist[g] == null) elist[g] = {};
elist[g][list[i][2]] = true;
}
// extract the 3rd and 4th digits
// gsub[i]: the number following the first ':' in the allele name;
// gsuf[i]: 1 when the allele name ends with an uppercase letter, else 0.
// NOTE(review): m is used without a null check, so an allele name without a
// ':' field would throw here -- confirm such names cannot reach this point.
var gsub = [], gsuf = [];
for (var i = 0; i < glist.length; ++i) {
var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]);
gsub[i] = parseInt(m[1]);
gsuf[i] = /[A-Z]$/.test(glist[i])? 1 : 0;
}
/*************************************************
* Collect genes with perfect matches on primary *
*************************************************/
// collect exons with fully covered by perfect match(es)
var perf_exons = [];
/**
 * Record a gene/exon pair in perf_exons when the exon is completely covered
 * by matches whose error count is zero.
 *
 * matches: array of [query start, query end, query length, error count];
 *          it is sorted in place by query start.
 * last:    [gene name, exon index] that all entries of `matches` belong to.
 *
 * Side effect: appends ghash[gene name] to perf_exons[exon index] when the
 * merged length of the zero-error intervals equals the query length stored
 * in the first (sorted) match.
 */
function push_perf_exons(matches, last)
{
	matches.sort(function(p, q) { return p[0]-q[0]; });

	// sweep over the sorted intervals, merging those that touch or overlap
	var covered = 0, seg_beg = 0, seg_end = 0;
	for (var k = 0; k < matches.length; ++k) {
		var hit = matches[k];
		if (hit[3] > 0) continue; // only error-free matches count
		if (hit[0] <= seg_end) {
			seg_end = seg_end > hit[1]? seg_end : hit[1];
		} else {
			covered += seg_end - seg_beg;
			seg_beg = hit[0];
			seg_end = hit[1];
		}
	}
	covered += seg_end - seg_beg; // close the last open segment

	if (matches[0][2] == covered) {
		var exon = last[1];
		if (perf_exons[exon] == null) perf_exons[exon] = [];
		perf_exons[exon].push(ghash[last[0]]);
	}
}
// Walk through list[], grouping consecutive records that share the same gene
// and exon, and hand each group to push_perf_exons(). Each match is
// [qs, qe, ql, nm + (left+right)].
// NOTE(review): grouping relies on records of one gene/exon being adjacent in list[] -- confirm the input order guarantees this.
var last = [null, -1], matches = [];
for (var i = 0; i < list.length; ++i) {
var li = list[i];
if (last[0] != li[1] || last[1] != li[2]) {
if (matches.length) push_perf_exons(matches, last);
matches = [];
last = [li[1], li[2]];
}
matches.push([li[7], li[8], li[9], li[5]+li[6]]);
}
// flush the final group
if (matches.length) push_perf_exons(matches, last);
// for each gene, count how many primary exons are perfect
var pg_aux_cnt = {};
for (var e = 0; e < perf_exons.length; ++e) {
if (!pri_exon[e]) continue;
var pe = perf_exons[e];
var n = pe? pe.length : 0;
for (var i = 0; i < n; ++i) {
var g = pe[i];
if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1;
else ++pg_aux_cnt[g];
}
}
// find genes with perfect matches on the primary exons
// object keys are strings, hence the parseInt() to get the gene id back
var perf_genes = [];
for (var g in pg_aux_cnt)
if (pg_aux_cnt[g] == n_pri_exons)
perf_genes.push(parseInt(g));
warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)");
// h_perf_genes: lookup table keyed by gene id for the genes found above
var h_perf_genes = {};
for (var i = 0; i < perf_genes.length; ++i) {
if (dbg) print("PG", glist[perf_genes[i]]);
h_perf_genes[perf_genes[i]] = true;
}
/*******************
* Filter hit list *
*******************/
// reorganize hits to exons
/**
 * Regroup alignment records by exon.
 *
 * list:      records shaped like the entries of the global list[].
 * flt_flag:  optional array indexed by contig id; records whose contig has
 *            flag value exactly 1 are skipped.
 * perf_hash: optional table keyed by gene id; when given, records whose gene
 *            is not in the table are skipped.
 *
 * Returns an array indexed by exon; each element is an array of
 * [contig id, gene id, nm + clipping, te - ts].
 */
function list2exons(list, flt_flag, perf_hash)
{
	var by_exon = [];
	for (var k = 0; k < list.length; ++k) {
		var rec = list[k];
		var ctg_id = chash[rec[0]], gene_id = ghash[rec[1]];
		var is_filtered = flt_flag != null && flt_flag[ctg_id] == 1;
		var not_perfect = perf_hash != null && !perf_hash[gene_id];
		if (is_filtered || not_perfect) continue;
		var e = rec[2];
		if (by_exon[e] == null) by_exon[e] = [];
		by_exon[e].push([ctg_id, gene_id, rec[5] + rec[6], rec[4] - rec[3]]);
	}
	return by_exon;
}
// Classify contigs using their hits on primary exons.
//   flt_flag[c] bit 1 (value 1): contig c has a hit to a gene that is not in
//                                h_perf_genes, or a hit with a non-zero error count
//   flt_flag[c] bit 2 (value 2): contig c has an error-free hit to a gene in h_perf_genes
//   ovlp_len[c]: sum over primary exons of the largest (aligned length - error count), at least 1 per hit
var exons = list2exons(list), flt_flag = [], ovlp_len = [];
for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0;
for (var e = 0; e < exons.length; ++e) {
if (!pri_exon[e]) continue;
var ee = exons[e];
var max_len = [];
for (var c = 0; c < clist.length; ++c) max_len[c] = 0;
for (var i = 0; i < ee.length; ++i) {
var l = ee[i][3] - ee[i][2];
if (l < 1) l = 1;
if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l;
}
for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c];
for (var i = 0; i < ee.length; ++i)
flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 1 : 1<<1;
}
// l_cons: total ovlp_len of contigs with bit 2 set; l_incons: total ovlp_len of contigs whose flag is exactly 1
var l_cons = 0, l_incons = 0;
for (var c = 0; c < clist.length; ++c)
if (flt_flag[c]&2) l_cons += ovlp_len[c];
else if (flt_flag[c] == 1) l_incons += ovlp_len[c];
warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons);
// the comparison is false when both totals are 0 (0/0 is NaN), so perfect mode is then not attempted
var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac);
/********************************
* Core function for genotyping *
********************************/
/**
 * Score every candidate genotype (unordered pair of gene ids) and return the best ones.
 *
 * perf_mode: when true, contigs whose flt_flag is exactly 1 and genes outside
 *            h_perf_genes are excluded before typing; when false all records are used.
 *
 * Side effect: overwrites the global `exons`.
 *
 * Returns an array of [score, #exons, i, j, tie-break key], sorted best first, where
 *   score          = (capped error sum on primary exons) << 8 | (capped error sum over all typed exons)
 *   #exons         = number of exons that contributed to the pair
 *   i, j           = gene ids with j <= i
 *   tie-break key  = (gsuf[i] + gsuf[j]) << 16 | (gsub[i] + gsub[j])
 * Only pairs typed on all primary exons and within 1 of the minimum score are returned.
 */
function type_gene(perf_mode)
{
if (perf_mode) {
var flt_list = [];
for (var c = 0; c < clist.length; ++c)
if (flt_flag[c] == 1) flt_list.push(clist[c]);
warn(" - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]");
exons = list2exons(list, flt_flag, h_perf_genes);
} else exons = list2exons(list);
/***********************
* Score each genotype *
***********************/
// initialize genotype scores
// pair[i][j] (j <= i) is a packed integer, see update_pair() below
var pair = [];
for (var i = 0; i < glist.length; ++i) {
pair[i] = [];
for (var j = 0; j <= i; ++j)
pair[i][j] = 0;
}
// these two arrays are used to output debugging information
var score = [], ctg = [];
// Type exon e. Without gt_list every pair of genes seen on the exon is scored;
// with gt_list only the listed [i, j] pairs are scored.
function type_exon(e, gt_list)
{
// Add one exon's error count m to the packed pair value x and return the new value.
// Layout of x: bits 0-5   number of primary exons typed
//              bits 6-13  number of exons typed
//              bits 14-21 error sum over all typed exons, capped at 0xff
//              bits 22+   error sum over primary exons, capped at 0xff
function update_pair(x, m, is_pri)
{
var y, z;
y = (x>>14&0xff) + m < 0xff? (x>>14&0xff) + m : 0xff;
if (is_pri) z = (x>>22) + m < 0xff? (x>>22) + m : 0xff;
else z = x>>22;
return z<<22 | y<<14 | ((x&0x3fff) + (1<<6|is_pri));
}
score[e] = []; ctg[e] = [];
if (exons[e] == null) return;
var ee = exons[e], is_pri = pri_exon[e]? 1 : 0;
// find contigs and genes associated with the current exon
var ch = {}, gh = {};
for (var i = 0; i < ee.length; ++i)
if (elist[ee[i][1]][e] != null)
ch[ee[i][0]] = true, gh[ee[i][1]] = true;
// ga/ca: the gene ids and contig ids as integer arrays (object keys are strings)
var ga = [], ca = ctg[e];
for (var c in ch) ca.push(parseInt(c));
for (var g in gh) ga.push(parseInt(g));
var named_ca = [];
for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]);
warn(" - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])...");
// set unmapped entries to high mismatch
var sc = score[e];
for (var k = 0; k < ga.length; ++k) {
var g = ga[k];
if (sc[g] == null) sc[g] = [];
for (var i = 0; i < ca.length; ++i)
sc[g][ca[i]] = 0xff;
}
// convert representation again and compute max_len[]
// sc[g][c]: smallest error count of gene g on contig c; max_len[c]: longest aligned span on contig c
var max_len = [];
for (var i = 0; i < ee.length; ++i) {
var c = ee[i][0], g = ee[i][1];
if (gh[g] == null || ch[c] == null) continue;
sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2];
if (max_len[c] == null) max_len[c] = 0;
max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3];
}
// drop mismapped contigs
// a contig is dropped when its best error count exceeds thres_nm, or when its longest
// span is below both thres_len and thres_ratio times the longest span of any contig
var max_max_len = 0;
for (var k = 0; k < ca.length; ++k)
max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]];
var dropped = [];
for (var k = 0; k < ca.length; ++k) {
var min = 0x7fffffff, c = ca[k];
for (var i = 0; i < ga.length; ++i) {
var g = ga[i];
min = min < sc[g][c]? min : sc[g][c];
}
dropped[c] = min > thres_nm? true : false;
if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true;
if (dropped[c]) warn(" . Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")");
}
// fill the pair array
if (gt_list == null) {
for (var i = 0; i < ga.length; ++i) {
var m = 0, gi = ga[i], g1 = sc[gi];
// homozygous
for (var k = 0; k < ca.length; ++k) {
var c = ca[k];
if (!dropped[c]) m += g1[c];
}
pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri);
// heterozygous
// each kept contig is charged to whichever of the two genes fits it better;
// a[] counts how many contigs went to each gene
for (var j = i + 1; j < ga.length; ++j) {
var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0];
for (var k = 0; k < ca.length; ++k) {
var c = ca[k];
if (!dropped[c]) {
m += g1[c] < g2[c]? g1[c] : g2[c];
++a[g1[c]<g2[c]? 0:1];
}
}
if (a[0] == 0 || a[1] == 0) m = 0xff; // if all contigs are assigned to one gene, it is not good
// pair[][] is lower-triangular: the larger gene id is always the first index
if (gi < gj) pair[gj][gi] = update_pair(pair[gj][gi], m, is_pri);
else pair[gi][gj] = update_pair(pair[gi][gj], m, is_pri);
}
}
} else {
// restricted mode: score only the genotypes in gt_list, and apply the
// scores only if at least one genotype got a score below 0xff
var tmp_pairs = [], min = 0xff;
for (var i = 0; i < gt_list.length; ++i) {
var gt = gt_list[i], m = 0;
var g1 = sc[gt[0]], g2 = sc[gt[1]], a = [0, 0];
// skip genotypes with a gene that has no score row on this exon
if (g1 == null || g2 == null) continue;
if (gt[0] == gt[1]) {
for (var k = 0; k < ca.length; ++k) {
var c = ca[k];
if (!dropped[c]) m += g1[c];
}
} else {
var a = [0, 0];
// k is declared with `var` elsewhere in this function, so it stays local through hoisting
for (k = 0; k < ca.length; ++k) {
var c = ca[k];
if (!dropped[c]) {
m += g1[c] < g2[c]? g1[c] : g2[c];
++a[g1[c]<g2[c]? 0:1];
}
}
if (a[0] == 0 || a[1] == 0) m = 0xff;
}
tmp_pairs.push([gt[0], gt[1], m]);
min = min < m? min : m;
}
if (min < 0xff) {
for (var i = 0; i < tmp_pairs.length; ++i) {
var t = tmp_pairs[i];
pair[t[0]][t[1]] = update_pair(pair[t[0]][t[1]], t[2], is_pri);
}
} else warn(" . Skipped exon " +(e+1)+ " as the assembly may be incomplete");
}
}
// type primary exons
warn(" - Processing primary exon(s)...");
for (var e = 0; e < exons.length; ++e)
if (pri_exon[e]) type_exon(e);
// generate the list of best genotypes on primary exons
// (pair&63) == n_pri_exons selects the pairs that were typed on every primary exon
var min_nm_pri = 0x7fffffff;
for (var i = 0; i < glist.length; ++i)
for (var j = 0; j <= i; ++j)
if ((pair[i][j]&63) == n_pri_exons)
min_nm_pri = min_nm_pri < pair[i][j]>>22? min_nm_pri : pair[i][j]>>22;
var gt_list = [];
for (var i = 0; i < glist.length; ++i)
for (var j = 0; j <= i; ++j)
if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri)
gt_list.push([i, j]);
warn(" - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri);
// type other exons
warn(" - Processing other exon(s)...");
for (var e = 0; e < exons.length; ++e)
if (!pri_exon[e]) type_exon(e, gt_list);
/*****************************
* Choose the best genotypes *
*****************************/
// genotyping
// pair>>14 combines both error sums, with the primary-exon sum in the high bits
var min_nm = 0x7fffffff;
for (var i = 0; i < glist.length; ++i)
for (var j = 0; j <= i; ++j)
if ((pair[i][j]&63) == n_pri_exons)
min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14;
var out = [];
for (var i = 0; i < glist.length; ++i)
for (var j = 0; j <= i; ++j)
if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1)
out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]);
// order: lower score first, then more exons, then smaller tie-break key, then smaller gene ids
out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? a[2]-b[2] : a[3]-b[3]});
return out;
}
/**********************
* Perform genotyping *
**********************/
// Always type in the imperfect mode; additionally type in the perfect mode
// when attempt_perf is set, and keep the perfect-mode result only if its best
// score is strictly lower.
warn("- Typing in the imperfect mode...");
var rst = type_gene(false);
if (attempt_perf) {
warn("- Typing in the perfect mode...");
var rst_perf = type_gene(true);
// rst[0][0] is the best packed score: bits 8-15 are the primary-exon error sum, bits 0-7 the sum over all typed exons
// NOTE(review): rst[0] and rst_perf[0] are read without checking that the result arrays are non-empty.
warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]");
if (rst_perf[0][0] < rst[0][0]) {
warn("- Chose the result from the perfect mode");
rst = rst_perf;
} else warn("- Chose the result from the imperfect mode");
} else warn("- Perfect mode is not attempted");
/**********
* Output *
**********/
// One GT line per reported genotype: the two allele names (smaller gene id first),
// the primary-exon error sum, the error sum over all typed exons, and the number of exons used.
for (var i = 0; i < rst.length; ++i)
print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]);

49
bwakit/typeHLA.sh 100755
View File

@ -0,0 +1,49 @@
#!/bin/bash
# Type one HLA gene from the reads (or contigs) stored in <prefix>.<gene>.fq.
# Usage: typeHLA.sh [-A] <prefix> <gene>
#   -A   treat <prefix>.<gene>.fq as already-assembled contigs and skip the
#        de novo assembly step
# Result is written to <prefix>.<gene>.gt; progress messages go to stderr.
# All helper programs (seqtk, fermi2.pl, fermi2, ropebwt2, bwa, k8) and the
# resource directory are looked up next to this script.
is_ctg=0
if [ $# -gt 1 ] && [ $1 == '-A' ]; then
is_ctg=1
shift
fi
if [ $# -lt 2 ]; then
echo "Usage: $0 [-A] <prefix> <gene>"
exit 1
fi
preres="resource-human-HLA"
root=`dirname $0`
pre=$1.$2
# create the output up front so that an (empty) .gt file exists even on early exit
touch $pre.gt
# an empty or missing input is not treated as an error: exit status is 0
if [ ! -s $pre.fq ]; then
echo '** Empty input file. Abort!' >&2
exit 0
fi
if [ $is_ctg -eq 0 ]; then
echo "** De novo assembling..." >&2
# mean read length, passed to fermi2.pl via -l
len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'`
# fermi2.pl emits a makefile; running it produces $pre.tmp.mag.gz
$root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak
make -f $pre.tmp.mak >&2
cp $pre.tmp.mag.gz $pre.mag.gz
else
# contig mode: expose the input under the name the later steps expect
rm -f $pre.tmp.mag.gz
ln -s $pre.fq $pre.tmp.mag.gz
fi
echo "** Selecting contigs overlapping target exons..." >&2
# map the contigs against every HLA-ALT index, drop SAM headers, sort by reference name and position
(ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz
# typeHLA-selctg.js prints the names of the selected contigs; seqtk extracts them
$root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz
echo "** Mapping exons to de novo contigs..." >&2
# index the selected contigs, then align the CDS entries whose name starts with <gene> to them
$root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null
$root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz
echo "** Typing..." >&2
$root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt
# delete temporary files
rm -f $pre.tmp.*
# NOTE(review): as the last command, this test makes the script exit with a
# non-zero status whenever -A was NOT given; confirm callers do not rely on
# the exit code before changing it.
[ $is_ctg -eq 1 ] && rm -f $pre.mag.gz

1264
bwamem.c 100644

File diff suppressed because it is too large Load Diff

213
bwamem.h 100644
View File

@ -0,0 +1,213 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef BWAMEM_H_
#define BWAMEM_H_
#include "bwt.h"
#include "bntseq.h"
#include "bwa.h"
#define MEM_MAPQ_COEF 30.0
#define MEM_MAPQ_MAX 60
struct __smem_i;
typedef struct __smem_i smem_i;
/* Bit flags stored in mem_opt_t::flag. Only flags whose use is visible in
 * bwamem_extra.c / bwamem_pair.c are described; the others are presumably
 * consumed in bwamem.c -- confirm there. */
#define MEM_F_PE 0x2 // paired-end mode; query sequences must be interleaved
#define MEM_F_NOPAIRING 0x4 // mem_sam_pe() skips pairing and takes its no_pairing path
#define MEM_F_ALL 0x8 // when set, mem_sam_pe() does not generate XA strings
#define MEM_F_NO_MULTI 0x10
#define MEM_F_NO_RESCUE 0x20 // when set, mem_sam_pe() skips the mate-SW rescue step
#define MEM_F_REF_HDR 0x100
#define MEM_F_SOFTCLIP 0x200
#define MEM_F_SMARTPE 0x400
#define MEM_F_PRIMARY5 0x800 // mem_sam_pe() calls mem_reorder_primary5() on both ends
#define MEM_F_KEEP_SUPP_MAPQ 0x1000
#define MEM_F_XB 0x2000 // mem_gen_alt() appends ",score,mapq" to each XA entry
/* Alignment parameters shared by the mem_* routines; an instance is obtained
 * from mem_opt_init(). */
typedef struct {
int a, b; // match score and mismatch penalty
int o_del, e_del; // passed to ksw_align2() by mem_matesw(); presumably deletion gap open/extension penalties -- confirm
int o_ins, e_ins; // passed to ksw_align2() by mem_matesw(); presumably insertion gap open/extension penalties -- confirm
int pen_unpaired; // phred-scaled penalty for unpaired reads
int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
int w; // band width
int zdrop; // Z-dropoff
uint64_t max_mem_intv;
int T; // output score threshold; only affecting output
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
int min_chain_weight;
int max_chain_extend;
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
int split_width; // split into a seed if its occurrence is smaller than this value
int max_occ; // skip a seed if its occurrence is larger than this value
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int chunk_size; // process chunk_size-bp sequences in a batch
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
float mask_level_redun;
float mapQ_coef_len;
int mapQ_coef_fac;
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all; mem_gen_alt() applies the _alt limit when one of the hits is flagged is_alt
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
/* One aligned region (hit) of a query; CIGAR is not stored here and is
 * generated later by mem_reg2aln(). */
typedef struct {
int64_t rb, re; // [rb,re): reference sequence in the alignment
int qb, qe; // [qb,qe): query sequence in the alignment
int rid; // reference seq ID
int score; // best local SW score
int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
int sub; // 2nd best SW score
int alt_sc;
int csub; // SW score of a tandem hit
int sub_n; // approximate number of suboptimal hits
int w; // actual band width used in extension
int seedcov; // length of regions covered by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int secondary_all; // read as a parent-hit index (<0 if none) by get_pri_idx() and mem_sam_pe(); presumably like $secondary but computed over all hits -- confirm in bwamem.c
int seedlen0; // length of the starting seed
int n_comp:30, is_alt:2; // n_comp: number of sub-alignments chained together; is_alt: non-zero marks an ALT hit (see its use in mem_gen_alt())
float frac_rep;
uint64_t hash;
} mem_alnreg_t;
/* Growable array of aligned regions; it is used with kv_push() in bwamem_pair.c, with n the number of elements in use. */
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
/* Insert-size statistics for one read-pair orientation. mem_pestat() fills an
 * array of four of these, one per FF, FR, RF and RR orientation. */
typedef struct {
int low, high; // lower and upper bounds within which a read pair is considered to be properly paired
int failed; // non-zero if the orientation is not supported by sufficient data
double avg, std; // mean and stddev of the insert size distribution
} mem_pestat_t;
/* Final per-hit alignment record as returned by mem_reg2aln(). */
typedef struct { // This struct is only used for the convenience of API.
int64_t pos; // forward strand 5'-end mapping position
int rid; // reference sequence index in bntseq_t; <0 for unmapped
int flag; // extra flag
uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234; callers in this code base release it with free()
char *XA; // alternative mappings; assigned by the caller (see mem_sam_pe()), not owned by this struct there
int score, sub, alt_sc;
} mem_aln_t;
#ifdef __cplusplus
extern "C" {
#endif
/* SMEM iterator interface (implemented in bwamem_extra.c).
 * Typical order: smem_itr_init(), optionally smem_config(), smem_set_query(),
 * then smem_next() repeatedly until it returns NULL, and smem_itr_destroy(). */
smem_i *smem_itr_init(const bwt_t *bwt); // allocate an iterator bound to $bwt (min_intv=1, max_len=INT_MAX, max_intv=0)
void smem_itr_destroy(smem_i *itr); // free the iterator and its internal vectors
void smem_set_query(smem_i *itr, int len, const uint8_t *query); // set the query and rewind to position 0; $query is borrowed, not copied
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv); // override the search parameters
const bwtintv_v *smem_next(smem_i *itr); // matches found from the current position; NULL when the query is exhausted. The vector is owned by the iterator and reset on the next call
mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]); // presumably fills the scoring matrix from match score $a and mismatch penalty $b -- confirm in bwamem.c
/**
* Align a batch of sequences and generate the alignments in the SAM format
*
* This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
* Note that $seqs[i].sam may consist of several SAM lines if the
* corresponding sequence has multiple primary hits.
*
* In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
* sequences must be interleaved: $n must be an even number and the 2i-th
* sequence and the (2i+1)-th sequence constitute a read pair. In this
* mode, there should be enough (typically >50) unique pairs for the
* routine to infer the orientation and insert size.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param n_processed NOTE(review): not described in the original comment;
*                    presumably the number of sequences handled in earlier
*                    batches -- confirm against the definition in bwamem.c
* @param n number of query sequences
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
* @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
* corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR. CIGAR should be
* generated later by mem_reg2aln() below.
*
* The query is copied internally, so $seq itself is left unmodified, and
* mem_mark_primary_se() has already been applied to the returned regions.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param l_seq length of query sequence
* @param seq query sequence
*
* @return list of aligned regions.
*/
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq);
/**
* Generate CIGAR and forward-strand position from alignment region
*
* Callers in this code base release the returned $cigar with free().
*
* @param opt alignment parameters
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param l_seq length of query sequence
* @param seq query sequence
* @param ar one alignment region; mem_sam_pe() passes NULL here for a read
*           without a reportable hit
*
* @return CIGAR, strand, mapping quality and forward-strand position
*/
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
/* Variant of mem_reg2aln() taking an extra $name argument; presumably the read name -- confirm its use in bwamem.c. */
mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name);
/**
* Infer the insert size distribution from interleaved alignment regions
*
* This function can be called after mem_align1(), as long as paired-end
* reads are properly interleaved.
*
* Progress and the estimated statistics are reported on stderr. An
* orientation without enough supporting pairs gets $failed set to non-zero.
*
* @param opt alignment parameters
* @param l_pac length of concatenated reference sequence
* @param n number of query sequences; must be an even number
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
* @param pes inferred insert size distribution (output); indexed by
*            orientation in the order FF, FR, RF, RR
*/
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
#ifdef __cplusplus
}
#endif
#endif

172
bwamem_extra.c 100644
View File

@ -0,0 +1,172 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <limits.h>
#include "bwa.h"
#include "bwamem.h"
#include "bntseq.h"
#include "kstring.h"
/***************************
* SMEM iterator interface *
***************************/
/* State of one SMEM iterator; opaque to users of bwamem.h. */
struct __smem_i {
const bwt_t *bwt; // index to search; borrowed, never freed by the iterator
const uint8_t *query; // current query set by smem_set_query(); borrowed; values > 3 are skipped as ambiguous bases
int start, len; // next position to search from, and the query length
int min_intv, max_len; // search parameters; in the code visible here max_len is stored by smem_config() but not read by smem_next()
uint64_t max_intv; // forwarded to bwt_smem1a() by smem_next()
bwtintv_v *matches; // matches; to be returned by smem_next()
bwtintv_v *sub; // sub-matches inside the longest match; temporary
bwtintv_v *tmpvec[2]; // temporary arrays
};
/**
 * Allocate an SMEM iterator bound to $bwt.
 *
 * The iterator starts with min_intv = 1, max_len = INT_MAX and max_intv = 0;
 * use smem_config() to change them and smem_set_query() to attach a query.
 *
 * @param bwt FM-index to search; borrowed, must outlive the iterator
 *
 * @return a new iterator to be released with smem_itr_destroy(), or NULL if
 *         any of the allocations fails (nothing is leaked in that case)
 */
smem_i *smem_itr_init(const bwt_t *bwt)
{
	smem_i *itr = calloc(1, sizeof(smem_i));
	if (itr == 0) return 0;
	itr->bwt = bwt;
	itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
	itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
	itr->matches = calloc(1, sizeof(bwtintv_v));
	itr->sub = calloc(1, sizeof(bwtintv_v));
	if (itr->tmpvec[0] == 0 || itr->tmpvec[1] == 0 || itr->matches == 0 || itr->sub == 0) {
		// partial failure: release whatever was obtained; free() accepts NULL
		free(itr->tmpvec[0]); free(itr->tmpvec[1]);
		free(itr->matches); free(itr->sub);
		free(itr);
		return 0;
	}
	itr->min_intv = 1;
	itr->max_len = INT_MAX;
	itr->max_intv = 0;
	return itr;
}
/**
 * Release an iterator created by smem_itr_init().
 *
 * Frees the four internal interval vectors (both their element arrays and the
 * vector headers) and then the iterator itself. The bwt and the query are
 * borrowed and are not touched.
 *
 * @param itr iterator to destroy; NULL is accepted and ignored, like free()
 */
void smem_itr_destroy(smem_i *itr)
{
	int k;
	if (itr == 0) return;
	for (k = 0; k < 2; ++k) {
		if (itr->tmpvec[k]) free(itr->tmpvec[k]->a);
		free(itr->tmpvec[k]);
	}
	if (itr->matches) free(itr->matches->a);
	free(itr->matches);
	if (itr->sub) free(itr->sub->a);
	free(itr->sub);
	free(itr);
}
/**
 * Attach a query to the iterator and rewind it.
 *
 * @param itr   iterator to update
 * @param len   number of bases in $query
 * @param query base codes; the pointer is stored, the data is not copied
 */
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
{
	itr->len = len;
	itr->query = query;
	itr->start = 0; // the next smem_next() searches from the first base
}
/**
 * Override the search parameters of an iterator.
 *
 * @param itr      iterator to update
 * @param min_intv stored as the iterator's min_intv
 * @param max_len  stored as the iterator's max_len
 * @param max_intv stored as the iterator's max_intv
 */
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
{
	itr->max_intv = max_intv;
	itr->max_len = max_len;
	itr->min_intv = min_intv;
}
/**
 * Advance the iterator and return the matches found from its current position.
 *
 * @param itr iterator with a query attached by smem_set_query()
 *
 * @return the iterator-owned match vector (overwritten by the next call), or
 *         NULL once the query is exhausted or only ambiguous bases remain
 */
const bwtintv_v *smem_next(smem_i *itr)
{
	int from;
	// every call starts with empty result and scratch vectors
	itr->sub->n = 0;
	itr->matches->n = 0;
	itr->tmpvec[1]->n = 0;
	itr->tmpvec[0]->n = 0;
	if (itr->start < 0 || itr->start >= itr->len) return 0;
	// skip ambiguous bases (codes above 3)
	for (; itr->start < itr->len; ++itr->start)
		if (itr->query[itr->start] <= 3) break;
	if (itr->start == itr->len) return 0;
	from = itr->start;
	// search for SMEM; the return value is where the following call resumes
	itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, from, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec);
	return itr->matches;
}
/***********************
*** Extra functions ***
***********************/
/**
 * Find the aligned regions for one query sequence.
 *
 * The difference from mem_align1_core() is that this routine 1) calls
 * mem_mark_primary_se() on the result and 2) does not modify the input
 * sequence: it works on a private copy of $seq_.
 *
 * @param opt   alignment parameters
 * @param bwt   FM-index of the reference sequence
 * @param bns   information of the reference
 * @param pac   2-bit encoded reference
 * @param l_seq length of the query sequence
 * @param seq_  query sequence; left untouched
 *
 * @return list of aligned regions (no CIGAR; see mem_reg2aln())
 */
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
{
	extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
	// declared with an int return type, matching the declaration (and the use of the return value) in bwamem_pair.c
	extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
	mem_alnreg_v ar;
	char *seq;
	seq = malloc(l_seq);
	memcpy(seq, seq_, l_seq); // makes a copy of seq_
	ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
	mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); // the id argument is a pseudo-random value; the returned count is not needed here
	free(seq);
	return ar;
}
/**
 * Return the parent hit under which region $i is listed in the XA tag.
 *
 * @param XA_drop_ratio minimum score, as a fraction of the parent's score
 * @param a             array of regions
 * @param i             index of the region to examine
 *
 * @return a[i].secondary_all if it is non-negative and a[i] scores at least
 *         XA_drop_ratio times the parent's score; -1 otherwise
 */
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
{
	const int parent = a[i].secondary_all;
	if (parent < 0) return -1;
	return a[i].score >= a[parent].score * XA_drop_ratio ? parent : -1;
}
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
/**
 * Build the XA strings (alternative hits) for every region of one query.
 *
 * Each secondary hit accepted by get_pri_idx() is appended to the string of
 * its parent as "name,<strand><pos>,<cigar>,<NM>;" -- with ",<score>,<mapq>"
 * inserted before the ';' when MEM_F_XB is set.
 *
 * @param opt     alignment parameters (XA_drop_ratio, max_XA_hits, max_XA_hits_alt, flag)
 * @param bns     information of the reference
 * @param pac     2-bit encoded reference
 * @param a       regions of the query
 * @param l_query length of the query
 * @param query   query sequence
 *
 * @return NULL if no region has an accepted parent; otherwise an array of
 *         a->n strings indexed by region, where entries without alternative
 *         hits are NULL. mem_sam_pe() frees each entry and then the array.
 */
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{
int i, k, r, *cnt, tot;
kstring_t *aln = 0, str = {0,0,0};
char **XA = 0, *has_alt;
cnt = calloc(a->n, sizeof(int));
has_alt = calloc(a->n, 1);
// first pass: per parent, count the accepted hits and note whether any of them is an ALT hit
for (i = 0, tot = 0; i < a->n; ++i) {
r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
if (r >= 0) {
++cnt[r], ++tot;
if (a->a[i].is_alt) has_alt[r] = 1;
}
}
if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t));
// second pass: format each accepted hit and append it to its parent's string
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
// too many hits for this parent: emit none of them (the ALT limit applies when an ALT hit is present)
if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
str.l = 0; // reuse the scratch buffer for this entry
kputs(bns->anns[t.rid].name, &str);
kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str); // position is written 1-based
kputc(',', &str);
for (k = 0; k < t.n_cigar; ++k) {
kputw(t.cigar[k]>>4, &str);
kputc("MIDSHN"[t.cigar[k]&0xf], &str);
}
kputc(',', &str); kputw(t.NM, &str);
if (opt->flag & MEM_F_XB) {
kputc(',', &str);
kputw(t.score, &str);
kputc(',', &str);
kputw(t.mapq, &str);
}
kputc(';', &str);
free(t.cigar);
kputsn(str.s, str.l, &aln[r]);
}
XA = calloc(a->n, sizeof(char*));
// hand the accumulated buffers over to XA; only the kstring_t headers in aln are freed below
for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s;
end_gen_alt:
free(has_alt); free(cnt); free(aln); free(str.s);
return XA;
}

419
bwamem_pair.c 100644
View File

@ -0,0 +1,419 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "kstring.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
#include "ksw.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Tuning constants used by mem_pestat() */
#define MIN_RATIO 0.8 // a read is used only if cal_sub() is at most MIN_RATIO times its top score
#define MIN_DIR_CNT 10 // minimum number of candidate pairs needed to analyse an orientation
#define MIN_DIR_RATIO 0.05 // an orientation fails if it has fewer pairs than this fraction of the most populated one
#define OUTLIER_BOUND 2.0 // multiple of the inter-quartile range kept when computing mean and std.dev
#define MAPPING_BOUND 3.0 // multiple of the inter-quartile range used for the proper-pair boundaries
#define MAX_STDDEV 4.0 // the proper-pair boundaries are widened to cover avg +/- MAX_STDDEV * std
/**
 * Classify the relative orientation of two hits and measure their distance.
 *
 * Coordinates at or beyond $l_pac denote the reverse strand. The second
 * coordinate is first projected onto the strand of the first one.
 *
 * @param l_pac length of the forward reference
 * @param b1    start coordinate of the first hit
 * @param b2    start coordinate of the second hit
 * @param dist  (out) absolute distance between $b1 and the projected $b2
 *
 * @return orientation code in 0..3: bit 0 is set when the hits are on
 *         different strands; the code is XOR-ed with 3 when the projected
 *         second hit does not lie after the first one
 */
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
{
	const int same_strand = ((b1 >= l_pac) == (b2 >= l_pac));
	// coordinate of read 2 expressed on the strand of read 1
	const int64_t mate = same_strand ? b2 : (l_pac << 1) - 1 - b2;
	int code = same_strand ? 0 : 1;
	if (mate > b1) {
		*dist = mate - b1;
	} else {
		*dist = b1 - mate;
		code ^= 3;
	}
	return code;
}
/**
 * Score of the first lower-ranked hit that significantly overlaps the top hit
 * on the query.
 *
 * Two hits overlap significantly when their shared query interval is at least
 * mask_level times the shorter of the two aligned query lengths.
 *
 * @param opt alignment parameters (mask_level, min_seed_len, a)
 * @param r   hits of one read; r->a[0] is taken as the top hit
 *
 * @return the score of that hit, or min_seed_len * a when there is none
 */
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
{
	int j;
	for (j = 1; j < r->n; ++j) { // choose unique alignment
		const mem_alnreg_t *top = &r->a[0], *cur = &r->a[j];
		int ov_beg = cur->qb > top->qb ? cur->qb : top->qb;
		int ov_end = cur->qe < top->qe ? cur->qe : top->qe;
		int len_cur, len_top, shorter;
		if (ov_end <= ov_beg) continue; // no overlap on the query
		len_cur = cur->qe - cur->qb;
		len_top = top->qe - top->qb;
		shorter = len_cur < len_top ? len_cur : len_top;
		if (ov_end - ov_beg >= shorter * opt->mask_level) break; // significant overlap
	}
	if (j < r->n) return r->a[j].score;
	return opt->min_seed_len * opt->a;
}
/**
 * Infer the insert size distribution from interleaved alignment regions.
 *
 * For every read pair whose two top hits are sufficiently unique (see
 * cal_sub() and MIN_RATIO) and lie on the same reference sequence, the
 * distance between the hits is recorded under the orientation returned by
 * mem_infer_dir(). Each orientation with at least MIN_DIR_CNT samples then
 * gets quartile-based outlier bounds, mean, standard deviation and the
 * proper-pair boundaries. Progress is reported on stderr.
 *
 * @param opt   alignment parameters (max_ins, mask_level, min_seed_len, a)
 * @param l_pac length of the concatenated reference sequence
 * @param n     number of query sequences; the 2i-th and (2i+1)-th form a pair
 * @param regs  region array of size $n
 * @param pes   (out) statistics per orientation, in the order FF, FR, RF, RR;
 *              $failed is set for orientations lacking support
 */
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{
	int i, d, max;
	uint64_v isize[4];
	memset(pes, 0, 4 * sizeof(mem_pestat_t));
	memset(isize, 0, sizeof(isize)); // size taken from the array itself rather than from an unrelated vector type
	for (i = 0; i < n>>1; ++i) {
		int dir;
		int64_t is;
		mem_alnreg_v *r[2];
		r[0] = (mem_alnreg_v*)&regs[i<<1|0];
		r[1] = (mem_alnreg_v*)&regs[i<<1|1];
		if (r[0]->n == 0 || r[1]->n == 0) continue;
		if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
		if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
		if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
		dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
		if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
	}
	// the counts are size_t; cast them to match the %ld conversions
	if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, (long)isize[0].n, (long)isize[1].n, (long)isize[2].n, (long)isize[3].n);
	for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
		mem_pestat_t *r = &pes[d];
		uint64_v *q = &isize[d];
		int p25, p50, p75, x;
		if (q->n < MIN_DIR_CNT) {
			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
			r->failed = 1;
			free(q->a);
			continue;
		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		ks_introsort_64(q->n, q->a);
		p25 = q->a[(int)(.25 * q->n + .499)];
		p50 = q->a[(int)(.50 * q->n + .499)];
		p75 = q->a[(int)(.75 * q->n + .499)];
		// first pass: bounds used only to exclude outliers from mean and std.dev
		r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
		if (r->low < 1) r->low = 1;
		r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
		fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
		fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
		for (i = x = 0, r->avg = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->avg += q->a[i], ++x;
		r->avg /= x;
		for (i = 0, r->std = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
		r->std = sqrt(r->std / x);
		fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
		// second pass: final proper-pair boundaries, widened to cover avg +/- MAX_STDDEV * std
		r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
		r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
		if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
		if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
		if (r->low < 1) r->low = 1;
		fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
		free(q->a);
	}
	// the element arrays are gone by now; only the counts are read below
	for (d = 0, max = 0; d < 4; ++d)
		max = max > isize[d].n? max : isize[d].n;
	for (d = 0; d < 4; ++d)
		if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
			pes[d].failed = 1;
			fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		}
}
/**
 * Mate rescue: try to place the mate of hit $a by Smith-Waterman inside the
 * reference window implied by the insert-size statistics.
 *
 * For every orientation that has valid statistics and is not already
 * supported by an existing mate hit in $ma, the window [rb,re) is derived
 * from a->rb and pes[r].low/high, the mate (reverse-complemented when needed)
 * is aligned against it, and a sufficiently good alignment is inserted into
 * $ma in score order.
 *
 * @param opt  alignment parameters
 * @param bns  information of the reference
 * @param pac  2-bit encoded reference
 * @param pes  insert-size statistics per orientation
 * @param a    anchoring hit of the other end
 * @param l_ms length of the mate sequence
 * @param ms   mate sequence; values below 4 are bases, anything else is ambiguous
 * @param ma   (in/out) hits of the mate
 *
 * @return number of SW alignments performed
 */
int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
{
	extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
	int64_t l_pac = bns->l_pac;
	int i, r, skip[4], n = 0, rid;
	for (r = 0; r < 4; ++r)
		skip[r] = pes[r].failed? 1 : 0;
	for (i = 0; i < ma->n; ++i) { // check which orientation has been found
		int64_t dist;
		r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
		if (dist >= pes[r].low && dist <= pes[r].high)
			skip[r] = 1;
	}
	if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW
	for (r = 0; r < 4; ++r) {
		int is_rev, is_larger;
		uint8_t *seq, *rev = 0, *ref = 0;
		int64_t rb, re;
		if (skip[r]) continue;
		is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
		is_larger = !(r>>1); // whether the mate has larger coordinate
		if (is_rev) {
			rev = malloc(l_ms); // this is the reverse complement of $ms
			for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
			seq = rev;
		} else seq = (uint8_t*)ms;
		if (!is_rev) {
			rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
			re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
		} else {
			rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
			re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
		}
		if (rb < 0) rb = 0;
		if (re > l_pac<<1) re = l_pac<<1;
		if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
		// $rid is only assigned when a window was fetched; test $ref first so that
		// an uninitialised (or stale, from a previous orientation) value is never read
		if (ref != 0 && a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening
			kswr_t aln;
			mem_alnreg_t b;
			int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
			aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
			memset(&b, 0, sizeof(mem_alnreg_t));
			if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
				b.rid = a->rid;
				b.is_alt = a->is_alt;
				// map the SW coordinates back to the original mate and to the double-strand coordinate system
				b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
				b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
				b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
				b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1;
				b.score = aln.score;
				b.csub = aln.score2;
				b.secondary = -1;
				b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
				// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
				kv_push(mem_alnreg_t, *ma, b); // make room for a new element
				// move b s.t. ma is sorted
				for (i = 0; i < ma->n - 1; ++i) // find the insertion point
					if (ma->a[i].score < b.score) break;
				tmp = i;
				for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
				ma->a[i] = b;
			}
			++n;
		}
		if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
		free(rev); // free() accepts NULL
		free(ref);
	}
	return n;
}
/**
 * Find the best-scoring properly paired combination of hits from the two ends.
 *
 * All primary hits of both reads are sorted by reference position. Each hit
 * is then paired with earlier hits of the other read whose distance lies
 * within [pes[dir].low, pes[dir].high]; a pair is scored as the sum of the two
 * hit scores plus an insert-size term derived from the distance's deviation
 * from pes[dir].avg.
 *
 * @param opt   alignment parameters
 * @param bns   information of the reference
 * @param pac   2-bit encoded reference (not used in the code below)
 * @param pes   insert-size statistics per orientation
 * @param s     the two reads (not used in the code below)
 * @param a     hits of the two reads
 * @param id    pair identifier, mixed into the tie-breaking hash
 * @param sub   (out) score of the second best pair, 0 if none
 * @param n_sub (out) number of non-best pairs scoring within max(a+b, o_del+e_del, o_ins+e_ins) of $sub
 * @param z     (out) z[r] is the index in a[r] of the hit chosen for read r; untouched when no pair is found
 * @param n_pri n_pri[r] is the number of leading hits of a[r] to consider
 *
 * @return score of the best pair, or 0 if no proper pair exists
 */
int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
{
pair64_v v, u;
int r, i, k, y[4], ret; // y[] keeps the last hit
int64_t l_pac = bns->l_pac;
kv_init(v); kv_init(u);
// v: one sortable key per hit.
//   x = rid<<32 | forward-strand position relative to the start of that reference
//   y = score<<32 | hit index<<2 | is-reverse<<1 | read number
for (r = 0; r < 2; ++r) { // loop through read number
for (i = 0; i < n_pri[r]; ++i) {
pair64_t key;
mem_alnreg_t *e = &a[r].a[i];
key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset);
key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
kv_push(pair64_t, v, key);
}
}
ks_introsort_128(v.n, v.a);
// y[strand<<1 | read] = index in v of the most recent hit with that strand and read number
y[0] = y[1] = y[2] = y[3] = -1;
//for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
for (i = 0; i < v.n; ++i) {
for (r = 0; r < 2; ++r) { // loop through direction
int dir = r<<1 | (v.a[i].y>>1&1), which;
if (pes[dir].failed) continue; // invalid orientation
which = r<<1 | ((v.a[i].y&1)^1); // strand r on the other read
if (y[which] < 0) continue; // no previous hits
for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
int64_t dist;
int q;
double ns;
pair64_t *p;
if ((v.a[k].y&3) != which) continue;
dist = (int64_t)v.a[i].x - v.a[k].x;
//printf("%d: %lld\n", k, dist);
if (dist > pes[dir].high) break; // v is sorted, so earlier hits are even farther away
if (dist < pes[dir].low) continue;
ns = (dist - pes[dir].avg) / pes[dir].std; // deviation from the mean insert size in units of std.dev
q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4)
if (q < 0) q = 0;
// u: x = pair score<<32 | low 32 bits of a hash (tie breaker); y = k<<32 | i (indices into v)
p = kv_pushp(pair64_t, u);
p->y = (uint64_t)k<<32 | i;
p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
//printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
}
}
y[v.a[i].y&3] = i;
}
if (u.n) { // found at least one proper pair
// tmp: the largest of a+b, o_del+e_del and o_ins+e_ins
int tmp = opt->a + opt->b;
tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del;
tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins;
ks_introsort_128(u.n, u.a);
// the best pair is the last element after sorting; unpack its two indices into v
i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
// bits 2..31 of y hold the hit index within a[read], bit 0 the read number
z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
z[v.a[k].y&1] = v.a[k].y<<32>>34;
ret = u.a[u.n-1].x >> 32;
*sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
} else ret = 0, *sub = 0, *n_sub = 0;
free(u.a); free(v.a);
return ret;
}
/* Forward declarations of routines defined outside this file and used by mem_sam_pe() */
void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
void mem_reorder_primary5(int T, mem_alnreg_v *a);
/* Convert a score difference into an integer quality value: 6.02 * diff / a plus .499, truncated to int; $a is the match score at the call sites. */
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
kstring_t str;
mem_aln_t h[2], g[2], aa[2][2];
str.l = str.m = 0; str.s = 0;
memset(h, 0, sizeof(mem_aln_t) * 2);
memset(g, 0, sizeof(mem_aln_t) * 2);
n_aa[0] = n_aa[1] = 0;
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
mem_alnreg_v b[2];
kv_init(b[0]); kv_init(b[1]);
for (i = 0; i < 2; ++i)
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
for (i = 0; i < 2; ++i)
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
}
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
if (opt->flag & MEM_F_PRIMARY5) {
mem_reorder_primary5(opt->T, &a[0]);
mem_reorder_primary5(opt->T, &a[1]);
}
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits
if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
int is_multi[2], q_pe, score_un, q_se[2];
char **XA[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
for (j = 1; j < n_pri[i]; ++j)
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
is_multi[i] = j < n_pri[i]? 1 : 0;
}
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un? subo : score_un;
q_pe = raw_mapq(o - subo, opt->a);
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
if (q_pe < 0) q_pe = 0;
if (q_pe > 60) q_pe = 60;
q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
// the following assumes no split hits
if (o > score_un) { // paired alignment is preferred
mem_alnreg_t *c[2];
c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
for (i = 0; i < 2; ++i) {
if (c[i]->secondary >= 0)
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
q_se[i] = mem_approx_mapq_se(opt, c[i]);
}
q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;
q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
extra_flag |= 2;
// cap at the tandem repeat score
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
} else { // the unpaired alignment is preferred
z[0] = z[1] = 0;
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
}
for (i = 0; i < 2; ++i) {
int k = a[i].a[z[i]].secondary_all;
if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
assert(a[i].a[k].secondary_all < 0);
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].secondary_all == k || j == k)
a[i].a[j].secondary_all = z[i];
a[i].a[z[i]].secondary_all = -1;
}
}
if (!(opt->flag & MEM_F_ALL)) {
for (i = 0; i < 2; ++i)
XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
} else XA[0] = XA[1] = 0;
// write SAM
for (i = 0; i < 2; ++i) {
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
h[i].mapq = q_se[i];
h[i].flag |= 0x40<<i | extra_flag;
h[i].XA = XA[i]? XA[i][z[i]] : 0;
aa[i][n_aa[i]++] = h[i];
if (n_pri[i] < a[i].n) { // the read has ALT hits
mem_alnreg_t *p = &a[i].a[n_pri[i]];
if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) continue;
g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
g[i].flag |= 0x800 | 0x40<<i | extra_flag;
g[i].XA = XA[i]? XA[i][n_pri[i]] : 0;
aa[i][n_aa[i]++] = g[i];
}
}
for (i = 0; i < n_aa[0]; ++i)
mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
s[0].sam = strdup(str.s); str.l = 0;
for (i = 0; i < n_aa[1]; ++i)
mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
s[1].sam = str.s;
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
// free
for (i = 0; i < 2; ++i) {
free(h[i].cigar); free(g[i].cigar);
if (XA[i] == 0) continue;
for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
free(XA[i]);
}
} else goto no_pairing;
return n;
no_pairing:
for (i = 0; i < 2; ++i) {
int which = -1;
if (a[i].n) {
if (a[i].a[0].score >= opt->T) which = 0;
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
which = n_pri[i];
}
if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
}
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
int64_t dist;
int d;
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
}
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); free(h[1].cigar);
return n;
}

784
bwape.c 100644
View File

@ -0,0 +1,784 @@
#include <unistd.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include "bwtaln.h"
#include "kvec.h"
#include "bntseq.h"
#include "utils.h"
#include "bwase.h"
#include "bwa.h"
#include "ksw.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
// List of reference positions cached for one SA interval; this is the value
// type of the b128 hash table. In the (currently disabled) cache path of
// bwa_cal_pac_pos_pe() every entry of a[] is stored as (position<<1 | strand).
typedef struct {
int n; // number of entries in a[]
bwtint_t *a; // heap array of n entries; released in bwa_sai2sam_pe_core()
} poslist_t;
// Insert-size statistics of one batch of read pairs, filled in by infer_isize().
// When no estimate is available avg and std are -1.0 and low/high/high_bayesian are 0.
typedef struct {
// avg, std: mean and standard deviation of the insert sizes lying in [low, high]
// ap_prior: prior derived from the fraction of pairs whose insert size exceeds
//           high_bayesian, never smaller than the value passed to infer_isize()
double avg, std, ap_prior;
// low, high: window used when estimating avg and std
// high_bayesian: largest insert size accepted when pairing() checks a candidate pair
bwtint_t low, high, high_bayesian;
} isize_info_t;
// Callbacks for hashing pair64_t keys: two keys are equal when both 64-bit
// halves match; the hash value is the low 32 bits of the x half.
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
#define b128_hash(a) ((uint32_t)(a).x)
#include "khash.h"
// Instantiate the khash table type kh_b128_t, keyed by pair64_t and holding
// poslist_t values (see g_hash below).
KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
// Scratch buffers reused across read pairs by bwa_cal_pac_pos_pe().
typedef struct {
// Candidate hits of both ends of the current pair. For each element,
// x is the reference position and y packs (alignment index<<2 | strand<<1 | end).
// pairing() sorts this array before scanning it.
pair64_v arr;
// NOTE(review): only destroyed, never filled, in the code visible here.
pair64_v pos[2];
// Alignments of end 0 and end 1 of the current read, as read from the SAI files.
kvec_t(bwt_aln1_t) aln[2];
} pe_data_t;
// SA intervals at least this wide would be cached in g_hash; the branch that
// uses it in bwa_cal_pac_pos_pe() is currently disabled with `0 &&'.
#define MIN_HASH_WIDTH 1000
extern int g_log_n[256]; // in bwase.c
// Cache from SA interval (k, l) to its list of positions; created and destroyed
// in bwa_sai2sam_pe_core().
static kh_b128_t *g_hash;
// Prototypes of functions used in this file; their definitions are not all
// visible from here (bwa_aln2seq_core, bwa_aln2seq and bwa_approx_mapQ are in bwase.c).
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
bntseq_t *bwa_open_nt(const char *prefix);
void bwa_print_sam_SQ(const bntseq_t *bns);
/*
 * Allocate a pe_opt_t holding the default `sampe' settings.
 *
 * The object comes from calloc(), so every field not assigned below starts
 * out as zero. The caller owns the result and releases it with free().
 */
pe_opt_t *bwa_init_pe_opt()
{
    pe_opt_t *defaults = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));

    /* pairing model */
    defaults->type = BWA_PET_STD;
    defaults->is_sw = 1;
    defaults->ap_prior = 1e-5;

    /* insert size */
    defaults->max_isize = 500;
    defaults->force_isize = 0;

    /* limits on occurrences and reported hits */
    defaults->max_occ = 100000;
    defaults->n_multi = 3;
    defaults->N_multi = 10;

    return defaults;
}
/*
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
{
const double a = 0.140012;
double b, c;
b = log(x * (2 - x));
c = 2./M_PI/a + b / 2.;
return sqrt(sqrt(c * c - b / a) - c);
}
*/
// Multiplier applied to the inter-quartile range (p75 - p25) by infer_isize():
// insert sizes outside [p25 - OUTLIER_BOUND*IQR, p75 + OUTLIER_BOUND*IQR] are
// left out when the mean and standard deviation are estimated.
// for normal distribution, this is about 3std
#define OUTLIER_BOUND 2.0
/*
 * Estimate the insert-size distribution of one batch of read pairs.
 *
 * Only pairs whose two ends both have mapQ >= 20 and whose outer distance is
 * below 100000 contribute. The 25/50/75 percentiles define an outlier window
 * [low, high]; the mean, standard deviation, skewness and kurtosis are
 * computed from the insert sizes inside that window. high_bayesian is then the
 * smallest mean + y*std (y in [1, 10)) at which the normal tail probability
 * drops below ap_prior / L * (insert size).
 *
 * n_seqs    number of pairs in the batch
 * seqs      seqs[0][i] and seqs[1][i] are the two ends of pair i
 * ii        output; on failure avg = std = -1.0, low = high = high_bayesian = 0
 *           and ap_prior is the value passed in
 * ap_prior  lower bound of the prior stored in ii->ap_prior
 * L         length used to scale the prior (the caller passes bwt->seq_len/2)
 *
 * Returns 0 on success and -1 when no estimate could be made.
 *
 * Differences from the previous implementation:
 *   - the sum of squared deviations is accumulated in its own variable that
 *     starts at 0; it used to be added on top of the -1.0 sentinel stored in
 *     ii->std, which biased the variance;
 *   - a missing or non-positive variance is rejected before it is used, so a
 *     NaN is never converted to an integer and pairing() never divides by a
 *     zero standard deviation;
 *   - high_bayesian is computed once instead of twice;
 *   - ii->ap_prior is defined on every return path.
 */
static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
{
    uint64_t x, *isizes, n_ap = 0;
    int n, i, tot, p25, p75, p50, max_len = 1, tmp;
    double skewness = 0.0, kurtosis = 0.0, sum_sq = 0.0, variance = 0.0, y;

    ii->avg = ii->std = -1.0;
    ii->ap_prior = ap_prior;
    ii->low = ii->high = ii->high_bayesian = 0;
    isizes = (uint64_t*)calloc(n_seqs, 8);
    // collect the outer distance of confidently mapped pairs
    for (i = 0, tot = 0; i != n_seqs; ++i) {
        bwa_seq_t *p[2];
        p[0] = seqs[0] + i; p[1] = seqs[1] + i;
        if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
            x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
            if (x < 100000) isizes[tot++] = x;
        }
        if (p[0]->len > max_len) max_len = p[0]->len;
        if (p[1]->len > max_len) max_len = p[1]->len;
    }
    if (tot < 20) {
        fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
        free(isizes);
        return -1;
    }
    // percentiles and the outlier window derived from them
    ks_introsort_64(tot, isizes);
    p25 = isizes[(int)(tot*0.25 + 0.5)];
    p50 = isizes[(int)(tot*0.50 + 0.5)];
    p75 = isizes[(int)(tot*0.75 + 0.5)];
    tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
    ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
    ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
    if (ii->low > ii->high) {
        fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n");
        free(isizes);
        return -1;
    }
    // first pass: count and sum of the insert sizes inside the window
    for (i = 0, x = n = 0; i < tot; ++i)
        if (isizes[i] >= ii->low && isizes[i] <= ii->high)
            ++n, x += isizes[i];
    // second pass: central moments around the mean
    if (n > 0) {
        const double mean = (double)x / n;
        for (i = 0; i < tot; ++i) {
            if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
                const double dev = isizes[i] - mean;
                const double sq = dev * dev;
                sum_sq += sq;
                skewness += sq * dev;
                kurtosis += sq * sq;
            }
        }
        variance = sum_sq / n; // it would be better as n-1, but n is usually very large
    }
    fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
    if (!(variance > 0.0) || p75 > 100000) { // also true when the window is empty
        free(isizes);
        ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
        fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
        return -1;
    }
    ii->avg = (double)x / n;
    ii->std = sqrt(variance);
    kurtosis = kurtosis / n / (variance * variance) - 3;
    skewness = skewness / n / (ii->std * ii->std * ii->std);
    // maximum insert size: where the normal tail falls below the scaled prior
    for (y = 1.0; y < 10.0; y += 0.01)
        if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
    ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
    // prior re-estimated from the fraction of pairs beyond that bound
    for (i = 0; i < tot; ++i)
        if (isizes[i] > ii->high_bayesian) ++n_ap;
    ii->ap_prior = .01 * (n_ap + .01) / tot;
    if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
    free(isizes);
    fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high);
    fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
    fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
    fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y);
    return 0;
}
/*
 * Choose the best combination of hits for the two ends of one read pair and
 * update both reads accordingly.
 *
 * p     the two ends of the pair; pos, strand, score, mismatch/gap counts,
 *       mapQ, seQ and extra_flag may be modified
 * d     d->arr holds all candidate hits of both ends (sorted in here);
 *       d->aln[] holds the alignments the hits refer to
 * opt   pairing options; only opt->type == BWA_PET_STD is implemented, any
 *       other type prints a message and exits
 * s_mm  mismatch penalty; a sub-optimal pair more than s_mm*10 worse than the
 *       best one is treated as a poor competitor
 * ii    insert-size estimate; when ii->high is 0, opt->max_isize is used as
 *       the upper bound instead
 *
 * Returns the number of ends that had mapQ > 0 and were moved to another
 * position or strand.
 */
static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
{
int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
uint64_t o_score, subo_score;
pair64_t last_pos[2][2], o_pos[2];
max_len = p[0]->full_len;
if (max_len < p[1]->full_len) max_len = p[1]->full_len;
// NOTE(review): low_bound is computed here but not read again in this function
if (low_bound < max_len) low_bound = max_len;
// here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
// __pairing_aux(u, v): evaluate the pair formed by hits u and v. In a hit,
// y&1 is the end, y>>1&1 the strand and y>>2 the index into d->aln[end].a.
// l is the span from u to the end of v's read. An accepted pair gets the
// combined score s = 10 * (sum of the two alignment scores) plus, when an
// insert-size estimate exists, a penalty computed from the normal tail
// probability of l. Smaller is better. The upper 32 bits of s hold that
// score and the lower 32 bits a hash of the two positions, which breaks ties.
// o_n counts pairs sharing the best score, subo_n all other accepted pairs.
#define __pairing_aux(u,v) do { \
bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \
if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \
&& ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
{ \
uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \
s *= 10; \
if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \
if (s>>32 == o_score>>32) ++o_n; \
else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
else ++subo_n; \
if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \
else if (s < subo_score) subo_score = s; \
} \
} while (0)
// __pairing_aux2(q, w): flag read q as properly paired and, when the chosen
// hit w differs from q's current position or strand, copy the hit's
// mismatch/gap counts, score, strand and position into q. A moved read that
// had mapQ > 0 is counted in cnt_chg.
#define __pairing_aux2(q, w) do { \
const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \
(q)->extra_flag |= SAM_FPP; \
if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \
(q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \
(q)->score = r->score; \
(q)->pos = (w).x; \
if ((q)->mapQ > 0) ++cnt_chg; \
} \
} while (0)
// (uint64_t)-1 means "no pair found yet"
o_score = subo_score = (uint64_t)-1;
o_n = subo_n = 0;
ks_introsort_128(d->arr.n, d->arr.a);
// last_pos[end] remembers the two most recent forward-strand hits of that end
for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
if (opt->type == BWA_PET_STD) {
for (i = 0; i < d->arr.n; ++i) {
pair64_t x = d->arr.a[i];
int strand = x.y>>1&1;
if (strand == 1) { // reverse strand, then check
// try to pair with the two latest forward hits of the other end
int y = 1 - (x.y&1);
__pairing_aux(last_pos[y][1], x);
__pairing_aux(last_pos[y][0], x);
} else { // forward strand, then push
last_pos[x.y&1][0] = last_pos[x.y&1][1];
last_pos[x.y&1][1] = x;
}
}
} else {
fprintf(stderr, "[paring] not implemented yet!\n");
exit(1);
}
// set pairing
//fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
if (o_score != (uint64_t)-1) {
int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
//fprintf(stderr, "%d, %d\n", o_n, subo_n);
// mapQ_p stays 0 when several pairs tie for the best score
if (o_n == 1) {
if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
else {
int n = subo_n > 255? 255 : subo_n;
mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
if (mapQ_p < 0) mapQ_p = 0;
}
}
if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved
if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
// both ends unique: the pair gets the sum of the two mapQ, capped at 60
int mapQ = p[0]->mapQ + p[1]->mapQ;
if (mapQ > 60) mapQ = 60;
p[0]->mapQ = p[1]->mapQ = mapQ;
} else {
// an end with mapQ 0 borrows from its mate, limited to mapQ_p + 7
if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
}
} else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved
p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
} else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved
p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
} else { // both ends moved
p[0]->seQ = p[1]->seQ = 0;
mapQ_p -= 20;
if (mapQ_p < 0) mapQ_p = 0;
p[0]->mapQ = p[1]->mapQ = mapQ_p;
}
// finally move the reads to the chosen hits
__pairing_aux2(p[0], o_pos[0]);
__pairing_aux2(p[1], o_pos[1]);
}
return cnt_chg;
}
// Per-read backup of the alignments read from a SAI file. bwa_cal_pac_pos_pe()
// fills one per read in its single-end pass and copies it back in the
// paired-end pass.
typedef struct {
kvec_t(bwt_aln1_t) aln;
} aln_buf_t;
/*
 * Convert the SAI alignments of one batch of read pairs into reference
 * coordinates and pair the two ends.
 *
 *   1. SE pass: for every read of both ends, read its alignments from
 *      fp_sa[j], keep a per-read backup, choose the main hit and compute its
 *      single-end mapQ and position.
 *   2. Infer the insert-size distribution into *ii; fall back to *last_ii when
 *      that fails, and discard the estimate when opt->force_isize is set.
 *   3. PE pass: for pairs with both ends mapped, enumerate the positions of
 *      all hits and call pairing(); then collect the multiple hits to report.
 *
 * bns      reference metadata
 * prefix   index prefix; "<prefix>.bwt" and "<prefix>.sa" are loaded from it
 *          when _bwt is NULL
 * _bwt     preloaded BWT/SA, or NULL to load and destroy one locally
 * n_seqs   number of pairs; seqs[0] and seqs[1] hold the two ends
 * fp_sa    the two SAI streams, positioned at the first read of the batch
 * ii       output insert-size estimate
 * opt      paired-end options
 * gopt     options recorded by `aln'
 * last_ii  estimate of the previous batch
 *
 * Returns the sum of the values returned by pairing().
 *
 * The index file names are built in a buffer sized from prefix; the earlier
 * fixed 1024-byte buffer overflowed on long paths. A negative alignment count
 * in the SAI stream is reported as corruption instead of being used as a size.
 */
int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
                       const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
{
    int i, j, cnt_chg = 0;
    bwt_t *bwt;
    pe_data_t *d;
    aln_buf_t *buf[2];

    d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
    buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
    buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));

    if (_bwt == 0) { // load forward SA
        size_t l_prefix = strlen(prefix);
        char *fn = (char*)malloc(l_prefix + 5); // longest suffix is ".bwt" plus the terminator
        if (fn == 0) {
            fprintf(stderr, "[%s] out of memory\n", __func__);
            exit(1);
        }
        memcpy(fn, prefix, l_prefix);
        strcpy(fn + l_prefix, ".bwt"); bwt = bwt_restore_bwt(fn);
        strcpy(fn + l_prefix, ".sa"); bwt_restore_sa(fn, bwt);
        free(fn);
    } else bwt = _bwt;

    // SE
    for (i = 0; i != n_seqs; ++i) {
        bwa_seq_t *p[2];
        for (j = 0; j < 2; ++j) {
            int n_aln;
            p[j] = seqs[j] + i;
            p[j]->n_multi = 0;
            p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
            err_fread_noeof(&n_aln, 4, 1, fp_sa[j]);
            if (n_aln < 0) { // would otherwise be converted to a huge unsigned size
                fprintf(stderr, "[%s] corrupted SAI file: negative number of alignments\n", __func__);
                exit(1);
            }
            if (n_aln > kv_max(d->aln[j]))
                kv_resize(bwt_aln1_t, d->aln[j], n_aln);
            d->aln[j].n = n_aln;
            err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
            kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
            // generate SE alignment and mapping quality
            bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
            if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
                int strand;
                int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
                p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
                p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand);
                p[j]->strand = strand;
                if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH;
            }
        }
    }

    // infer isize
    infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2);
    if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
    if (opt->force_isize) {
        fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
        ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
    }

    // PE
    for (i = 0; i != n_seqs; ++i) {
        bwa_seq_t *p[2];
        for (j = 0; j < 2; ++j) {
            p[j] = seqs[j] + i;
            kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
        }
        if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
            && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
        { // only when both ends mapped
            pair64_t x;
            int j, k;
            long long n_occ[2];
            for (j = 0; j < 2; ++j) {
                n_occ[j] = 0;
                for (k = 0; k < d->aln[j].n; ++k)
                    n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
            }
            // too repetitive: skip pairing (and multi-hit collection) for this pair
            if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
            d->arr.n = 0;
            for (j = 0; j < 2; ++j) {
                for (k = 0; k < d->aln[j].n; ++k) {
                    bwt_aln1_t *r = d->aln[j].a + k;
                    bwtint_t l;
                    if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
                        pair64_t key;
                        int ret;
                        key.x = r->k; key.y = r->l;
                        khint_t iter = kh_put(b128, g_hash, key, &ret);
                        if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
                            poslist_t *z = &kh_val(g_hash, iter);
                            z->n = r->l - r->k + 1;
                            z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
                            for (l = r->k; l <= r->l; ++l) {
                                int strand;
                                z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1;
                                z->a[l - r->k] |= strand;
                            }
                        }
                        for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
                            x.x = kh_val(g_hash, iter).a[l]>>1;
                            x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
                            kv_push(pair64_t, d->arr, x);
                        }
                    } else { // then calculate on the fly
                        for (l = r->k; l <= r->l; ++l) {
                            int strand;
                            x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand);
                            x.y = k<<2 | strand<<1 | j;
                            kv_push(pair64_t, d->arr, x);
                        }
                    }
                }
            }
            cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
        }

        if (opt->N_multi || opt->n_multi) {
            for (j = 0; j < 2; ++j) {
                if (p[j]->type != BWA_TYPE_NO_MATCH) {
                    int k, n_multi;
                    // reads that are not properly paired but have a mapped mate may use the larger N_multi limit
                    if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
                        bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
                    } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
                    // convert SA coordinates and drop hits that are invalid or equal to the main hit
                    for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) {
                        int strand;
                        bwt_multi1_t *q = p[j]->multi + k;
                        q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand);
                        q->strand = strand;
                        if (q->pos != p[j]->pos && q->pos != (bwtint_t)-1)
                            p[j]->multi[n_multi++] = *q;
                    }
                    p[j]->n_multi = n_multi;
                }
            }
        }
    }

    // free
    for (i = 0; i < n_seqs; ++i) {
        kv_destroy(buf[0][i].aln);
        kv_destroy(buf[1][i].aln);
    }
    free(buf[0]); free(buf[1]);
    if (_bwt == 0) bwt_destroy(bwt);
    kv_destroy(d->arr);
    kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
    kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
    free(d);
    return cnt_chg;
}
// Threshold used by bwa_sw_core(): the search region, the number of non-N
// bases, the local alignment score and the aligned reference/query lengths
// must all reach this value.
#define SW_MIN_MATCH_LEN 20
// bwa_paired_sw() only attempts mate alignment when at least one end of the
// pair has a mapping quality of at least this value.
#define SW_MIN_MAPQ 17
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
/*
 * Align a read against a window of the packed reference with Smith-Waterman.
 *
 * l_pac    length of the packed reference
 * pacseq   2-bit packed reference sequence
 * len      read length
 * seq      read bases; values >= 4 are counted as N
 * beg      in: start of the reference window; out (on success): reference
 *          coordinate where the alignment starts
 * reglen   length of the reference window
 * n_cigar  out: number of CIGAR operations (set to 0 when an alignment is
 *          rejected after it has been computed)
 * _cnt     out (on success): n_mm<<16 | n_gapo<<8 | n_gape
 *
 * Returns a heap-allocated CIGAR that the caller frees, or 0 when the window
 * is too short, the read has too many N's, or the alignment is rejected.
 */
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
{
kswr_t r;
uint32_t *cigar32 = 0;
bwa_cigar_t *cigar = 0;
ubyte_t *ref_seq;
bwtint_t k, x, y, l;
int xtra, gscore;
int8_t mat[25];
// presumably match score 1 and mismatch penalty 3 -- confirm against bwa_fill_scmat()
bwa_fill_scmat(1, 3, mat);
// check whether there are too many N's
if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
for (k = 0, x = 0; k < len; ++k)
if (seq[k] >= 4) ++x;
if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;
// get reference subsequence
ref_seq = (ubyte_t*)calloc(reglen, 1);
// unpack up to reglen bases starting at *beg; l ends up as the number unpacked
for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
// do alignment
xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0);
// local alignment to find the best region, then a global alignment of that
// region to obtain its CIGAR
r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0);
gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32);
// re-encode the 32-bit CIGAR (op in the low 4 bits, length above) as
// bwa_cigar_t in place, reusing the buffer returned by ksw_global()
cigar = (bwa_cigar_t*)cigar32;
for (k = 0; k < *n_cigar; ++k)
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment
free(cigar); free(ref_seq); *n_cigar = 0;
return 0;
}
// check whether the alignment is good enough
// x: aligned length on the reference; y: aligned length on the read
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
bwa_cigar_t c = cigar[k];
if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
else y += __cigar_len(c);
}
if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
free(cigar); free(ref_seq);
*n_cigar = 0;
return 0;
}
{ // update cigar and coordinate;
int start = r.qb, end = r.qe + 1;
*beg += r.tb;
// room for up to two extra operations (op code 3) covering the unaligned read ends
cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
if (start) {
memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
cigar[0] = __cigar_create(3, start);
++(*n_cigar);
}
if (end < len) {
/*cigar[*n_cigar] = 3<<14 | (len - end);*/
cigar[*n_cigar] = __cigar_create(3, (len - end));
++(*n_cigar);
}
}
{ // set *cnt
int n_mm, n_gapo, n_gape;
n_mm = n_gapo = n_gape = 0;
// walk the alignment; x indexes ref_seq and y indexes seq. Operations other
// than M, D and I (the added end operations) are skipped.
x = r.tb; y = r.qb;
for (k = 0; k < *n_cigar; ++k) {
bwa_cigar_t c = cigar[k];
if (__cigar_op(c) == FROM_M) {
for (l = 0; l < (__cigar_len(c)); ++l)
if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
x += __cigar_len(c), y += __cigar_len(c);
} else if (__cigar_op(c) == FROM_D) {
x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
} else if (__cigar_op(c) == FROM_I) {
y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
}
}
*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
}
free(ref_seq);
return cigar;
}
/*
 * Rescue unpaired reads by Smith-Waterman alignment near the mate.
 *
 * For every pair that is not flagged as properly paired and has at least one
 * end with mapQ >= SW_MIN_MAPQ, each mapped end is used as an anchor and the
 * other end is aligned with bwa_sw_core() inside the window where the insert
 * size estimate places it. If the rescued alignment is accepted, the position,
 * CIGAR, mismatch/gap counts and mapping qualities of the pair are updated and
 * both reads are flagged as properly paired.
 *
 * bns      reference metadata
 * _pacseq  packed reference, or NULL to load it from bns->fp_pac
 * n_seqs   number of pairs; seqs[0] and seqs[1] hold the two ends
 * popt     paired-end options (is_sw disables the rescue)
 * ii       insert-size estimate; nothing is rescued when ii->avg < 0
 *
 * Returns the packed reference in use. When _pacseq was NULL this is a buffer
 * allocated here that the caller frees (bwa_sai2sam_pe_core() does so).
 *
 * Fix: the insert-size penalty added to s_new had the rounding constant .499
 * inside log(), which gave 2 instead of the intended 12; it now uses the same
 * (int)(-4.343 * log(prob) + .499) form as pairing().
 */
ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
{
    ubyte_t *pacseq;
    int i;
    uint64_t n_tot[2], n_mapped[2];

    // load reference sequence
    if (_pacseq == 0) {
        pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
        err_rewind(bns->fp_pac);
        err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
    } else pacseq = (ubyte_t*)_pacseq;
    if (!popt->is_sw || ii->avg < 0.0) return pacseq;

    // perform mate alignment
    n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
    for (i = 0; i != n_seqs; ++i) {
        bwa_seq_t *p[2];
        p[0] = seqs[0] + i; p[1] = seqs[1] + i;
        if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
            int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
            int64_t beg[2], end[2];
            bwa_cigar_t *cigar[2];
            uint32_t cnt[2];

            /* In the following, _pref points to the reference read
             * which must be aligned; _pmate points to its mate which is
             * considered to be modified. */

// search window to the right of a forward-strand anchor
#define __set_rght_coor(_a, _b, _pref, _pmate) do { \
        (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
        (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
        if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
        if ((_b) > bns->l_pac) (_b) = bns->l_pac; \
    } while (0)

// search window to the left of a reverse-strand anchor
#define __set_left_coor(_a, _b, _pref, _pmate) do { \
        (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
        (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
        if ((_a) < 0) (_a) = 0; \
        if ((_b) > _pref->pos) (_b) = _pref->pos; \
    } while (0)

// store the rescued alignment in _pmate and flag both reads as properly paired
#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \
        _pmate->type = BWA_TYPE_MATESW; \
        _pmate->pos = _beg; \
        _pmate->seQ = _pref->seQ; \
        _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
        _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
        _pmate->extra_flag |= SAM_FPP; \
        _pref->extra_flag |= SAM_FPP; \
    } while (0)

            mq_adjust[0] = mq_adjust[1] = 255; // not effective
            is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;

            ++n_tot[is_singleton];
            cigar[0] = cigar[1] = 0;
            n_cigar[0] = n_cigar[1] = 0;
            if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
            for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
                ubyte_t *seq;
                if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
                { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
                    if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
                        __set_rght_coor(beg[k], end[k], p[1-k], p[k]);
                        seq = p[k]->rseq;
                    } else { // then the mate is on the forward strand and has smaller coordinate
                        __set_left_coor(beg[k], end[k], p[1-k], p[k]);
                        seq = p[k]->seq;
                        seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will be reversed back shortly
                    }
                }
                // perform SW alignment
                cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
                if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
                    // compare the cost of the existing alignment (plus the cost of an
                    // abnormal pair) with the cost of the rescued one (plus the
                    // insert-size penalty); lower is better
                    int s_old, clip = 0, s_new;
                    if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
                    if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
                    s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
                    s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
                    s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
                    s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5)) + .499); // assume the mapped isize is 1.5\sigma
                    if (s_old < s_new) { // reject SW alignment
                        mq_adjust[k] = s_new - s_old;
                        free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
                    } else mq_adjust[k] = s_old - s_new;
                }
                // now reverse the sequence back such that p[*]->seq looks untouched
                if (popt->type == BWA_PET_STD) {
                    if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
                } else {
                    if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
                }
            }
            k = -1; // no read to be changed
            if (cigar[0] && cigar[1]) {
                k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
                mapQ = abs(p[1]->mapQ - p[0]->mapQ);
            } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
            else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
            if (k >= 0 && p[k]->pos != beg[k]) {
                ++n_mapped[is_singleton];
                { // recalculate mapping quality
                    int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
                    if (tmp <= 0) tmp = 1;
                    if (mapQ > tmp) mapQ = tmp;
                    p[k]->mapQ = p[1-k]->mapQ = mapQ;
                    p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
                    if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
                    if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
                }
                // update CIGAR
                free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
                p[k]->n_cigar = n_cigar[k];
                // update the rest of information
                __set_fixed(p[1-k], p[k], beg[k], cnt[k]);
            }
            free(cigar[0]); free(cigar[1]);
        }
    }
    fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
            (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
    fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
            (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
    return pacseq;
}
/*
 * Driver of `sampe': read two SAI files and the matching read files batch by
 * batch, pair the alignments and print SAM records.
 *
 * prefix   index prefix
 * fn_sa    the two SAI file names
 * fn_fa    the two read file names
 * popt     paired-end options; with is_preload the BWT, SA and packed
 *          reference are loaded once and reused for every batch
 * rg_line  read-group header line passed to bwa_print_sam_hdr()
 *
 * Exits when the SAI magic does not match or when a pair has differing read
 * names.
 *
 * The index file names are built in a buffer sized from prefix; the earlier
 * fixed 1024-byte buffer overflowed on long paths.
 *
 * NOTE(review): n_seqs is overwritten by the second bwa_read_seq() call and the
 * two counts are never compared; confirm that read files of different length
 * cannot reach the per-pair loops.
 */
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
{
    extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
    int i, j, n_seqs;
    long long tot_seqs = 0;
    bwa_seq_t *seqs[2];
    bwa_seqio_t *ks[2];
    clock_t t;
    bntseq_t *bns;
    FILE *fp_sa[2];
    gap_opt_t opt, opt0;
    khint_t iter;
    isize_info_t last_ii; // this is for the last batch of reads
    char magic[2][4];
    bwt_t *bwt;
    uint8_t *pac;

    // initialization
    bwase_initialize(); // initialize g_log_n[] in bwase.c
    pac = 0; bwt = 0;
    for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
    bns = bns_restore(prefix);
    srand48(bns->seed);
    fp_sa[0] = xopen(fn_sa[0], "r");
    fp_sa[1] = xopen(fn_sa[1], "r");
    g_hash = kh_init(b128);
    last_ii.avg = -1.0; // no estimate from a previous batch yet

    err_fread_noeof(magic[0], 1, 4, fp_sa[0]);
    err_fread_noeof(magic[1], 1, 4, fp_sa[1]);
    if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) {
        fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
        exit(1);
    }
    err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
    ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
    opt0 = opt;
    err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
    ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
    { // for Illumina alignment only
        if (popt->is_preload) {
            size_t l_prefix = strlen(prefix);
            char *fn = (char*)malloc(l_prefix + 5); // longest suffix is ".bwt" plus the terminator
            if (fn == 0) {
                fprintf(stderr, "[%s] out of memory\n", __func__);
                exit(1);
            }
            memcpy(fn, prefix, l_prefix);
            strcpy(fn + l_prefix, ".bwt"); bwt = bwt_restore_bwt(fn);
            strcpy(fn + l_prefix, ".sa"); bwt_restore_sa(fn, bwt);
            free(fn);
            pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
            err_rewind(bns->fp_pac);
            err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
        }
    }

    // core loop
    bwa_print_sam_hdr(bns, rg_line);
    while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
        int cnt_chg;
        isize_info_t ii;
        ubyte_t *pacseq;

        seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
        tot_seqs += n_seqs;
        t = clock();

        fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
        cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
        fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
        fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);

        fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
        pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
        fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

        fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
        for (j = 0; j < 2; ++j)
            bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
        fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
        if (pac == 0) free(pacseq); // bwa_paired_sw() loaded its own copy

        fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
        for (i = 0; i < n_seqs; ++i) {
            bwa_seq_t *p[2];
            p[0] = seqs[0] + i; p[1] = seqs[1] + i;
            if (p[0]->bc[0] || p[1]->bc[0]) { // both reads report the concatenated barcode
                strcat(p[0]->bc, p[1]->bc);
                strcpy(p[1]->bc, p[0]->bc);
            }
            bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
            bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
            if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name);
        }
        fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

        for (j = 0; j < 2; ++j)
            bwa_free_read_seq(n_seqs, seqs[j]);
        fprintf(stderr, "[bwa_sai2sam_pe_core] %lld sequences have been processed.\n", tot_seqs);
        last_ii = ii;
    }

    // destroy
    bns_destroy(bns);
    for (i = 0; i < 2; ++i) {
        bwa_seq_close(ks[i]);
        err_fclose(fp_sa[i]);
    }
    for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
        if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
    kh_destroy(b128, g_hash);
    if (pac) {
        free(pac); bwt_destroy(bwt);
    }
}
/*
 * Entry point of `bwa sampe': parse the command line and run
 * bwa_sai2sam_pe_core().
 *
 * Expects <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq> after the options.
 * Returns 0 on success and 1 on a bad option, missing arguments (usage is
 * printed) or when the index cannot be located.
 *
 * The option block is now released on every return path (it used to leak on
 * the early returns), and the duplicated word in the usage notes is removed.
 */
int bwa_sai2sam_pe(int argc, char *argv[])
{
    int c, ret = 1;
    pe_opt_t *popt;
    char *prefix, *rg_line = 0;

    popt = bwa_init_pe_opt();
    while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
        switch (c) {
        case 'r':
            if ((rg_line = bwa_set_rg(optarg)) == 0) goto end;
            break;
        case 'a': popt->max_isize = atoi(optarg); break;
        case 'o': popt->max_occ = atoi(optarg); break;
        case 's': popt->is_sw = 0; break;
        case 'P': popt->is_preload = 1; break;
        case 'n': popt->n_multi = atoi(optarg); break;
        case 'N': popt->N_multi = atoi(optarg); break;
        case 'c': popt->ap_prior = atof(optarg); break;
        case 'f': xreopen(optarg, "w", stdout); break;
        case 'A': popt->force_isize = 1; break;
        default: goto end;
        }
    }

    if (optind + 5 > argc) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
        fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
        fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
        fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
        fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
        fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
        fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
        fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
        fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
        fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
        fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
        fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
        fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended\n");
        fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
        fprintf(stderr, "\n");
        goto end;
    }
    if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
        fprintf(stderr, "[%s] fail to locate the index\n", __func__);
        goto end;
    }
    bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
    free(prefix);
    ret = 0;

end:
    free(popt);
    return ret;
}

606
bwase.c 100644
View File

@ -0,0 +1,606 @@
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <assert.h>
#include "bwase.h"
#include "bwtaln.h"
#include "bntseq.h"
#include "utils.h"
#include "kstring.h"
#include "bwa.h"
#include "ksw.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
// Lookup table indexed by a hit count clamped to 255; bwa_approx_mapQ() in this
// file and pairing() in bwape.c subtract its entries from a mapping quality.
// bwa_sai2sam_pe_core() fills entries 1..255 with (int)(4.343 * log(i) + 0.5).
int g_log_n[256];
/*
 * Turn the alignments of one read into its main hit and/or its list of
 * alternative hits.
 *
 * n_aln     number of alignments in aln[]
 * aln       alignments of the read; entries up to the first one whose score
 *           exceeds aln[0].score are treated as the best hits
 * s         read to update
 * set_main  when non-zero, draw the main hit at random among the occurrences
 *           of the best alignments and set n_mm/n_gapo/n_gape/ref_shift/
 *           score/sa, the counts c1 (best) and c2 (remaining), and the type
 * n_multi   when non-zero, rebuild s->multi with every occurrence, unless
 *           there are more than n_multi + 1 of them, in which case the list
 *           is left empty
 *
 * With n_aln == 0 the read is marked BWA_TYPE_NO_MATCH and nothing else is done.
 */
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
    if (n_aln == 0) {
        s->type = BWA_TYPE_NO_MATCH;
        s->c1 = s->c2 = 0;
        return;
    }

    if (set_main) {
        const int top = aln[0].score;
        int idx = 0, seen = 0;

        /* best-scoring alignments: pick one of their occurrences at random */
        while (idx < n_aln && aln[idx].score <= top) {
            const bwt_aln1_t *hit = &aln[idx];
            if (drand48() * (hit->l - hit->k + 1 + seen) > (double)seen) {
                s->n_mm = hit->n_mm;
                s->n_gapo = hit->n_gapo;
                s->n_gape = hit->n_gape;
                s->ref_shift = (int)hit->n_del - (int)hit->n_ins;
                s->score = hit->score;
                s->sa = hit->k + (bwtint_t)((hit->l - hit->k + 1) * drand48());
            }
            seen += hit->l - hit->k + 1;
            ++idx;
        }
        s->c1 = seen;

        /* everything after that counts as sub-optimal */
        while (idx < n_aln) {
            seen += aln[idx].l - aln[idx].k + 1;
            ++idx;
        }
        s->c2 = seen - s->c1;

        if (s->c1 > 1) s->type = BWA_TYPE_REPEAT;
        else s->type = BWA_TYPE_UNIQUE;
    }

    if (n_multi) {
        int a, quota, n_occ = 0, filled = 0;

        for (a = 0; a < n_aln; ++a)
            n_occ += aln[a].l - aln[a].k + 1;

        free(s->multi); /* free() accepts a null pointer */
        if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
            s->multi = 0; s->n_multi = 0;
            return;
        }

        /* The code below is more general than the check above requires: all
         * hits could simply be copied, yet it is able to sample `quota'
         * random hits from an interval that does not fit. */
        quota = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
        s->multi = calloc(quota, sizeof(bwt_multi1_t));

        for (a = 0; a < n_aln; ++a) {
            const bwt_aln1_t *q = aln + a;

            if (q->l - q->k + 1 > quota) {
                /* Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. */
                int left, width;
                for (left = quota, width = q->l - q->k + 1; left > 0; --left) {
                    double p = 1.0, x = drand48();
                    bwt_multi1_t *m;
                    while (x < p) p -= p * left / (width--);
                    m = s->multi + filled;
                    m->pos = q->l - width;
                    m->gap = q->n_gapo + q->n_gape;
                    m->ref_shift = (int)q->n_del - (int)q->n_ins;
                    m->mm = q->n_mm;
                    ++filled;
                }
                quota = 0;
                break;
            }

            /* the whole interval fits: copy each of its occurrences */
            {
                bwtint_t sa;
                for (sa = q->k; sa <= q->l; ++sa) {
                    bwt_multi1_t *m = s->multi + filled;
                    m->pos = sa;
                    m->gap = q->n_gapo + q->n_gape;
                    m->ref_shift = (int)q->n_del - (int)q->n_ins;
                    m->mm = q->n_mm;
                    ++filled;
                }
                quota -= q->l - q->k + 1;
            }
        }
        s->n_multi = filled;
    }
}
/* Convenience wrapper around bwa_aln2seq_core(): select the primary hit
 * of `s` only, without building the list of alternative hits. */
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
	const int set_main = 1;
	const int n_multi = 0;
	bwa_aln2seq_core(n_aln, aln, s, set_main, n_multi);
}
/**
 * Approximate single-end mapping quality of a read.
 *
 * @param p  read whose c1 (best hits), c2 (other hits) and n_mm are set
 * @param mm mismatch count that is treated as the allowed maximum
 * @return 23 with no best hit, 0 with more than one best hit, 25 when the
 *         read already uses `mm` mismatches, 37 when there is no other hit,
 *         otherwise 23 minus g_log_n[min(c2, 255)], floored at 0
 */
int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
{
	int capped, penalty;
	if (p->c1 == 0) return 23;
	if (p->c1 > 1) return 0;
	if (p->n_mm == mm) return 25;
	if (p->c2 == 0) return 37;
	// the lookup table has 256 entries, so clamp the index
	capped = p->c2;
	if (p->c2 >= 255) capped = 255;
	penalty = g_log_n[capped];
	if (23 < penalty) return 0;
	return 23 - penalty;
}
/**
 * Translate a suffix-array position into a coordinate on the forward strand.
 *
 * @param sapos   position in the suffix array
 * @param ref_len number of reference bases covered by the hit
 * @param strand  out: set to 0 first, then to !is_rev as reported by bns_depos()
 * @return the coordinate of the first base of the hit, or (bwtint_t)-1 when the
 *         hit starts before bns->l_pac and ends after it
 */
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand)
{
	bwtint_t coord;
	int on_reverse;
	*strand = 0; // defined even on the early return below
	coord = bwt_sa(bwt, sapos); // coordinate on the forward-reverse concatenation
	if (coord < bns->l_pac && bns->l_pac < coord + ref_len)
		return (bwtint_t)-1;
	coord = bns_depos(bns, coord, &on_reverse); // forward-strand coordinate; may be the first or the last base
	*strand = !on_reverse;
	if (on_reverse) {
		// move from the last base of the hit to its first base, clamping at 0
		if (coord + 1 < ref_len) coord = 0;
		else coord = coord - ref_len + 1;
	}
	return coord; // FIXME: it is possible that coord < bns->anns[ref_id].offset
}
/**
 * Derive the position on the reference of a read's primary hit from its
 * suffix-array coordinate, and estimate the mapping quality.
 *
 * The position is approximate: it depends on whether indels appear in the
 * read and on whether it is computed from the start or the end of the read.
 *
 * @param bns    reference annotations
 * @param bwt    BWT with the suffix array loaded
 * @param seq    read to update; reads whose type is neither BWA_TYPE_UNIQUE
 *               nor BWA_TYPE_REPEAT are left untouched
 * @param max_mm mismatch limit used for the mapQ estimate when fnr <= 0
 * @param fnr    when > 0, the limit is derived from the read length instead
 *
 * Sets seq->pos, seq->strand, seq->seQ and seq->mapQ. The read is demoted to
 * BWA_TYPE_NO_MATCH when bwa_sa2pos() reports (bwtint_t)-1.
 */
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
{
	int max_diff, strand;
	if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
	max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
	seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand);
	seq->strand = strand;
	// bwa_approx_mapQ() only reads c1, c2 and n_mm, none of which change above,
	// so one evaluation is enough
	seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
	if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH;
}
/**
 * Convert the suffix-array coordinates of a batch of reads, primary and
 * alternative hits alike, into reference coordinates.
 *
 * The BWT ("<prefix>.bwt") and suffix array ("<prefix>.sa") are loaded on
 * entry and destroyed before returning.
 *
 * @param bns    reference annotations
 * @param prefix index prefix; may be of any length
 * @param n_seqs number of reads in seqs
 * @param seqs   reads to update in place
 * @param max_mm,fnr forwarded to bwa_cal_pac_pos_core()
 *
 * Alternative hits that cannot be placed, or that coincide with the primary
 * position, are dropped from p->multi.
 */
void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
{
	int i, j, strand, n_multi;
	char *str;
	bwt_t *bwt;
	// The file name is built on the heap so that a long prefix cannot overflow
	// a fixed buffer; ".bwt" is the longest suffix (4 characters + NUL).
	str = (char*)calloc(strlen(prefix) + 5, 1);
	if (str == 0) {
		fprintf(stderr, "[E::%s] out of memory\n", __func__);
		exit(1);
	}
	// load forward SA
	strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
	strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
	free(str);
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = &seqs[i];
		bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr);
		for (j = n_multi = 0; j < p->n_multi; ++j) {
			bwt_multi1_t *q = p->multi + j;
			q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand);
			q->strand = strand;
			// keep only placeable hits that differ from the primary position
			if (q->pos != p->pos && q->pos != (bwtint_t)-1)
				p->multi[n_multi++] = *q;
		}
		p->n_multi = n_multi;
	}
	bwt_destroy(bwt);
}
// Minimum band width handed to ksw_global() below.
#define SW_BW 50
/**
 * Re-align a read against the reference window [*_rb, *_rb + len + ref_shift)
 * with ksw_global() and return the resulting CIGAR in bwa_cigar_t encoding.
 *
 * @param l_pac     length of the packed reference; the window must end within it
 * @param pacseq    packed reference sequence
 * @param len       read length
 * @param seq       read bases
 * @param ref_shift difference between the reference span and the read length
 * @param _rb       in: window start; out: advanced past a leading deletion, if any
 * @param n_cigar   out: number of CIGAR operations returned
 * @return the CIGAR array; it reuses the buffer produced by ksw_global()
 *         (NOTE(review): presumably heap-allocated and owned by the caller - confirm)
 */
bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar)
{
bwa_cigar_t *cigar = 0;
uint32_t *cigar32 = 0;
ubyte_t *rseq;
int64_t k, rb, re, rlen;
int8_t mat[25];
int w;
// NOTE(review): presumably match score 1 / mismatch penalty 3 over a 5-letter alphabet - confirm in bwa_fill_scmat()
bwa_fill_scmat(1, 3, mat);
rb = *_rb; re = rb + len + ref_shift;
assert(re <= l_pac);
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
assert(re - rb == rlen);
// band width: 1.5 times the length difference, but at least SW_BW
w = abs((int)rlen - len) * 1.5;
ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > w? SW_BW : w, n_cigar, &cigar32);
assert(*n_cigar > 0);
// each cigar32 entry keeps the operation in the low 4 bits and the length above them
if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change ending ins to soft clipping
if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping
if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete ending del
if ((cigar32[0]&0xf) == 2) { // delete beginning del
// a leading deletion is absorbed by moving the alignment start forward
*_rb += cigar32[0]>>4;
--*n_cigar;
memmove(cigar32, cigar32+1, (*n_cigar) * 4);
}
// convert in place: entry k of the output is written over the start of the 32-bit buffer
cigar = (bwa_cigar_t*)cigar32;
for (k = 0; k < *n_cigar; ++k)
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
free(rseq);
return cigar;
}
/**
 * Build the MD string of an alignment and count its edit distance.
 *
 * @param n_cigar number of CIGAR operations (ignored when cigar is NULL)
 * @param cigar   CIGAR of the alignment, or NULL for an ungapped alignment
 * @param len     read length, used only in the ungapped case
 * @param pos     reference coordinate of the first aligned base
 * @param seq     read bases in alignment orientation
 * @param l_pac   length of the packed reference; comparison stops at this bound
 * @param pacseq  reference packed four bases per byte, first base in the top bits
 * @param str     scratch buffer; reset and reused on every call
 * @param _nm     out: mismatches plus inserted and deleted bases
 * @return a strdup()ed copy of the MD string
 */
char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
{
bwtint_t x, y;
int z, u, c, nm = 0;
str->l = 0; // reset
// x walks the reference, y walks the read, u counts the current run of matches
x = pos; y = 0;
if (cigar) {
int k, l;
for (k = u = 0; k < n_cigar; ++k) {
l = __cigar_len(cigar[k]);
if (__cigar_op(cigar[k]) == FROM_M) {
for (z = 0; z < l && x+z < l_pac; ++z) {
// extract the 2-bit reference base at x+z (so c is always in 0..3)
c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
// mismatch (or ambiguous read base): flush the match run, then the reference base
ksprintf(str, "%d", u);
kputc("ACGTN"[c], str);
++nm;
u = 0;
} else ++u;
}
x += l; y += l;
} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
// insertions and soft clips consume read bases only; only insertions count towards nm
y += l;
if (__cigar_op(cigar[k]) == FROM_I) nm += l;
} else if (__cigar_op(cigar[k]) == FROM_D) {
// deletion: flush the match run, then '^' followed by the deleted reference bases
ksprintf(str, "%d", u);
kputc('^', str);
for (z = 0; z < l && x+z < l_pac; ++z)
kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
u = 0;
x += l; nm += l;
}
}
} else { // no gaps
for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) {
c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
ksprintf(str, "%d", u);
kputc("ACGTN"[c], str);
++nm;
u = 0;
} else ++u;
}
}
// the trailing match run is always written, even when it is 0
ksprintf(str, "%d", u);
*_nm = nm;
return strdup(str->s);
}
/**
 * Account for bases trimmed off a read by extending its CIGAR with a soft
 * clip of (full_len - len) bases, then restore s->len to s->full_len.
 *
 * The clip goes at the end of the CIGAR for a forward-strand read and at the
 * beginning for a reverse-strand read. An existing soft clip on that side is
 * lengthened; a read without a CIGAR gets a two-operation one (match + clip).
 * Reads that were not trimmed are left untouched.
 */
void bwa_correct_trimmed(bwa_seq_t *s)
{
	int n_clip;
	if (s->len == s->full_len) return;
	n_clip = s->full_len - s->len;
	if (s->cigar == 0) {
		// ungapped hit: synthesise a match of s->len bases plus the clip
		s->n_cigar = 2;
		s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
		if (s->strand == 0) { // forward: clip last
			s->cigar[0] = __cigar_create(0, s->len);
			s->cigar[s->n_cigar-1] = __cigar_create(3, (n_clip));
		} else { // reverse: clip first
			s->cigar[1] = __cigar_create(0, s->len);
			s->cigar[0] = __cigar_create(3, (n_clip));
		}
	} else if (s->strand == 0) { // forward
		if (__cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) {
			// the last operation is already a soft clip: lengthen it
			s->cigar[s->n_cigar-1] += n_clip;
		} else {
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			s->cigar[s->n_cigar-1] = __cigar_create(3, (n_clip));
		}
	} else { // reverse
		if (__cigar_op(s->cigar[0]) == FROM_S) {
			// the first operation is already a soft clip: lengthen it
			s->cigar[0] += n_clip;
		} else {
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			// shift the existing operations up to free slot 0
			memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
			s->cigar[0] = __cigar_create(3, (n_clip));
		}
	}
	s->len = s->full_len;
}
/**
 * Finalise a batch of reads: generate CIGARs for gapped hits, compute the
 * MD string and edit distance, and restore trimmed bases as soft clips.
 *
 * @param bns     reference annotations; bns->fp_pac is read when _pacseq is NULL
 * @param n_seqs  number of reads in seqs
 * @param seqs    reads to update in place
 * @param _pacseq packed reference, or NULL to load it from bns->fp_pac for the
 *                duration of the call
 */
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
{
ubyte_t *pacseq;
int i, j, k;
kstring_t *str;
if (!_pacseq) {
// no reference supplied: read the whole packed sequence into a temporary buffer
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
err_rewind(bns->fp_pac);
err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
} else pacseq = _pacseq;
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
// alternative hits: only gapped ones need a CIGAR
for (j = k = 0; j < s->n_multi; ++j) {
bwt_multi1_t *q = s->multi + j;
int n_cigar;
if (q->gap) { // gapped alignment
q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar);
q->n_cigar = n_cigar;
if (q->cigar) s->multi[k++] = *q;
} else s->multi[k++] = *q;
}
s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation
// primary hit: skipped for unmapped and mate-rescued reads and for hits without gap opens
if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar);
if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH;
}
// generate MD tag
// one scratch kstring is shared by all reads; bwa_cal_md1() returns its own copy
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
if (s->type != BWA_TYPE_NO_MATCH) {
int nm;
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm);
s->nm = nm;
}
}
free(str->s); free(str);
// correct for trimmed reads
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
// release the reference only if it was loaded here
if (!_pacseq) free(pacseq);
}
/**
 * Reference coordinate just past the end of a read's alignment.
 *
 * Without a CIGAR this is p->pos + p->len. With one, it is p->pos plus the
 * lengths of all operations with code 0 or 2 (the ones that consume
 * reference bases).
 */
int64_t pos_end(const bwa_seq_t *p)
{
	int64_t end;
	int idx;
	if (!p->cigar) return p->pos + p->len;
	end = p->pos;
	for (idx = 0; idx != p->n_cigar; ++idx) {
		switch (__cigar_op(p->cigar[idx])) {
		case 0:
		case 2:
			end += __cigar_len(p->cigar[idx]);
			break;
		default:
			break;
		}
	}
	return end;
}
/**
 * Counterpart of pos_end() for an alternative hit.
 *
 * @param p   alternative hit
 * @param len read length, used when the hit has no CIGAR
 * @return the reference coordinate just past the end of the hit
 */
int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
{
	int64_t end;
	int idx;
	if (!p->cigar) return p->pos + len;
	end = p->pos;
	for (idx = 0; idx != p->n_cigar; ++idx) {
		switch (__cigar_op(p->cigar[idx])) {
		case 0:
		case 2:
			end += __cigar_len(p->cigar[idx]);
			break;
		default:
			break;
		}
	}
	return end;
}
/* Reference coordinate used as the read's 5' end when computing the insert
 * size: pos_end() for a read with strand set, p->pos otherwise; -1 for a
 * read without a match. */
static int64_t pos_5(const bwa_seq_t *p)
{
	if (p->type == BWA_TYPE_NO_MATCH) return -1;
	if (p->strand) return pos_end(p);
	return p->pos;
}
/**
 * Write all seq->full_len bases of a read to `stream` as text.
 *
 * With seq->strand == 0 the codes in seq->seq are written in order using
 * "ACGTN"; otherwise they are written from last to first using "TGCAN",
 * i.e. as the reverse complement. Output goes through a 4096-byte buffer.
 */
void bwa_print_seq(FILE *stream, bwa_seq_t *seq)
{
	char buffer[4096];
	const int cap = sizeof(buffer);
	const int total = seq->full_len;
	int k, chunk;
	if (seq->strand == 0) {
		int done = 0;
		while (done < total) {
			chunk = total - done > cap ? cap : total - done;
			for (k = 0; k < chunk; k++)
				buffer[k] = "ACGTN"[seq->seq[done + k]];
			err_fwrite(buffer, 1, chunk, stream);
			done += chunk;
		}
	} else {
		int left = total; // bases not yet written, counted from the start of the read
		while (left > 0) {
			chunk = left > cap ? cap : left;
			for (k = 0; k < chunk; k++)
				buffer[k] = "TGCAN"[seq->seq[left - 1 - k]];
			err_fwrite(buffer, 1, chunk, stream);
			left -= chunk;
		}
	}
}
/**
 * Print one SAM record for read `p` to standard output.
 *
 * @param bns      reference annotations (sequence names, offsets and lengths)
 * @param p        read to print
 * @param mate     the other read of the pair, or NULL
 * @param mode     BWA_MODE_* flags; BWA_MODE_COMPREAD selects the "NM" tag name, otherwise "CM"
 * @param max_top2 X1 is printed only when p->c1 <= max_top2
 *
 * Side effects on `p`: when p is unmapped but the mate is mapped, p->pos and
 * p->strand are overwritten with the mate's values; when p->strand is set,
 * p->qual is reversed in place before printing.
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
int j;
// first branch: the read itself or its mate is mapped, so a reference position is printed
if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
int seqid, nn, am = 0, flag = p->extra_flag;
char XT;
if (p->type == BWA_TYPE_NO_MATCH) {
// unmapped read with a mapped mate: place it at the mate's position
p->pos = mate->pos;
p->strand = mate->strand;
flag |= SAM_FSU;
j = 1;
} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
// get seqid
nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
// update flag and print it
if (p->strand) flag |= SAM_FSR;
if (mate) {
if (mate->type != BWA_TYPE_NO_MATCH) {
if (mate->strand) flag |= SAM_FMR;
} else flag |= SAM_FMU;
}
err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
// positions are printed 1-based, relative to the start of the reference sequence
err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
// print CIGAR
if (p->cigar) {
for (j = 0; j != p->n_cigar; ++j)
err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
} else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*");
else err_printf("%dM", p->len);
// print mate coordinate
if (mate && mate->type != BWA_TYPE_NO_MATCH) {
int m_seqid;
long long isize;
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
// redundant calculation here, but should not matter too much
bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
// insert size is only reported when both reads are on the same reference sequence
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
} else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
else err_printf("\t*\t0\t0\t");
// print sequence and quality
bwa_print_seq(stdout, p);
err_putchar('\t');
if (p->qual) {
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
err_printf("%s", p->qual);
} else err_printf("*");
if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
if (p->type != BWA_TYPE_NO_MATCH) {
int i;
// calculate XT tag
XT = "NURM"[p->type];
// more than 10 ambiguous bases in the aligned region: report the type as 'N'
if (nn > 10) XT = 'N';
// print tags
err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
if (nn) err_printf("\tXN:i:%d", nn);
if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
err_printf("\tX0:i:%d", p->c1);
if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2);
}
err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
if (p->md) err_printf("\tMD:Z:%s", p->md);
// print multiple hits
if (p->n_multi) {
// XA entries have the form "name,<strand><pos>,<CIGAR>,<gaps + mismatches>;"
err_printf("\tXA:Z:");
for (i = 0; i < p->n_multi; ++i) {
bwt_multi1_t *q = p->multi + i;
int k;
j = pos_end_multi(q, p->len) - q->pos;
// seqid and nn are reused here; their values for the primary hit are no longer needed
nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
(int)(q->pos - bns->anns[seqid].offset + 1));
if (q->cigar) {
for (k = 0; k < q->n_cigar; ++k)
err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
} else err_printf("%dM", p->len);
err_printf(",%d;", q->gap + q->mm);
}
}
}
err_putchar('\n');
} else { // this read has no match
//ubyte_t *s = p->strand? p->rseq : p->seq;
int flag = p->extra_flag | SAM_FSU;
if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
//Why did this work differently to the version above??
//for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
bwa_print_seq(stdout, p);
err_putchar('\t');
if (p->qual) {
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
err_printf("%s", p->qual);
} else err_printf("*");
if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
err_putchar('\n');
}
}
/* Fill g_log_n[1..255] with (int)(4.343 * log(i) + 0.5), the table consulted
 * by bwa_approx_mapQ(). Entry 0 is not written here. */
void bwase_initialize()
{
	int n = 1;
	while (n != 256) {
		g_log_n[n] = (int)(4.343 * log(n) + 0.5);
		++n;
	}
}
/**
 * Convert single-end alignments stored in a .sai file into SAM records on
 * standard output.
 *
 * @param prefix  index prefix, used to load the reference annotations and,
 *                for every batch, the BWT and suffix array
 * @param fn_sa   .sai file: SAI_MAGIC, a gap_opt_t, then for every read a
 *                32-bit count followed by that many bwt_aln1_t records
 * @param fn_fa   read file that was used to produce the .sai file
 * @param n_occ   passed to bwa_aln2seq_core() as the alternative-hit limit
 * @param rg_line read-group header line, or NULL
 *
 * Terminates the process when the magic number does not match or when the
 * file announces a negative alignment count.
 */
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
{
	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
	int i, n_seqs, m_aln;
	long long tot_seqs = 0;
	bwt_aln1_t *aln = 0;
	bwa_seq_t *seqs;
	bwa_seqio_t *ks;
	clock_t t;
	bntseq_t *bns;
	FILE *fp_sa;
	gap_opt_t opt;
	char magic[4];
	// initialization
	bwase_initialize();
	bns = bns_restore(prefix);
	srand48(bns->seed);
	fp_sa = xopen(fn_sa, "r");
	m_aln = 0;
	err_fread_noeof(magic, 1, 4, fp_sa);
	if (strncmp(magic, SAI_MAGIC, 4) != 0) {
		fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
		exit(1);
	}
	err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
	bwa_print_sam_hdr(bns, rg_line);
	// set ks
	ks = bwa_open_reads(opt.mode, fn_fa);
	// core loop
	while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
		tot_seqs += n_seqs;
		t = clock();
		// read alignment
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *p = seqs + i;
			int n_aln;
			err_fread_noeof(&n_aln, 4, 1, fp_sa);
			// The count comes straight from the file: a negative value would be
			// converted to a huge unsigned element count in the read below.
			if (n_aln < 0) {
				fprintf(stderr, "[E::%s] Corrupted SAI file: negative number of alignments.\n", __func__);
				exit(1);
			}
			if (n_aln > m_aln) {
				// grow the shared buffer to the largest count seen so far
				m_aln = n_aln;
				aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
			}
			err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
			bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
		}
		fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
		bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
		fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
		bwa_refine_gapped(bns, n_seqs, seqs, 0);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
		fprintf(stderr, "[bwa_aln_core] print alignments... ");
		for (i = 0; i < n_seqs; ++i)
			bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		bwa_free_read_seq(n_seqs, seqs);
		fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs);
	}
	// destroy
	bwa_seq_close(ks);
	bns_destroy(bns);
	err_fclose(fp_sa);
	free(aln);
}
/**
 * Command-line entry point of `bwa samse`.
 *
 * Options: -n max_occ (default 3), -f out.sam (redirects stdout),
 * -r RG_line, -h (accepted and ignored). Three positional arguments are
 * required: <prefix> <in.sai> <in.fq>.
 *
 * @return 0 on success, 1 on a bad option, missing arguments, an invalid
 *         read-group line or an index that cannot be located
 */
int bwa_sai2sam_se(int argc, char *argv[])
{
	int opt_ch, max_occ = 3;
	char *idx_prefix, *rg = 0;
	for (;;) {
		opt_ch = getopt(argc, argv, "hn:f:r:");
		if (opt_ch < 0) break;
		if (opt_ch == 'h') {
			continue;
		} else if (opt_ch == 'r') {
			rg = bwa_set_rg(optarg);
			if (rg == 0) return 1;
		} else if (opt_ch == 'n') {
			max_occ = atoi(optarg);
		} else if (opt_ch == 'f') {
			xreopen(optarg, "w", stdout);
		} else {
			return 1;
		}
	}
	if (optind + 3 > argc) {
		fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
		return 1;
	}
	idx_prefix = bwa_idx_infer_prefix(argv[optind]);
	if (idx_prefix == 0) {
		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
		return 1;
	}
	bwa_sai2sam_se_core(idx_prefix, argv[optind+1], argv[optind+2], max_occ, rg);
	free(idx_prefix);
	return 0;
}

29
bwase.h 100644
View File

@ -0,0 +1,29 @@
#ifndef BWASE_H
#define BWASE_H
#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"
#ifdef __cplusplus
extern "C" {
#endif
// Fill the lookup table used by the approximate mapping-quality estimate of the
// single-end mapper. Call once before computing positions.
void bwase_initialize();
// Compute the approximate reference position, strand and mapping quality of a read
// from a bwt whose suffix array is loaded. Reads that are neither unique nor repeat
// hits are left untouched.
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate positions of n_seqs reads into final placements: CIGARs for
// gapped hits, MD string and edit distance. _pacseq may be NULL, in which case the
// packed reference is read from bns->fp_pac.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
// Select the primary hit of a read from its n_aln alignment records and fill in the
// hit counts and read type; no list of alternative hits is built.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Return the reference coordinate just past the end of the read's alignment,
// following the CIGAR when there is one.
int64_t pos_end(const bwa_seq_t *p);
// Convert the suffix-array position sapos of a hit spanning len reference bases into
// a forward-strand coordinate. *strand receives the orientation reported for the hit.
// Returns (bwtint_t)-1 when the hit crosses the end of the forward sequence.
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
#ifdef __cplusplus
}
#endif
#endif // BWASE_H

235
bwaseqio.c 100644
View File

@ -0,0 +1,235 @@
#include <zlib.h>
#include <ctype.h>
#include "bwtaln.h"
#include "utils.h"
#include "bamlite.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
extern unsigned char nst_nt4_table[256];
static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
// Reader state shared by the BAM and the FASTA/FASTQ input paths.
struct __bwa_seqio_t {
// for BAM input
// is_bam: non-zero when the reader was opened on a BAM file
// which: bit mask selecting the records to return
int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE
bamFile fp; // BAM handle; used when is_bam is set
// for fastq input
kseq_t *ks; // sequence parser; used when is_bam is zero
};
/**
 * Open a BAM file as a read source.
 *
 * @param fn    path of the BAM file
 * @param which bit mask of the records to return (1: read1, 2: read2, 4: single-end)
 * @return a new reader; the process is terminated when the file cannot be opened
 */
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
{
	bwa_seqio_t *reader = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
	reader->is_bam = 1;
	reader->which = which;
	reader->fp = bam_open(fn, "r");
	if (reader->fp == 0) err_fatal_simple("Couldn't open bam file");
	// the header is read and discarded straight away; its content is not used
	bam_header_destroy(bam_header_read(reader->fp));
	return reader;
}
/* Open a (possibly gzip-compressed) sequence file and wrap it in a reader
 * that uses the kseq parser. */
bwa_seqio_t *bwa_seq_open(const char *fn)
{
	bwa_seqio_t *reader = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
	gzFile stream = xzopen(fn, "r");
	reader->ks = kseq_init(stream);
	return reader;
}
/* Close a reader and release it. A NULL reader is accepted and ignored.
 * A failure to close a BAM file terminates the process. */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (!bs) return;
	if (!bs->is_bam) {
		// close the underlying stream before the parser that refers to it is destroyed
		err_gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else if (bam_close(bs->fp) != 0) {
		err_fatal_simple("Error closing bam file");
	}
	free(bs);
}
/**
 * Reverse a byte sequence in place, optionally complementing it.
 *
 * @param len     number of bytes in seq
 * @param seq     sequence to reverse
 * @param is_comp when non-zero, codes 0..3 are replaced by 3 - code while
 *                being moved; codes >= 4 are kept as they are
 *
 * The temporary is a ubyte_t rather than a plain char: with a signed char a
 * byte >= 128 would compare as "< 4" and be complemented on one side of the
 * swap only.
 */
void seq_reverse(int len, ubyte_t *seq, int is_comp)
{
	int i;
	if (is_comp) {
		for (i = 0; i < len>>1; ++i) {
			ubyte_t tmp = seq[len-1-i];
			if (tmp < 4) tmp = 3 - tmp;
			seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
			seq[i] = tmp;
		}
		// odd length: the middle base stays in place but is still complemented
		if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
	} else {
		for (i = 0; i < len>>1; ++i) {
			ubyte_t tmp = seq[len-1-i];
			seq[len-1-i] = seq[i]; seq[i] = tmp;
		}
	}
}
/**
 * Trim low-quality bases from the 3' end of a read.
 *
 * Scanning from the last base down to index BWA_MIN_RDLEN, a running sum of
 * (trim_qual - (quality - 33)) is kept; the read is cut at the index where
 * the sum peaks. The scan stops as soon as the sum turns negative.
 *
 * @param trim_qual quality threshold; values below 1 disable trimming
 * @param p         read; p->len and p->clip_len are updated
 * @return the number of bases trimmed relative to p->full_len, or 0 when
 *         trimming is disabled or the read has no qualities
 */
int bwa_trim_read(int trim_qual, bwa_seq_t *p)
{
	int running = 0, best_sum = 0, keep = p->len, idx;
	if (trim_qual < 1 || p->qual == 0) return 0;
	idx = p->len - 1;
	while (idx >= BWA_MIN_RDLEN) {
		running += trim_qual - (p->qual[idx] - 33);
		if (running < 0) break;
		if (running > best_sum) {
			best_sum = running;
			keep = idx;
		}
		--idx;
	}
	p->len = keep;
	p->clip_len = p->len;
	return p->full_len - p->len;
}
/**
 * Read up to n_needed reads from a BAM file.
 *
 * @param bs        reader opened on a BAM file; bs->which selects the records to keep
 * @param n_needed  maximum number of reads to return
 * @param n         out: number of reads actually returned
 * @param is_comp   when non-zero, p->rseq holds the reverse complement instead of the plain reverse
 * @param trim_qual quality threshold for 3' trimming; values below 1 disable it
 * @return an array of *n reads, or NULL when no read was obtained
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
bwa_seq_t *seqs, *p;
int n_seqs, l, i;
long n_trimmed = 0, n_tot = 0;
bam1_t *b;
int res;
b = bam_init1();
n_seqs = 0;
seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
while ((res = bam_read1(bs->fp, b)) >= 0) {
uint8_t *s, *q;
int go = 0;
// keep the record if its read1/read2/unpaired status is selected in bs->which
if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
if (go == 0) continue;
l = b->core.l_qseq;
p = &seqs[n_seqs++];
p->tid = -1; // not assigned to a thread
p->qual = 0;
p->full_len = p->clip_len = p->len = l;
n_tot += p->full_len;
s = bam1_seq(b); q = bam1_qual(b);
p->seq = (ubyte_t*)calloc(p->len + 1, 1);
p->qual = (ubyte_t*)calloc(p->len + 1, 1);
for (i = 0; i != p->full_len; ++i) {
// translate the 4-bit BAM base code through bam_nt16_nt4_table
p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
// store the quality as a character (offset 33), capped at 126
p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
}
if (bam1_strand(b)) { // then reverse
// restore the original read orientation for records stored on the reverse strand
seq_reverse(p->len, p->seq, 1);
seq_reverse(p->len, p->qual, 0);
}
if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
p->rseq = (ubyte_t*)calloc(p->full_len, 1);
memcpy(p->rseq, p->seq, p->len);
seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
seq_reverse(p->len, p->rseq, is_comp);
p->name = strdup((const char*)bam1_qname(b));
if (n_seqs == n_needed) break;
}
// any negative result other than -1 is treated as a read error
if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
*n = n_seqs;
if (n_seqs && trim_qual >= 1)
fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
if (n_seqs == 0) {
free(seqs);
bam_destroy1(b);
return 0;
}
bam_destroy1(b);
return seqs;
}
// Barcode bases whose quality is below this value are stored in lower case.
#define BARCODE_LOW_QUAL 13
/**
 * Read up to n_needed reads from a FASTA/FASTQ (or BAM) source.
 *
 * @param bs        reader; BAM readers are delegated to bwa_read_bam()
 * @param n_needed  maximum number of reads to return
 * @param n         out: number of reads returned (not written when the barcode
 *                  length is rejected)
 * @param mode      BWA_MODE_* flags; the bits above bit 23 hold the barcode length
 * @param trim_qual quality threshold for 3' trimming; values below 1 disable it
 * @return an array of *n reads, or NULL when no read was obtained or the
 *         barcode length exceeds BWA_MAX_BCLEN
 *
 * Bytes of the input are converted to unsigned char before they are used as
 * table indices or passed to tolower()/toupper(), so that non-ASCII input
 * cannot produce a negative index.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;
	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && *(++s) == 'Y') {
				continue;
			}
		}
		// Illumina 1.3+ qualities use offset 64; shift them to offset 33
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower((unsigned char)seq->seq.s[i]) : toupper((unsigned char)seq->seq.s[i]);
			p->bc[i] = 0;
			// shift the remaining bases (and qualities) to the front of the buffers
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = seq->name.l;
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
/* Release an array of reads together with everything each read owns: the
 * CIGARs of its alternative hits, name, sequences, qualities, alignment
 * records, MD string, alternative-hit list and primary CIGAR. */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int r = 0;
	while (r != n_seqs) {
		bwa_seq_t *rd = &seqs[r++];
		int h;
		for (h = 0; h < rd->n_multi; ++h)
			free(rd->multi[h].cigar); // free(NULL) is a no-op
		free(rd->name);
		free(rd->seq);
		free(rd->rseq);
		free(rd->qual);
		free(rd->aln);
		free(rd->md);
		free(rd->multi);
		free(rd->cigar);
	}
	free(seqs);
}

217
bwashm.c 100644
View File

@ -0,0 +1,217 @@
#include <sys/types.h>
#include <sys/mman.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include "bwa.h"
#ifndef PATH_MAX
# define PATH_MAX 1024
#endif
/**
 * Copy an index into POSIX shared memory and register it in the control
 * block "/bwactl".
 *
 * The control block starts with two uint16_t values, the number of entries
 * and the offset of the first free byte, followed by the entries. Each entry
 * is an 8-byte length followed by the NUL-terminated index name. The index
 * itself is stored in a segment called "/bwaidx-<name>", where <name> is the
 * part of `hint` after the last '/'.
 *
 * @param idx    index to stage; on success it is rebuilt on top of the shared
 *               segment and idx->is_shm is set
 * @param hint   index path, used to derive the segment name
 * @param _tmpfn optional temporary file: the index is written to it and its
 *               in-memory copy freed before the segment is filled, which
 *               lowers peak memory; NULL to copy directly
 * @return 0 on success, -1 on failure
 */
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
{
	const char *name;
	uint8_t *shm, *shm_idx;
	uint16_t *cnt;
	int shmid, to_init = 0, l;
	char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn;

	if (idx == 0 || hint == 0 || hint[0] == 0) return -1;
	for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
	++name;
	// "/bwaidx-" is 8 characters long; refuse names that would overflow path[]
	if (strlen(name) + 8 > PATH_MAX) return -1;

	// open the control block, creating it if it does not exist yet
	if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
		shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
		to_init = 1;
	}
	if (shmid < 0) return -1;
	// Sizing a freshly created object must succeed. For an existing object the
	// result is ignored, as some systems refuse to resize one that is already sized.
	if (ftruncate(shmid, BWA_CTL_SIZE) < 0 && to_init) {
		close(shmid);
		return -1;
	}
	shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
	close(shmid); // the mapping remains valid after the descriptor is closed
	if (shm == MAP_FAILED) return -1;
	cnt = (uint16_t*)shm;
	if (to_init) {
		memset(shm, 0, BWA_CTL_SIZE);
		cnt[1] = 4; // entries start right after the two counters
	}
	// make sure the new entry fits before doing any expensive work
	l = 8 + strlen(name) + 1;
	if (cnt[1] + l > BWA_CTL_SIZE) goto fail_ctl;

	if (idx->mem == 0) bwa_idx2mem(idx);
	if (tmpfn) {
		FILE *fp;
		if ((fp = fopen(tmpfn, "wb")) != 0) {
			int64_t rest = idx->l_mem;
			int ok = 1;
			while (rest > 0) {
				int64_t chunk = rest < 0x1000000? rest : 0x1000000;
				size_t n_written = fwrite(&idx->mem[idx->l_mem - rest], 1, chunk, fp);
				if (n_written == 0) { ok = 0; break; } // no progress: stop instead of spinning
				rest -= n_written;
			}
			if (fclose(fp) != 0) ok = 0;
			if (ok) {
				free(idx->mem); idx->mem = 0;
			} else {
				// the in-memory copy is still intact, so fall back to a direct copy
				fprintf(stderr, "[W::%s] fail to write the temporary file. Option '-f' is ignored.\n", __func__);
				unlink(tmpfn);
				tmpfn = 0;
			}
		} else {
			fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
			tmpfn = 0;
		}
	}

	// create the segment that will hold the index
	strcat(strcpy(path, "/bwaidx-"), name);
	if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
		perror("shm_open()"); // report first: shm_unlink() may overwrite errno
		shm_unlink(path);
		goto fail_tmp;
	}
	if (ftruncate(shmid, idx->l_mem) < 0) {
		close(shmid);
		goto fail_seg;
	}
	shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
	close(shmid);
	if (shm_idx == MAP_FAILED) goto fail_seg;

	// fill the segment
	if (tmpfn) {
		FILE *fp = fopen(tmpfn, "rb");
		int64_t rest = idx->l_mem;
		if (fp != 0) {
			while (rest > 0) {
				int64_t chunk = rest < 0x1000000? rest : 0x1000000;
				size_t n_read = fread(&shm_idx[idx->l_mem - rest], 1, chunk, fp);
				if (n_read == 0) break; // short file or read error
				rest -= n_read;
			}
			fclose(fp);
		}
		unlink(tmpfn);
		tmpfn = 0;
		if (rest > 0) {
			munmap(shm_idx, idx->l_mem);
			goto fail_seg;
		}
	} else {
		memcpy(shm_idx, idx->mem, idx->l_mem);
		free(idx->mem);
	}

	// register the index only once its data is complete
	memcpy(shm + cnt[1], &idx->l_mem, 8);
	memcpy(shm + cnt[1] + 8, name, l - 8);
	cnt[1] += l; ++cnt[0];
	munmap(shm, BWA_CTL_SIZE);

	bwa_mem2idx(idx->l_mem, shm_idx, idx);
	idx->is_shm = 1;
	return 0;

fail_seg:
	shm_unlink(path);
fail_tmp:
	if (tmpfn) unlink(tmpfn);
fail_ctl:
	munmap(shm, BWA_CTL_SIZE);
	return -1;
}
/**
 * Attach to an index that was staged in shared memory.
 *
 * The name looked up is the part of `hint` after the last '/'. The control
 * block "/bwactl" is searched for an entry with that name, and the segment
 * "/bwaidx-<name>" is then mapped read-only.
 *
 * @return a newly allocated index backed by the shared segment, or NULL when
 *         the index is not staged or any step fails
 */
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
{
	const char *name;
	uint8_t *shm, *shm_idx;
	uint16_t *cnt, i, n_idx;
	char *p, path[PATH_MAX + 1];
	int shmid;
	int64_t l_mem = 0;
	bwaidx_t *idx;
	if (hint == 0 || hint[0] == 0) return 0;
	for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
	++name;
	// "/bwaidx-" is 8 characters long; refuse names that would overflow path[]
	if (strlen(name) + 8 > PATH_MAX) return 0;
	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
	close(shmid); // the mapping remains valid after the descriptor is closed
	if (shm == MAP_FAILED) return 0;
	cnt = (uint16_t*)shm;
	n_idx = cnt[0];
	// each entry: 8-byte index length, then the NUL-terminated name
	for (i = 0, p = (char*)(shm + 4); i < n_idx; ++i) {
		memcpy(&l_mem, p, 8); p += 8;
		if (strcmp(p, name) == 0) break;
		p += strlen(p) + 1;
	}
	munmap(shm, BWA_CTL_SIZE); // l_mem has been copied out; the control block is no longer needed
	if (i == n_idx) return 0;
	strcat(strcpy(path, "/bwaidx-"), name);
	if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0;
	shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
	close(shmid);
	if (shm_idx == MAP_FAILED) return 0;
	idx = calloc(1, sizeof(bwaidx_t));
	if (idx == 0) {
		munmap(shm_idx, l_mem);
		return 0;
	}
	bwa_mem2idx(l_mem, shm_idx, idx);
	idx->is_shm = 1;
	return idx;
}
/**
 * Tell whether an index is already staged in shared memory.
 *
 * The name looked up is the part of `hint` after the last '/'.
 *
 * @return 1 when the control block "/bwactl" has an entry with that name,
 *         0 otherwise (including when the control block cannot be opened)
 */
int bwa_shm_test(const char *hint)
{
	int shmid, found = 0;
	uint16_t *cnt, i;
	char *p, *shm;
	const char *name;
	if (hint == 0 || hint[0] == 0) return 0;
	for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
	++name;
	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
	close(shmid); // the mapping remains valid after the descriptor is closed
	if (shm == MAP_FAILED) return 0;
	cnt = (uint16_t*)shm;
	for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
		// Each entry is an 8-byte length followed by the NUL-terminated name.
		// The name length must be measured from the name itself, not from the
		// start of the length field, or later entries are misparsed.
		p += 8;
		if (strcmp(p, name) == 0) { found = 1; break; }
		p += strlen(p) + 1;
	}
	munmap(shm, BWA_CTL_SIZE);
	return found;
}
/**
 * Print the indices staged in shared memory to standard output, one per
 * line, as "<name>\t<size in bytes>".
 *
 * @return 0 on success, -1 when the control block cannot be opened or mapped
 */
int bwa_shm_list(void)
{
	int shmid;
	uint16_t *cnt, i;
	char *p, *shm;
	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
	close(shmid); // the mapping remains valid after the descriptor is closed
	if (shm == MAP_FAILED) return -1;
	cnt = (uint16_t*)shm;
	// each entry: 8-byte index length, then the NUL-terminated name
	for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
		int64_t l_mem;
		memcpy(&l_mem, p, 8); p += 8;
		printf("%s\t%ld\n", p, (long)l_mem);
		p += strlen(p) + 1;
	}
	munmap(shm, BWA_CTL_SIZE);
	return 0;
}
/**
 * Remove every staged index from shared memory, then the control block.
 *
 * For each entry of "/bwactl" the segment "/bwaidx-<name>" is unlinked.
 *
 * @return 0 on success, -1 when the control block cannot be opened or mapped
 */
int bwa_shm_destroy(void)
{
	int shmid;
	uint16_t *cnt, i;
	char *p, *shm;
	char path[PATH_MAX + 1];
	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
	close(shmid); // the mapping remains valid after the descriptor is closed
	if (shm == MAP_FAILED) return -1;
	cnt = (uint16_t*)shm;
	// each entry: 8-byte index length (not needed here), then the NUL-terminated name
	for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
		p += 8;
		// "/bwaidx-" is 8 characters long; skip names that would overflow path[]
		if (strlen(p) + 8 <= PATH_MAX) {
			strcat(strcpy(path, "/bwaidx-"), p);
			shm_unlink(path);
		}
		p += strlen(p) + 1;
	}
	munmap(shm, BWA_CTL_SIZE);
	shm_unlink("/bwactl");
	return 0;
}
/**
 * Command-line entry point of `bwa shm`.
 *
 * With an index base, the index is loaded from disk and staged in shared
 * memory unless it is already there. With -l the staged indices are listed;
 * with -d all of them are removed. -f names a temporary file used while
 * staging to reduce peak memory.
 *
 * @return 0 on success, 1 on a usage error or when loading or staging fails
 */
int main_shm(int argc, char *argv[])
{
	int c, to_list = 0, to_drop = 0, ret = 0;
	char *tmpfn = 0;
	while ((c = getopt(argc, argv, "ldf:")) >= 0) {
		if (c == 'l') to_list = 1;
		else if (c == 'd') to_drop = 1;
		else if (c == 'f') tmpfn = optarg;
	}
	if (optind == argc && !to_list && !to_drop) {
		fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
		fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
		fprintf(stderr, " -l list names of indices in shared memory\n");
		fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
		return 1;
	}
	if (optind < argc && (to_list || to_drop)) {
		fprintf(stderr, "[E::%s] option -l or -d cannot be used when 'idxbase' is present\n", __func__);
		return 1;
	}
	if (optind < argc) {
		if (bwa_shm_test(argv[optind]) == 0) {
			bwaidx_t *idx;
			idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
			// the loader's result is dereferenced by bwa_shm_stage(), so check it first
			if (idx == 0) {
				fprintf(stderr, "[E::%s] failed to load the index '%s' from disk\n", __func__, argv[optind]);
				return 1;
			}
			if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
				fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
				ret = 1;
			}
			bwa_idx_destroy(idx);
		} else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
	}
	if (to_list) bwa_shm_list();
	if (to_drop) bwa_shm_destroy();
	return ret;
}

469
bwt.c 100644
View File

@ -0,0 +1,469 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <lh3@sanger.ac.uk> */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdint.h>
#include <limits.h>
#include "utils.h"
#include "bwt.h"
#include "kvec.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Precompute, for every byte value, how many of its four 2-bit cells equal
 * each of the codes 0..3. The four counts are packed into one 32-bit word,
 * with the count for code j in bits [8j, 8j+7]. */
void bwt_gen_cnt_table(bwt_t *bwt)
{
	int byte, nt;
	for (byte = 0; byte != 256; ++byte) {
		uint32_t packed = 0;
		for (nt = 0; nt != 4; ++nt) {
			int hits = ((byte&3) == nt) + ((byte>>2&3) == nt) + ((byte>>4&3) == nt) + (byte>>6 == nt);
			packed |= hits << (nt<<3);
		}
		bwt->cnt_table[byte] = packed;
	}
}
/* Compute the inverse compressed suffix array at k: the symbol stored for
 * row k is looked up and mapped through L2[] and bwt_occ(). The row equal to
 * bwt->primary maps to 0. */
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
{
	// rows after the primary one are shifted by one
	bwtint_t row = k - (k > bwt->primary);
	bwtint_t sym = bwt_B0(bwt, row);
	bwtint_t next = bwt->L2[sym] + bwt_occ(bwt, k, sym);
	if (k == bwt->primary) return 0;
	return next;
}
/**
 * Build the sampled suffix array of a BWT.
 *
 * bwt->bwt and bwt->occ must be precalculated.
 *
 * @param intv sampling interval; must be a power of 2
 *
 * Any previous bwt->sa is released. Every intv-th row gets its suffix-array
 * value stored; entry 0 is finally set to (bwtint_t)-1.
 */
void bwt_cal_sa(bwt_t *bwt, int intv)
{
	bwtint_t rank, text_pos; // S(rank) = text_pos
	int rounded = intv;
	kv_roundup32(rounded);
	xassert(rounded == intv, "SA sample interval is not a power of 2.");
	xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
	free(bwt->sa); // free(NULL) is a no-op
	bwt->sa_intv = intv;
	bwt->n_sa = (bwt->seq_len + intv) / intv;
	bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
	// calculate SA value: walk the text backwards from its end, one row per step
	rank = 0;
	for (text_pos = bwt->seq_len; text_pos != 0; --text_pos) {
		if (rank % intv == 0) bwt->sa[rank/intv] = text_pos;
		rank = bwt_invPsi(bwt, rank);
	}
	if (rank % intv == 0) bwt->sa[rank/intv] = text_pos; // text_pos is 0 here
	bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
}
/* Return the suffix-array value of row k. Rows that are not sampled are
 * resolved by stepping with bwt_invPsi() until a sampled row (a multiple of
 * sa_intv) is reached; the number of steps is added to the stored sample. */
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
{
	const bwtint_t mask = bwt->sa_intv - 1;
	bwtint_t steps;
	for (steps = 0; (k & mask) != 0; ++steps)
		k = bwt_invPsi(bwt, k);
	/* bwt->sa[0] is stored as -1, so the unsigned sum wraps correctly for
	   row 0; if it held seq_len instead, this would have to be
	   (steps + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
	return steps + bwt->sa[k/bwt->sa_intv];
}
/* Count how many of the 32 two-bit symbols packed in y are equal to c (0..3). */
static inline int __occ_aux(uint64_t y, int c)
{
	// pick y or its complement per bit of c so that a matching field reads 11
	const uint64_t hi = (c & 2) ? y : ~y;
	const uint64_t lo = (c & 1) ? y : ~y;
	// one marker bit (the low bit of each field) per matching symbol
	uint64_t hits = (hi >> 1) & lo & 0x5555555555555555ull;
	// fold the marker bits into per-nibble, then per-byte counts
	hits = (hits & 0x3333333333333333ull) + ((hits >> 2) & 0x3333333333333333ull);
	hits = (hits + (hits >> 4)) & 0x0f0f0f0f0f0f0f0full;
	// multiplying by 0x0101.. sums all byte counts into the top byte
	return (int)((hits * 0x0101010101010101ull) >> 56);
}
/* Occ(c, k): number of occurrences of symbol c in BWT rows [0, k].
 * k == seq_len yields the total count of c; k == (bwtint_t)-1 yields 0. */
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
{
	const uint32_t *cell, *last;
	bwtint_t count, pad;
	uint64_t word;

	if (k == bwt->seq_len)
		return bwt->L2[c+1] - bwt->L2[c];
	if (k == (bwtint_t)(-1))
		return 0;
	if (k >= bwt->primary)
		--k; // because $ is not in bwt

	// start from the checkpoint stored at the head of k's interval
	cell = bwt_occ_intv(bwt, k);
	count = ((const bwtint_t*)cell)[c];
	cell += sizeof(bwtint_t); // skip the four checkpoint counters (8 uint32 cells)

	// whole 32-symbol words (two cells each) preceding the word that holds k
	last = cell + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
	while (cell < last) {
		count += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
		cell += 2;
	}

	// final word: blank out the symbols located after k
	pad = ~k & 31;
	word = (uint64_t)cell[0]<<32 | cell[1];
	count += __occ_aux(word & ~((1ull<<(pad<<1)) - 1), c);
	if (c == 0)
		count -= pad; // the blanked fields read as symbol 0
	return count;
}
// an analogy to bwt_occ() but more efficient, requiring k <= l
// Stores Occ(c, k) in *ok and Occ(c, l) in *ol. When both rows (after the
// '$' adjustment) fall into the same OCC_INTERVAL bucket, the bucket is
// scanned once and the work is shared; otherwise, or when either row is
// (bwtint_t)-1, it falls back to two independent bwt_occ() calls.
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
{
bwtint_t _k, _l;
// positions with the '$' slot removed, used only to compare buckets
_k = (k >= bwt->primary)? k-1 : k;
_l = (l >= bwt->primary)? l-1 : l;
if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
*ok = bwt_occ(bwt, k, c);
*ol = bwt_occ(bwt, l, c);
} else {
bwtint_t m, n, i, j;
uint32_t *p;
if (k >= bwt->primary) --k;
if (l >= bwt->primary) --l;
// n starts from the checkpoint counter of the shared bucket
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
p += sizeof(bwtint_t); // skip the four checkpoint counters (8 uint32 cells)
// calculate *ok
j = k >> 5 << 5; // start of the 32-symbol word containing k
for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
m = n; // full-word total up to k's word; the scan for l resumes from here
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
if (c == 0) n -= ~k&31; // corrected for the masked bits
*ok = n;
// calculate *ol
j = l >> 5 << 5; // start of the 32-symbol word containing l
for (; i < j; i += 32, p += 2) // i and p continue where the scan for k stopped
m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
if (c == 0) m -= ~l&31; // corrected for the masked bits
*ol = m;
}
}
// Count all four symbols in one 32-bit BWT cell b (16 packed symbols) by
// summing the cnt_table entries of its four bytes. The result packs the four
// counts into 8-bit lanes: lane s (bits 8s..8s+7) is the count of symbol s.
// Note that b is evaluated four times.
#define __occ_aux4(bwt, b) \
((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
+ (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
// Compute Occ(c, k) for all four symbols at once: cnt[c] receives the number
// of occurrences of c in BWT rows [0, k]. k == (bwtint_t)-1 yields all zeros.
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
bwtint_t x;
uint32_t *p, tmp, *end;
if (k == (bwtint_t)(-1)) {
memset(cnt, 0, 4 * sizeof(bwtint_t));
return;
}
k -= (k >= bwt->primary); // because $ is not in bwt
p = bwt_occ_intv(bwt, k);
// start from the four checkpoint counters stored at the head of the interval
memcpy(cnt, p, 4 * sizeof(bwtint_t));
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
// x accumulates the four counts in packed 8-bit lanes (see __occ_aux4)
for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
// last cell: blank out the symbols after k; the blanked fields read as
// symbol 0, so their number (~k&15) is subtracted from lane 0
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
x += __occ_aux4(bwt, tmp) - (~k&15);
// unpack the lanes onto the checkpoint values
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
}
// an analogy to bwt_occ4() but more efficient, requiring k <= l
// Fills cntk[] with the four Occ values at row k and cntl[] with those at
// row l. When both rows (after the '$' adjustment) share one OCC_INTERVAL
// bucket the bucket is scanned once; otherwise, or when either row is
// (bwtint_t)-1, it falls back to two bwt_occ4() calls.
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{
bwtint_t _k, _l;
// positions with the '$' slot removed, used only to compare buckets
_k = k - (k >= bwt->primary);
_l = l - (l >= bwt->primary);
if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
bwt_occ4(bwt, k, cntk);
bwt_occ4(bwt, l, cntl);
} else {
bwtint_t x, y;
uint32_t *p, tmp, *endk, *endl;
k -= (k >= bwt->primary); // because $ is not in bwt
l -= (l >= bwt->primary);
p = bwt_occ_intv(bwt, k);
// cntk temporarily holds the bare checkpoint counters of the shared bucket
memcpy(cntk, p, 4 * sizeof(bwtint_t));
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
// prepare cntk[]
endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
y = x; // packed counts of the full cells before k's cell; reused for l
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
x += __occ_aux4(bwt, tmp) - (~k&15);
// calculate cntl[] and finalize cntk[]
for (; p < endl; ++p) y += __occ_aux4(bwt, *p); // p continues from k's cell
tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
y += __occ_aux4(bwt, tmp) - (~l&15);
// copy the checkpoint values to cntl before the lanes are added to cntk
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
}
}
/* Backward search for exact occurrences of str[0..len) (symbols 0..3).
 * On success stores the SA interval in *sa_begin / *sa_end (either may be
 * NULL) and returns its size; returns 0 when there is no match or when the
 * pattern contains a symbol > 3. */
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
{
	bwtint_t lo = 0, hi = bwt->seq_len;
	int pos;
	for (pos = len; pos > 0; --pos) {
		bwtint_t occ_lo, occ_hi;
		const ubyte_t base = str[pos - 1];
		if (base > 3)
			return 0; // no match
		bwt_2occ(bwt, lo - 1, hi, base, &occ_lo, &occ_hi);
		lo = bwt->L2[base] + occ_lo + 1;
		hi = bwt->L2[base] + occ_hi;
		if (lo > hi)
			return 0; // interval became empty
	}
	if (sa_begin) *sa_begin = lo;
	if (sa_end) *sa_end = hi;
	return hi - lo + 1;
}
/* Like bwt_match_exact(), but the search starts from the SA interval
 * [*k0, *l0] instead of the whole index. On success the narrowed interval is
 * written back and its size returned; on failure 0 is returned and
 * *k0 / *l0 are left untouched. */
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
{
	bwtint_t lo = *k0, hi = *l0;
	int pos;
	for (pos = len; pos > 0; --pos) {
		bwtint_t occ_lo, occ_hi;
		const ubyte_t base = str[pos - 1];
		if (base > 3)
			return 0; // there is an N here. no match
		bwt_2occ(bwt, lo - 1, hi, base, &occ_lo, &occ_hi);
		lo = bwt->L2[base] + occ_lo + 1;
		hi = bwt->L2[base] + occ_hi;
		if (lo > hi)
			return 0; // no match
	}
	*k0 = lo;
	*l0 = hi;
	return hi - lo + 1;
}
/*********************
* Bidirectional BWT *
*********************/
/* Extend the bi-interval *ik by each of the four symbols, writing the four
 * results to ok[0..3]. x[!is_back] is recomputed from the Occ values at the
 * interval ends, x[2] is the new size, and x[is_back] is laid out
 * cumulatively starting from symbol 3 down to symbol 0. */
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
{
	const int near = !is_back, far = is_back;
	bwtint_t occ_lo[4], occ_hi[4], acc;
	int c;

	bwt_2occ4(bwt, ik->x[near] - 1, ik->x[near] - 1 + ik->x[2], occ_lo, occ_hi);
	for (c = 0; c < 4; ++c) {
		ok[c].x[near] = bwt->L2[c] + 1 + occ_lo[c];
		ok[c].x[2] = occ_hi[c] - occ_lo[c];
	}
	// one extra slot is skipped when the input interval covers the primary row
	acc = ik->x[far] + (ik->x[near] <= bwt->primary && ik->x[near] + ik->x[2] - 1 >= bwt->primary);
	for (c = 3; c >= 0; --c) {
		ok[c].x[far] = acc;
		acc += ok[c].x[2];
	}
}
/* Reverse the order of the intervals stored in p, in place. */
static void bwt_reverse_intvs(bwtintv_v *p)
{
	size_t lo, hi;
	if (p->n < 2)
		return;
	// swap from both ends towards the middle
	for (lo = 0, hi = p->n - 1; lo < hi; ++lo, --hi) {
		bwtintv_t tmp = p->a[lo];
		p->a[lo] = p->a[hi];
		p->a[hi] = tmp;
	}
}
// Collect candidate SMEMs covering query position x into mem (mem->n is reset
// first). Phase 1 extends forward from q[x] and records the interval each time
// its size changes; phase 2 extends every recorded interval backward and emits
// those that cannot be extended any further. In each emitted interval, info
// packs (start << 32) | end. Returns the end of the longest forward match, or
// x + 1 when q[x] is an ambiguous base. tmpvec may be NULL, or hold NULL
// entries, in which case local scratch vectors are used and freed on return.
// NOTE: $max_intv is not currently used in BWA-MEM
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
bwtintv_v a[2], *prev, *curr, *swap;
mem->n = 0;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]); kv_init(a[1]);
prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
ik.info = x + 1; // info holds the (exclusive) end of the match so far
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
if (ik.x[2] < max_intv) { // an interval small enough
kv_push(bwtintv_t, *curr, ik);
break;
} else if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
}
ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base
kv_push(bwtintv_t, *curr, ik);
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
}
}
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
ret = curr->a[0].info; // this will be the returned value
swap = curr; curr = prev; prev = swap;
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
// ok[] is only refreshed when an extension is attempted; the test below
// short-circuits before reading ok[c] in the cases where it was skipped
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32; // record the start in the upper 32 bits
kv_push(bwtintv_t, *mem, ik);
}
} // otherwise the match is contained in another longer match
} else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
// keep the extended interval unless it has the same size as the previous one kept
ok[c].info = p->info;
kv_push(bwtintv_t, *curr, ok[c]);
}
}
if (curr->n == 0) break;
swap = curr; curr = prev; prev = swap;
}
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
// release the local scratch vectors only where the caller supplied none
if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
return ret;
}
// Convenience wrapper around bwt_smem1a() with max_intv = 0. Since interval
// sizes are unsigned, the "interval small enough" tests in bwt_smem1a() can
// then never be true.
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
}
/* Forward-extend from q[x] until the match is at least min_len long and its
 * interval size drops below max_intv; that interval is stored in *mem with
 * info = (x << 32) | end. *mem is zeroed first and stays zeroed if no such
 * seed is found. Returns the query position at which to resume: the seed end,
 * the position after an ambiguous base, or len. */
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
	bwtintv_t cur, next[4];
	int end;

	memset(mem, 0, sizeof(bwtintv_t));
	if (q[x] > 3)
		return x + 1;
	bwt_set_intv(bwt, q[x], cur); // the initial interval of a single base
	for (end = x + 1; end < len; ++end) { // forward search
		int comp;
		if (q[end] > 3)
			return end + 1; // stop at an ambiguous base
		comp = 3 - q[end]; // complement of q[end]
		bwt_extend(bwt, &cur, next, 0);
		if (next[comp].x[2] < max_intv && end - x >= min_len) {
			*mem = next[comp];
			mem->info = (uint64_t)x<<32 | (end + 1);
			return end + 1;
		}
		cur = next[comp];
	}
	return len;
}
/*************************
* Read/write BWT and SA *
*************************/
// Write the BWT to file fn: the primary index, then L2[1..4], then the
// bwt_size 32-bit cells of bwt->bwt. bwt_restore_bwt() reads this layout back.
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); // L2[0] is not stored
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
err_fflush(fp);
err_fclose(fp);
}
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
err_fflush(fp);
err_fclose(fp);
}
// Read size bytes from fp into a, in chunks of at most 16 MB, and return the
// number of bytes actually read.
// Mac/Darwin has a bug when reading data longer than 2GB. This function fixes
// this issue by reading data in small chunks.
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{
	const int bufsize = 0x1000000; // 16M block
	// arithmetic on void* is a GNU extension; address the buffer as bytes
	unsigned char *dst = (unsigned char*)a;
	bwtint_t offset = 0;
	while (size) {
		int x = bufsize < size? bufsize : size;
		if ((x = err_fread_noeof(dst + offset, 1, x, fp)) == 0) break;
		size -= x; offset += x;
	}
	return offset;
}
// Load the sampled suffix array written by bwt_dump_sa() from file fn into
// bwt. The primary index and seq_len stored in the file must match those
// already present in bwt; bwt->sa_intv, bwt->n_sa and bwt->sa are set here.
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
	char skipped[256];
	FILE *fp;
	bwtint_t primary, sa_intv;

	fp = xopen(fn, "rb");
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
	err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
	// The on-disk field is sizeof(bwtint_t) bytes wide but bwt_t::sa_intv is an
	// int: read into a full-width local rather than overrunning the member.
	err_fread_noeof(&sa_intv, sizeof(bwtint_t), 1, fp);
	bwt->sa_intv = (int)sa_intv;
	// a non-positive interval would divide by zero (or index wildly) below
	xassert(bwt->sa_intv > 0, "SA-BWT inconsistency: invalid SA sample interval.");
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); // 'primary' is reused to hold seq_len
	xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");

	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
	bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
	bwt->sa[0] = -1; // sa[0] is not stored in the file
	fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
	err_fclose(fp);
}
bwt_t *bwt_restore_bwt(const char *fn)
{
bwt_t *bwt;
FILE *fp;
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
fp = xopen(fn, "rb");
err_fseek(fp, 0, SEEK_END);
bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
err_fseek(fp, 0, SEEK_SET);
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
fread_fix(fp, bwt->bwt_size<<2, bwt->bwt);
bwt->seq_len = bwt->L2[4];
err_fclose(fp);
bwt_gen_cnt_table(bwt);
return bwt;
}
// Release a bwt_t together with its BWT and SA arrays; NULL is accepted.
void bwt_destroy(bwt_t *bwt)
{
	if (!bwt)
		return;
	free(bwt->bwt);
	free(bwt->sa);
	free(bwt);
}

132
bwt.h 100644
View File

@ -0,0 +1,132 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <hli@jimmy.harvard.edu> */
#ifndef BWA_BWT_H
#define BWA_BWT_H
#include <stdint.h>
#include <stddef.h>
// Occ checkpoint spacing, in BWT symbols.
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some parts of the code assume OCC_INTERVAL=0x80
#define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
// ubyte_t is guarded by BWA_UBYTE so that it is only defined once.
#ifndef BWA_UBYTE
#define BWA_UBYTE
typedef unsigned char ubyte_t;
#endif
// integer type used for BWT/SA coordinates and counts
typedef uint64_t bwtint_t;
// BWT-based index: the packed BWT with its counters, plus a sampled suffix array.
typedef struct {
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurrence array, separated into two parts
uint32_t cnt_table[256]; // per-byte symbol counts, filled by bwt_gen_cnt_table()
// suffix array
int sa_intv; // sampling interval (a power of 2, enforced by bwt_cal_sa())
bwtint_t n_sa; // number of entries in sa
bwtint_t *sa; // sampled suffix array; sa[0] is kept as (bwtint_t)-1
} bwt_t;
// Bi-directional SA interval: x[0] and x[1] are the interval starts in the two
// directions and x[2] is the interval size (see bwt_set_intv / bwt_extend).
// info is caller-defined; bwt_smem1a() stores (start << 32) | end in it.
typedef struct {
bwtint_t x[3], info;
} bwtintv_t;
// growable array of intervals: n entries used, m allocated (kvec-style layout)
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
/* For general OCC_INTERVAL, the following is correct:
#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4))
*/
// The following two lines are ONLY correct when OCC_INTERVAL==0x80
// Each 128-symbol interval occupies 16 uint32 cells: sizeof(bwtint_t) == 8
// cells holding the four Occ counters, followed by 8 cells of 16 symbols each.
// bwt_bwt: the 32-bit cell containing symbol k; bwt_occ_intv: start of k's interval.
#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)])
#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4))
/* retrieve a character from the $-removed BWT string. Note that
 * bwt_t::bwt is not exactly the BWT string and therefore this macro is
 * called bwt_B0 instead of bwt_B */
#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
// Initialise ik as the bi-interval of the single symbol c: x[0] from L2[c],
// x[1] from L2[3-c], x[2] = number of occurrences of c, info = 0.
// Note that c is evaluated several times.
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
#ifdef __cplusplus
extern "C" {
#endif
// write / read the BWT and the sampled SA to / from file fn
void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
void bwt_dump_sa(const char *fn, const bwt_t *bwt);
bwt_t *bwt_restore_bwt(const char *fn);
void bwt_restore_sa(const char *fn, bwt_t *bwt);
// free a bwt_t and its arrays; NULL is accepted
void bwt_destroy(bwt_t *bwt);
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
// build the sampled SA with interval intv (must be a power of 2)
void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_bwtupdate_core(bwt_t *bwt);
// Occ(c, k) for one symbol / for all four symbols
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
// suffix-array value of row k
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
// fill bwt->cnt_table
void bwt_gen_cnt_table(bwt_t *bwt);
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
// exact backward search; return the size of the SA interval, 0 if no match.
// The _alt variant starts from the interval [*k0, *l0] and updates it on success.
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
/**
 * Extend bi-SA-interval _ik_ by each of the four symbols; results go to _ok_[0..3]
 */
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
/**
 * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
 * Return the end of the longest exact match starting from _x_.
 */
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
// forward-only seeding from _x_: stop once the match is >= min_len long and its interval is < max_intv
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
#ifdef __cplusplus
}
#endif
#endif

1623
bwt_gen.c 100644

File diff suppressed because it is too large Load Diff

98
bwt_lite.c 100644
View File

@ -0,0 +1,98 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "bwt_lite.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
// Defined in another translation unit. bwtl_seq2bwtl() below calls is_sa() to
// fill SA for the n-symbol text T and afterwards reads entries 0..n, so the
// output buffer presumably needs n + 1 slots -- confirm against the definition.
int is_sa(const uint8_t *T, int *SA, int n);
int is_bwt(uint8_t *T, int n);
/* Build a light-weight BWT index (bwtl_t) for the sequence seq[0..len).
 * Computes the suffix array, the packed BWT (16 symbols per 32-bit cell), an
 * Occ checkpoint every 16 symbols, the cumulative counts L2 and the per-byte
 * count table. The result must be released with bwtl_destroy(). */
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
{
	bwtl_t *b;
	int i;
	b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
	b->seq_len = len;
	{ // calculate b->bwt
		uint8_t *s;
		b->sa = (uint32_t*)calloc(len + 1, 4);
		is_sa(seq, (int*)b->sa, len);
		s = (uint8_t*)calloc(len + 1, 1);
		// BWT symbol of row i is the character preceding suffix sa[i];
		// the row whose suffix starts at 0 is the primary ('$') row
		for (i = 0; i <= len; ++i) {
			if (b->sa[i] == 0) b->primary = i;
			else s[i] = seq[b->sa[i] - 1];
		}
		// close the gap left by '$' so that s holds exactly len symbols
		for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
		b->bwt_size = (len + 15) / 16;
		b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
		// Pack 16 symbols per cell, first symbol in the top two bits. The cast
		// keeps the shift unsigned: a promoted int shifted left by 30 overflows
		// for symbols 2 and 3, which is undefined behavior.
		for (i = 0; i < len; ++i)
			b->bwt[i>>4] |= (uint32_t)s[i] << ((15 - (i&15)) << 1);
		free(s);
	}
	{ // calculate b->occ
		uint32_t c[4];
		b->n_occ = (len + 15) / 16 * 4;
		b->occ = (uint32_t*)calloc(b->n_occ, 4);
		memset(c, 0, 16);
		for (i = 0; i < len; ++i) {
			// snapshot the running counts at the start of every 16-symbol cell
			if (i % 16 == 0)
				memcpy(b->occ + (i/16) * 4, c, 16);
			++c[bwtl_B0(b, i)];
		}
		// turn the final per-symbol totals into cumulative counts
		memcpy(b->L2+1, c, 16);
		for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
	}
	{ // generate cnt_table
		for (i = 0; i != 256; ++i) {
			uint32_t j, x = 0;
			for (j = 0; j != 4; ++j)
				x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
			b->cnt_table[i] = x;
		}
	}
	return b;
}
/* Occ(c, k) on a light-weight index: occurrences of symbol c in rows [0, k].
 * k == seq_len yields the total count of c; k == (uint32_t)-1 yields 0. */
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
{
	uint32_t cell, pad, masked, lanes, total;

	if (k == bwt->seq_len)
		return bwt->L2[c+1] - bwt->L2[c];
	if (k == (uint32_t)(-1))
		return 0;
	if (k >= bwt->primary)
		--k; // because $ is not in bwt

	cell = k / 16;       // index of the 16-symbol cell holding k
	pad = 15 - (k & 15); // symbols of that cell located after k
	masked = bwt->bwt[cell] & ~((1U << (pad << 1)) - 1);
	lanes = bwt->cnt_table[masked & 0xff] + bwt->cnt_table[masked >> 8 & 0xff]
		+ bwt->cnt_table[masked >> 16 & 0xff] + bwt->cnt_table[masked >> 24];
	total = bwt->occ[cell << 2 | c] + ((lanes >> (c << 3)) & 0xff);
	if (c == 0)
		total -= pad; // corrected for the masked bits
	return total;
}
/* Occ for all four symbols at row k of a light-weight index; cnt[c] receives
 * the count of symbol c in rows [0, k]. k == (uint32_t)-1 yields all zeros. */
void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
{
	uint32_t cell, pad, masked, lanes;
	int s;

	if (k == (uint32_t)(-1)) {
		memset(cnt, 0, 16);
		return;
	}
	if (k >= bwt->primary)
		--k; // because $ is not in bwt

	cell = k >> 4;
	pad = 15 - (k & 15); // symbols of the cell located after k
	memcpy(cnt, bwt->occ + (cell << 2), 16); // checkpoint at the start of the cell
	masked = bwt->bwt[cell] & ~((1U << (pad << 1)) - 1);
	lanes = bwt->cnt_table[masked & 0xff] + bwt->cnt_table[masked >> 8 & 0xff]
		+ bwt->cnt_table[masked >> 16 & 0xff] + bwt->cnt_table[masked >> 24];
	lanes -= pad; // masked fields were counted as symbol 0 (lane 0)
	for (s = 0; s < 4; ++s)
		cnt[s] += (lanes >> (s << 3)) & 0xff;
}
// Occ for all four symbols at rows k and l; simply two bwtl_occ4() calls.
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
{
bwtl_occ4(bwt, k, cntk);
bwtl_occ4(bwt, l, cntl);
}
// Release a bwtl_t and the arrays it owns; NULL is accepted.
void bwtl_destroy(bwtl_t *bwt)
{
	if (!bwt)
		return;
	free(bwt->sa);
	free(bwt->bwt);
	free(bwt->occ);
	free(bwt);
}

29
bwt_lite.h 100644
View File

@ -0,0 +1,29 @@
#ifndef BWT_LITE_H_
#define BWT_LITE_H_
#include <stdint.h>
// Light-weight BWT index with 32-bit coordinates, built by bwtl_seq2bwtl().
typedef struct {
uint32_t seq_len, bwt_size, n_occ; // sequence length; number of cells in bwt; number of entries in occ
uint32_t primary; // row whose suffix starts at text position 0
uint32_t *bwt, *occ, *sa, L2[5]; // packed BWT (16 symbols per cell); Occ checkpoints (4 per cell); suffix array; cumulative counts
uint32_t cnt_table[256]; // per-byte symbol counts in four 8-bit lanes
} bwtl_t;
// symbol k of the $-removed BWT string (the first symbol of a cell sits in its top two bits)
#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
#ifdef __cplusplus
extern "C" {
#endif
// build an index for seq[0..len); release it with bwtl_destroy()
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
// Occ(c, k) for one symbol / for all four symbols / for two rows at once
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
// free the index; NULL is accepted
void bwtl_destroy(bwtl_t *bwt);
#ifdef __cplusplus
}
#endif
#endif

321
bwtaln.c 100644
View File

@ -0,0 +1,321 @@
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "bwtaln.h"
#include "bwtgap.h"
#include "utils.h"
#include "bwa.h"
#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
// Allocate a gap_opt_t filled with the default "bwa aln" settings.
// The caller releases it with free().
gap_opt_t *gap_init_opt()
{
	gap_opt_t *opt = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));

	/* IMPORTANT: s_mm*10 should be about the average base error
	   rate. Violating this requirement will break pairing! */
	opt->s_mm = 3;
	opt->s_gapo = 11;
	opt->s_gape = 4;

	// limits on differences and gaps (max_diff = -1: derived from fnr)
	opt->max_diff = -1;
	opt->max_gapo = 1;
	opt->max_gape = 6;
	opt->fnr = 0.04;

	// search-space controls
	opt->indel_end_skip = 5;
	opt->max_del_occ = 10;
	opt->max_entries = 2000000;
	opt->max_top2 = 30;

	// seeding
	opt->seed_len = 32;
	opt->max_seed_diff = 2;

	// misc
	opt->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
	opt->n_threads = 1;
	opt->trim_qual = 0;
	return opt;
}
int bwa_cal_maxdiff(int l, double err, double thres)
{
double elambda = exp(-l * err);
double sum, y = 1.0;
int k, x = 1;
for (k = 1, sum = elambda; k < 1000; ++k) {
y *= l * err;
x *= k;
sum += elambda * y / x;
if (1.0 - sum < thres) return k;
}
return 2;
}
// Scan str[0..len) from left to right with backward-search steps. For each
// position i, width[i].w is the size of the current SA interval and
// width[i].bid the number of restarts so far; the search restarts whenever the
// interval becomes empty or a symbol > 3 is met. width[len] is a terminator
// with w = 0. Returns the final bid.
// width must be filled as zero
int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width)
{
	bwtint_t lo = 0, hi = bwt->seq_len;
	int i, n_restart = 0;

	for (i = 0; i < len; ++i) {
		const ubyte_t base = str[i];
		int restart = 1; // an ambiguous base always restarts
		if (base <= 3) {
			bwtint_t occ_lo, occ_hi;
			bwt_2occ(bwt, lo - 1, hi, base, &occ_lo, &occ_hi);
			lo = bwt->L2[base] + occ_lo + 1;
			hi = bwt->L2[base] + occ_hi;
			restart = lo > hi;
		}
		if (restart) { // then restart
			lo = 0;
			hi = bwt->seq_len;
			++n_restart;
		}
		width[i].w = hi - lo + 1;
		width[i].bid = n_restart;
	}
	++n_restart;
	width[len].w = 0;
	width[len].bid = n_restart;
	return n_restart;
}
// Compute the gapped SA-coordinate alignments (p->aln, p->n_aln) for the reads
// in seqs[0..n_seqs). When HAVE_PTHREAD is defined, only reads with
// i % opt->n_threads == tid are handled, so several threads can share one
// batch; opt->n_threads must therefore be non-zero. Side effects on each
// processed read: its sequence is complemented in place and then name, seq,
// rseq and qual are freed and set to NULL.
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
int i, j, max_l = 0, max_len;
gap_stack_t *stack;
bwt_width_t *w, *seed_w;
gap_opt_t local_opt = *opt; // private copy: max_diff and seed_len are adjusted per read
// initiate priority stack
for (i = max_len = 0; i != n_seqs; ++i)
if (seqs[i].len > max_len) max_len = seqs[i].len;
// size the stack for the longest read of the batch
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
w = 0;
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
#ifdef HAVE_PTHREAD
if (i % opt->n_threads != tid) continue;
#endif
p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
// grow (and clear) the width buffer whenever a longer read shows up
if (max_l < p->len) {
max_l = p->len;
w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t));
memset(w, 0, (max_l + 1) * sizeof(bwt_width_t));
}
bwt_cal_width(bwt, p->len, p->seq, w);
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
// seeding only applies to reads longer than the seed; otherwise seed_len is
// set to 0x7fffffff and no seed widths are passed below
local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
if (p->len > opt->seed_len)
bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w);
// core function
for (j = 0; j < p->len; ++j) // we need to complement
p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j];
p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
//fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo);
// clean up the unused data in the record
free(p->name); free(p->seq); free(p->rseq); free(p->qual);
p->name = 0; p->seq = p->rseq = p->qual = 0;
}
free(seed_w); free(w);
gap_destroy_stack(stack);
}
#ifdef HAVE_PTHREAD
// Arguments handed to one worker thread; the fields mirror the parameters of
// bwa_cal_sa_reg_gap().
typedef struct {
int tid;
bwt_t *bwt;
int n_seqs;
bwa_seq_t *seqs;
const gap_opt_t *opt;
} thread_aux_t;
// pthread entry point: unpack thread_aux_t and run bwa_cal_sa_reg_gap()
static void *worker(void *data)
{
thread_aux_t *d = (thread_aux_t*)data;
bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
return 0;
}
#endif
// Open the read file fn_fa. With BWA_MODE_BAM set in mode it is opened through
// bwa_bam_open() with a selection mask built from the BWA_MODE_BAM_* bits;
// otherwise it is opened through bwa_seq_open().
bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
{
	int which;

	if (!(mode & BWA_MODE_BAM))
		return bwa_seq_open(fn_fa);

	// open BAM
	which = 0;
	if (mode & BWA_MODE_BAM_READ1) which |= 1;
	if (mode & BWA_MODE_BAM_READ2) which |= 2;
	if (mode & BWA_MODE_BAM_SE) which |= 4;
	if (which == 0) which = 7; // then read all reads
	return bwa_bam_open(fn_fa, which);
}
// Driver of "bwa aln": load <prefix>.bwt, read fn_fa in batches of 0x40000
// reads, compute the SA-coordinate alignments of each batch (multi-threaded
// when built with HAVE_PTHREAD and opt->n_threads > 1) and write them to
// stdout: the SAI magic and the options first, then for every read its n_aln
// followed by n_aln bwt_aln1_t records. Progress is reported on stderr.
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
{
	int i, n_seqs;
	long long tot_seqs = 0;
	bwa_seq_t *seqs;
	bwa_seqio_t *ks;
	clock_t t;
	bwt_t *bwt;
	// initialization
	ks = bwa_open_reads(opt->mode, fn_fa);
	{ // load BWT
		char *str = (char*)calloc(strlen(prefix) + 10, 1);
		strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
		free(str);
	}
	// core loop
	err_fwrite(SAI_MAGIC, 1, 4, stdout);
	err_fwrite(opt, sizeof(gap_opt_t), 1, stdout);
	while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
		tot_seqs += n_seqs;
		t = clock();
		fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
#ifdef HAVE_PTHREAD
		if (opt->n_threads <= 1) { // no multi-threading at all
			bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
		} else {
			pthread_t *tid;
			pthread_attr_t attr;
			thread_aux_t *data;
			int j;
			pthread_attr_init(&attr);
			pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
			data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
			tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
			// every thread sees the whole batch and picks its reads by tid
			for (j = 0; j < opt->n_threads; ++j) {
				data[j].tid = j; data[j].bwt = bwt;
				data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
				pthread_create(&tid[j], &attr, worker, data + j);
			}
			for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
			// the attribute object is re-initialised for every batch, so it
			// has to be destroyed here or its resources leak
			pthread_attr_destroy(&attr);
			free(data); free(tid);
		}
#else
		bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
#endif
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		t = clock();
		fprintf(stderr, "[bwa_aln_core] write to the disk... ");
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *p = seqs + i;
			err_fwrite(&p->n_aln, 4, 1, stdout);
			if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
		}
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
		bwa_free_read_seq(n_seqs, seqs);
		fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs);
	}
	// destroy
	bwt_destroy(bwt);
	bwa_seq_close(ks);
}
// Entry point of "bwa aln": parse the command line into a gap_opt_t, print the
// usage when the two positional arguments (<prefix> <in.fq>) are missing,
// locate the index and run bwa_aln_core(). Returns 0 on success and 1 on a
// usage error or when the index cannot be found.
int bwa_aln(int argc, char *argv[])
{
	int c, opte = -1;
	gap_opt_t *opt;
	char *prefix;
	opt = gap_init_opt();
	while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
		switch (c) {
		case 'n':
			// a value containing '.' is a missing probability, otherwise a count
			if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
			else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
			break;
		case 'o': opt->max_gapo = atoi(optarg); break;
		case 'e': opte = atoi(optarg); break;
		case 'M': opt->s_mm = atoi(optarg); break;
		case 'O': opt->s_gapo = atoi(optarg); break;
		case 'E': opt->s_gape = atoi(optarg); break;
		case 'd': opt->max_del_occ = atoi(optarg); break;
		case 'i': opt->indel_end_skip = atoi(optarg); break;
		case 'l': opt->seed_len = atoi(optarg); break;
		case 'k': opt->max_seed_diff = atoi(optarg); break;
		case 'm': opt->max_entries = atoi(optarg); break;
		case 't': opt->n_threads = atoi(optarg); break;
		case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
		case 'R': opt->max_top2 = atoi(optarg); break;
		case 'q': opt->trim_qual = atoi(optarg); break;
		case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
		case 'f': xreopen(optarg, "wb", stdout); break;
		case 'b': opt->mode |= BWA_MODE_BAM; break;
		case '0': opt->mode |= BWA_MODE_BAM_SE; break;
		case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
		case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
		case 'I': opt->mode |= BWA_MODE_IL13; break;
		case 'Y': opt->mode |= BWA_MODE_CFY; break;
		case 'B': opt->mode |= atoi(optarg) << 24; break;
		default:
			free(opt);
			return 1;
		}
	}
	// bwa_cal_sa_reg_gap() computes i % n_threads, so "-t 0" (or unparsable
	// input, which atoi maps to 0) would divide by zero; fall back to 1 thread
	if (opt->n_threads < 1) opt->n_threads = 1;
	if (opte > 0) {
		opt->max_gape = opte;
		opt->mode &= ~BWA_MODE_GAPE;
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
		fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
				BWA_AVG_ERR, opt->fnr);
		fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
		fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
		fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
		fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
		fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
		fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
		fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
		fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
		fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
		fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
		fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
		fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
		fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
		fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
		fprintf(stderr, " -B INT length of barcode\n");
		fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
		fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
		fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
		fprintf(stderr, " -b the input read file is in the BAM format\n");
		fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
		fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
		fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
		fprintf(stderr, " -Y filter Casava-filtered sequences\n");
		fprintf(stderr, "\n");
		free(opt);
		return 1;
	}
	if (opt->fnr > 0.0) {
		// report the read lengths at which the derived max_diff changes
		int i, k;
		for (i = 17, k = 0; i <= 250; ++i) {
			int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
			if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
			k = l;
		}
	}
	if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
		free(opt);
		return 1;
	}
	bwa_aln_core(prefix, argv[optind+1], opt);
	free(opt); free(prefix);
	return 0;
}

153
bwtaln.h 100644
View File

@ -0,0 +1,153 @@
#ifndef BWTALN_H
#define BWTALN_H
#include <stdint.h>
#include "bwt.h"
// values of bwa_seq_t::type
#define BWA_TYPE_NO_MATCH 0
#define BWA_TYPE_UNIQUE 1
#define BWA_TYPE_REPEAT 2
#define BWA_TYPE_MATESW 3
// SAM flag bits
#define SAM_FPD 1 // paired
#define SAM_FPP 2 // properly paired
#define SAM_FSU 4 // self-unmapped
#define SAM_FMU 8 // mate-unmapped
#define SAM_FSR 16 // self on the reverse strand
#define SAM_FMR 32 // mate on the reverse strand
#define SAM_FR1 64 // this is read one
#define SAM_FR2 128 // this is read two
#define SAM_FSC 256 // secondary alignment
#define BWA_AVG_ERR 0.02 // error rate passed to bwa_cal_maxdiff()
#define BWA_MIN_RDLEN 35 // for read trimming
#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum
// base k (0..3) of a 2-bit packed sequence, four bases per byte, first base in the top bits
#ifndef bns_pac
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
#endif
#define FROM_M 0
#define FROM_I 1
#define FROM_D 2
#define FROM_S 3
// 4-byte magic written at the start of the output of "bwa aln"
#define SAI_MAGIC "SAI\1"
typedef struct {
bwtint_t w;
int bid;
} bwt_width_t;
/* One hit as recorded by bwt_match_gap(): the difference counts and score
 * of the search entry that produced it, plus the interval (k,l) it matched. */
typedef struct {
uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; // mismatches, gap opens, gap extensions, score, inserted/deleted bases
bwtint_t k, l; // interval bounds; the number of occurrences is computed as l - k + 1
} bwt_aln1_t;
typedef uint16_t bwa_cigar_t;
/* rgoya: If changing order of bytes, beware of operations like:
* s->cigar[0] += s->full_len - s->len;
*/
#define CIGAR_OP_SHIFT 14
#define CIGAR_LN_MASK 0x3fff
#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT)
#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK)
#define __cigar_create(__op, __len) ((__op)<<CIGAR_OP_SHIFT | (__len))
typedef struct {
uint32_t n_cigar:15, gap:8, mm:8, strand:1;
int ref_shift;
bwtint_t pos;
bwa_cigar_t *cigar;
} bwt_multi1_t;
typedef struct {
char *name;
ubyte_t *seq, *rseq, *qual;
uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
int score;
int clip_len;
// alignments in SA coordinates
int n_aln;
bwt_aln1_t *aln;
// multiple hits
int n_multi;
bwt_multi1_t *multi;
// alignment information
bwtint_t sa, pos;
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
int ref_shift;
int n_cigar;
bwa_cigar_t *cigar;
// for multi-threading only
int tid;
// barcode
char bc[BWA_MAX_BCLEN+1]; // null terminated; up to BWA_MAX_BCLEN bases
// NM and MD tags
uint32_t full_len:20, nm:12;
char *md;
} bwa_seq_t;
#define BWA_MODE_GAPE 0x01
#define BWA_MODE_COMPREAD 0x02
#define BWA_MODE_LOGGAP 0x04
#define BWA_MODE_CFY 0x08
#define BWA_MODE_NONSTOP 0x10
#define BWA_MODE_BAM 0x20
#define BWA_MODE_BAM_SE 0x40
#define BWA_MODE_BAM_READ1 0x80
#define BWA_MODE_BAM_READ2 0x100
#define BWA_MODE_IL13 0x200
/* Options controlling the gapped search (see bwt_match_gap()). */
typedef struct {
int s_mm, s_gapo, s_gape; // per-event score weights for mismatch, gap open and gap extension
int mode; // bit 24-31 are the barcode length
int indel_end_skip, max_del_occ, max_entries; // no indels this close to either read end; occurrence cap for long deletions; search stops once the stack exceeds max_entries
float fnr; // when > 0, passed to bwa_cal_maxdiff() as the threshold
int max_diff, max_gapo, max_gape; // upper bounds on differences, gap opens and gap extensions
int max_seed_diff, seed_len; // difference budget and length used when seed widths are supplied
int n_threads;
int max_top2; // search stops once more than this many best-scoring occurrences were counted and a worse hit appears
int trim_qual;
} gap_opt_t;
#define BWA_PET_STD 1
typedef struct {
int max_isize, force_isize;
int max_occ;
int n_multi, N_multi;
int type, is_sw, is_preload;
double ap_prior;
} pe_opt_t;
struct __bwa_seqio_t;
typedef struct __bwa_seqio_t bwa_seqio_t;
#ifdef __cplusplus
extern "C" {
#endif
gap_opt_t *gap_init_opt();
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);
bwa_seqio_t *bwa_seq_open(const char *fn);
bwa_seqio_t *bwa_bam_open(const char *fn, int which);
void bwa_seq_close(bwa_seqio_t *bs);
void seq_reverse(int len, ubyte_t *seq, int is_comp);
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual);
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
int bwa_cal_maxdiff(int l, double err, double thres);
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
#ifdef __cplusplus
}
#endif
#endif

264
bwtgap.c 100644
View File

@ -0,0 +1,264 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bwtgap.h"
#include "bwtaln.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
#define STATE_M 0
#define STATE_I 1
#define STATE_D 2
#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape)
/* Allocate a score-bucketed stack with one zero-initialised bucket for
 * each score value in [0, max_score). Release with gap_destroy_stack(). */
gap_stack_t *gap_init_stack2(int max_score)
{
	gap_stack_t *s = (gap_stack_t*)calloc(1, sizeof(gap_stack_t));
	s->n_stacks = max_score;
	s->stacks = (gap_stack1_t*)calloc(max_score, sizeof(gap_stack1_t));
	return s;
}
/* Allocate a stack large enough for every score reachable with at most
 * max_mm mismatches, max_gapo gap opens and max_gape gap extensions. */
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
{
	int n_scores = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);
	return gap_init_stack2(n_scores);
}
/* Free a stack created by gap_init_stack()/gap_init_stack2(), including
 * every per-score bucket. A NULL stack is accepted and ignored, matching
 * free() and bsw2_destroy(). */
void gap_destroy_stack(gap_stack_t *stack)
{
	int i;
	if (stack == 0) return;
	if (stack->stacks) { // bucket array may be missing if its allocation failed
		for (i = 0; i < stack->n_stacks; ++i) free(stack->stacks[i].stack);
		free(stack->stacks);
	}
	free(stack);
}
/* Empty every bucket (keeping its allocated storage) and mark the stack
 * as having no best score. */
static void gap_reset_stack(gap_stack_t *stack)
{
	int s = stack->n_stacks;
	while (s-- > 0)
		stack->stacks[s].n_entries = 0;
	stack->n_entries = 0;
	stack->best = stack->n_stacks; // n_stacks serves as the "nothing queued" marker
}
/* Append a search entry to the bucket of its score, growing the bucket
 * (4, 8, 16, ... entries) when full, and lower stack->best if this score
 * is smaller than the current best. */
static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del,
							int state, int is_diff, const gap_opt_t *opt)
{
	const int score = aln_score(n_mm, n_gapo, n_gape, opt);
	gap_stack1_t *bucket = &stack->stacks[score];
	gap_entry_t *slot;

	if (bucket->n_entries == bucket->m_entries) { // bucket is full
		if (bucket->m_entries == 0) bucket->m_entries = 4;
		else bucket->m_entries <<= 1;
		bucket->stack = (gap_entry_t*)realloc(bucket->stack, sizeof(gap_entry_t) * bucket->m_entries);
	}
	slot = &bucket->stack[bucket->n_entries];

	slot->info = (uint32_t)score<<21 | i; // score in the high bits, position in the low bits
	slot->k = k;
	slot->l = l;
	slot->n_mm = n_mm;
	slot->n_gapo = n_gapo;
	slot->n_gape = n_gape;
	slot->n_ins = n_ins;
	slot->n_del = n_del;
	slot->state = state;
	if (is_diff) slot->last_diff_pos = i;
	else slot->last_diff_pos = 0;

	bucket->n_entries += 1;
	stack->n_entries += 1;
	if (score < stack->best) stack->best = score;
}
/* Remove the most recently pushed entry of the best (lowest-score) bucket
 * into *e, then advance stack->best to the next non-empty bucket, or to
 * n_stacks when the whole stack became empty. */
static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e)
{
	gap_stack1_t *top = &stack->stacks[stack->best];
	int next;

	top->n_entries -= 1;
	*e = top->stack[top->n_entries];
	stack->n_entries -= 1;

	if (stack->n_entries == 0) { // nothing left anywhere
		stack->best = stack->n_stacks;
		return;
	}
	if (top->n_entries != 0) return; // best bucket still has entries

	// best bucket drained: look for the next non-empty one
	next = stack->best + 1;
	while (next < stack->n_stacks && stack->stacks[next].n_entries == 0) ++next;
	stack->best = next;
}
/* For every position before last_diff_pos, subtract x from the stored
 * width; a width exactly equal to x is instead replaced by max minus a
 * running counter and its bid is set to 1. `len` is not used. */
static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w)
{
	int pos, n_replaced = 0;
	for (pos = 0; pos < last_diff_pos; ++pos) {
		bwt_width_t *cur = &w[pos];
		if (cur->w > x) {
			cur->w -= x;
			continue;
		}
		if (cur->w != x) continue; // smaller than x: should not happen
		cur->bid = 1;
		cur->w = max - (++n_replaced);
	}
}
/* Index of the highest set bit of v, i.e. floor(log2(v)); returns 0 for
 * both v == 0 and v == 1. Implemented as a binary search over the shift
 * widths 16, 8, 4, 2, 1. */
static inline int int_log2(uint32_t v)
{
	int width, bit = 0;
	for (width = 16; width != 0; width >>= 1) {
		if (v >> width) { // something is set in the upper half of the remaining range
			v >>= width;
			bit += width;
		}
	}
	return bit;
}
/* Best-first search for hits of seq[0,len) with a bounded number of
 * mismatches and gaps.
 *
 * Partial matches live on `stack`, bucketed by score; the lowest-scoring
 * entry is expanded first. Each entry carries the number of bases still to
 * match (i), an interval (k,l), its difference counts and the state
 * (match / insertion / deletion) it was created in.
 *
 * width / seed_width: per-position bounds used for pruning; seed_width may
 *   be NULL, in which case no seed constraint is applied. width is modified
 *   by gap_shadow() whenever a hit is recorded.
 * _n_aln: receives the number of entries in the returned array.
 * stack: caller-provided work area; it is reset here.
 *
 * Returns a heap-allocated array (calloc/realloc) of hits; it is non-NULL
 * even when no hit is found, and must be freed by the caller.
 */
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width,
bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
{ // $seq is the reverse complement of the input read
// best_score starts above anything reachable within the configured limits
int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
int best_cnt = 0;
int max_entries = 0, j, _j, n_aln, m_aln;
bwt_aln1_t *aln;
m_aln = 4; n_aln = 0;
aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t));
// check whether there are too many N
for (j = _j = 0; j < len; ++j)
if (seq[j] > 3) ++_j;
if (_j > max_diff) {
*_n_aln = n_aln;
return aln;
}
//for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w);
gap_reset_stack(stack); // reset stack
// seed the search: nothing matched yet, interval [0, seq_len], no differences, state 0 (STATE_M)
gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt);
while (stack->n_entries) {
gap_entry_t e;
int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
if (max_entries < stack->n_entries) max_entries = stack->n_entries;
if (stack->n_entries > opt->max_entries) break; // give up: the search space grew too large
gap_pop(stack, &e); // get the best entry
k = e.k; l = e.l; // SA interval
i = e.info&0xffff; // length
if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed
// m: number of differences this entry may still spend
m = max_diff - (e.n_mm + e.n_gapo);
if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
if (m < 0) continue;
if (seed_width) { // apply seeding
m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
}
//printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos);
// prune when the remaining budget is below the bound stored for the unmatched part
if (i > 0 && m < width[i-1].bid) continue;
// check whether a hit is found
hit_found = 0;
if (i == 0) hit_found = 1;
else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1;
else continue; // no hit, skip
}
if (hit_found) { // action for found hits
int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt);
int do_add = 1;
//printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l);
if (n_aln == 0) {
// first hit: it fixes the best score and, unless NONSTOP, tightens max_diff
best_score = score;
best_diff = e.n_mm + e.n_gapo;
if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape;
if (!(opt->mode & BWA_MODE_NONSTOP))
max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour
}
if (score == best_score) best_cnt += l - k + 1;
else if (best_cnt > opt->max_top2) break; // top2b behaviour
if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat
for (j = 0; j != n_aln; ++j)
if (aln[j].k == k && aln[j].l == l) break;
if (j < n_aln) do_add = 0;
}
if (do_add) { // append
bwt_aln1_t *p;
gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width);
if (n_aln == m_aln) {
// double the array and zero the newly added half
m_aln <<= 1;
aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t));
memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t));
}
p = aln + n_aln;
p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape;
p->n_ins = e.n_ins; p->n_del = e.n_del;
p->k = k; p->l = l;
p->score = score;
//fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del);
++n_aln;
}
continue;
}
// no hit yet: extend the entry by one more base, seq[i] after the decrement
--i;
bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values
occ = l - k + 1;
// test whether diff is allowed
allow_diff = allow_M = 1;
if (i > 0) {
int ii = i - (len - opt->seed_len); // position relative to the seed_width array
if (width[i-1].bid > m-1) allow_diff = 0;
else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
if (seed_width && ii > 0) {
if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
&& seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
}
}
// indels
// tmp: minimum distance from either read end at which an indel may be placed, on top of indel_end_skip
tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape;
if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) {
if (e.state == STATE_M) { // gap open
if (e.n_gapo < opt->max_gapo) { // gap open is allowed
// insertion
gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt);
// deletion
for (j = 0; j != 4; ++j) {
k = bwt->L2[j] + cnt_k[j] + 1;
l = bwt->L2[j] + cnt_l[j];
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt);
}
}
} else if (e.state == STATE_I) { // extension of an insertion
if (e.n_gape < opt->max_gape) // gap extension is allowed
gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt);
} else if (e.state == STATE_D) { // extension of a deletion
if (e.n_gape < opt->max_gape) { // gap extension is allowed
if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
for (j = 0; j != 4; ++j) {
k = bwt->L2[j] + cnt_k[j] + 1;
l = bwt->L2[j] + cnt_l[j];
if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt);
}
}
}
}
}
// mismatches
if (allow_diff && allow_M) { // mismatch is allowed
// j = 1..3 tries the three other bases; j = 4 wraps around to seq[i] itself (an exact match unless seq[i] is ambiguous)
for (j = 1; j <= 4; ++j) {
int c = (seq[i] + j) & 3;
int is_mm = (j != 4 || seq[i] > 3);
k = bwt->L2[c] + cnt_k[c] + 1;
l = bwt->L2[c] + cnt_l[c];
if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt);
}
} else if (seq[i] < 4) { // try exact match only
int c = seq[i] & 3;
k = bwt->L2[c] + cnt_k[c] + 1;
l = bwt->L2[c] + cnt_l[c];
if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt);
}
}
*_n_aln = n_aln;
//fprintf(stderr, "max_entries = %d\n", max_entries);
return aln;
}

40
bwtgap.h 100644
View File

@ -0,0 +1,40 @@
#ifndef BWTGAP_H_
#define BWTGAP_H_
#include "bwt.h"
#include "bwtaln.h"
/* One partial match queued by gap_push() and expanded by bwt_match_gap(). */
typedef struct { // recursion stack
uint32_t info; // score<<21 | i
uint32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; // mismatches, gap opens, gap extensions, STATE_M/I/D; n_seed_mm is not written by gap_push()
uint32_t n_ins:16, n_del:16; // inserted / deleted bases so far
int last_diff_pos; // position i of the entry if it was created by a difference, 0 otherwise
bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
} gap_entry_t;
/* A growable LIFO of entries that all share the same score. */
typedef struct {
int n_entries, m_entries; // used / allocated number of entries
gap_entry_t *stack;
} gap_stack1_t;
/* Score-indexed collection of buckets: stacks[s] holds the entries with score s. */
typedef struct {
int n_stacks, best, n_entries; // number of buckets; lowest non-empty score (n_stacks when empty); total entries over all buckets
gap_stack1_t *stacks;
} gap_stack_t;
#ifdef __cplusplus
extern "C" {
#endif
gap_stack_t *gap_init_stack2(int max_score);
gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
void gap_destroy_stack(gap_stack_t *stack);
bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w,
bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
#ifdef __cplusplus
}
#endif
#endif

323
bwtindex.c 100644
View File

@ -0,0 +1,323 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "bwt.h"
#include "utils.h"
#include "rle.h"
#include "rope.h"
#ifdef _DIVBWT
#include "divsufsort.h"
#endif
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
int is_bwt(ubyte_t *T, int n);
/* Compute the sequence length stored in a .pac file: four bases for every
 * byte before the final one, plus the value of the final byte itself.
 * File errors are handled by the x/err_ wrappers. */
int64_t bwa_seq_len(const char *fn_pac)
{
	FILE *fp = xopen(fn_pac, "rb");
	int64_t last_off;
	ubyte_t tail;

	err_fseek(fp, -1, SEEK_END); // position on the final byte
	last_off = err_ftell(fp);
	err_fread_noeof(&tail, 1, 1, fp);
	err_fclose(fp);
	return 4 * (last_off - 1) + (int)tail;
}
/* Build the BWT of the 2-bit packed sequence stored in fn_pac.
 *
 * use_is: non-zero selects is_bwt(); zero builds the BWT incrementally
 *   with the rope/RLE structure.
 *
 * Returns a newly allocated bwt_t holding seq_len, the cumulative counts
 * L2[], the primary index and the BWT packed 16 symbols per 32-bit word.
 * As the caller bwa_pac2bwt() notes, the result carries no occurrence
 * checkpoints yet; bwt_bwtupdate_core() adds them.
 */
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
{
	bwt_t *bwt;
	ubyte_t *buf, *buf2;
	int64_t i, pac_size;
	FILE *fp;

	// initialization
	bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
	bwt->seq_len = bwa_seq_len(fn_pac);
	bwt->bwt_size = (bwt->seq_len + 15) >> 4;
	fp = xopen(fn_pac, "rb");

	// prepare sequence
	pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
	buf2 = (ubyte_t*)calloc(pac_size, 1);
	err_fread_noeof(buf2, 1, pac_size, fp);
	err_fclose(fp);
	memset(bwt->L2, 0, sizeof(bwt->L2)); // clear the whole array whatever its element width
	buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
	for (i = 0; i < bwt->seq_len; ++i) {
		buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; // unpack one base per byte
		++bwt->L2[1+buf[i]];
	}
	for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; // counts -> cumulative counts
	free(buf2);

	// Burrows-Wheeler Transform
	if (use_is) {
		bwt->primary = is_bwt(buf, bwt->seq_len);
	} else {
		rope_t *r;
		int64_t x;
		rpitr_t itr;
		const uint8_t *blk;
		r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
		// insert the symbols from last to first, tracking the insertion point x
		for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
			int c = buf[i] + 1;
			x = rope_insert_run(r, x, c, 1, 0) + 1;
			while (--c >= 0) x += r->c[c];
		}
		bwt->primary = x;
		// decode the run-length blocks back into buf, one symbol per byte
		rope_itr_first(r, &itr);
		x = 0;
		while ((blk = rope_itr_next_block(&itr)) != 0) {
			const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
			while (q < end) {
				int c = 0;
				int64_t l;
				rle_dec1(q, c, l);
				for (i = 0; i < l; ++i)
					buf[x++] = c - 1;
			}
		}
		rope_destroy(r);
	}

	// pack 16 symbols per word; shift in uint32_t because symbol 3 at shift 30 does not fit a signed int
	bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
	for (i = 0; i < bwt->seq_len; ++i)
		bwt->bwt[i>>4] |= (uint32_t)buf[i] << ((15 - (i&15)) << 1);
	free(buf);
	return bwt;
}
/* Command-line entry point: read <in.pac>, build its BWT and write it to
 * <out.bwt>. The -d flag switches from is_bwt() to the rope-based builder.
 * Returns 0 on success and 1 on a usage error. */
int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
{
	int opt, use_is = 1;
	bwt_t *bwt;

	while ((opt = getopt(argc, argv, "d")) >= 0) {
		if (opt != 'd') return 1; // unknown option
		use_is = 0;
	}
	if (argc < optind + 2) {
		fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
		return 1;
	}
	bwt = bwt_pac2bwt(argv[optind], use_is);
	bwt_dump_bwt(argv[optind+1], bwt);
	bwt_destroy(bwt);
	return 0;
}
/* Symbol k of a BWT packed 16 symbols per 32-bit word, with no interleaved counters. */
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
/* Rebuild bwt->bwt with occurrence checkpoints interleaved: before every
 * OCC_INTERVAL symbols the running counts of the four symbols are stored,
 * and one final set of counts follows the last symbol. bwt->bwt_size is
 * updated to the new size in 32-bit words and the old array is freed. */
void bwt_bwtupdate_core(bwt_t *bwt)
{
bwtint_t i, k, c[4], n_occ;
uint32_t *buf;
// number of checkpoints, including the trailing one
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
// each checkpoint holds 4 counters of sizeof(bwtint_t) bytes, i.e. sizeof(bwtint_t) 32-bit words
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
c[0] = c[1] = c[2] = c[3] = 0;
// i walks over symbols; k is the write position in buf, in 32-bit words
for (i = k = 0; i < bwt->seq_len; ++i) {
if (i % OCC_INTERVAL == 0) {
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
}
if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
++c[bwt_B00(bwt, i)];
}
// the last element
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
// update bwt
free(bwt->bwt); bwt->bwt = buf;
}
/* Command-line entry point: load <the.bwt>, interleave the occurrence
 * checkpoints and write the result back to the same file. Returns 0 on
 * success and 1 on a usage error. */
int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
{
	if (argc == 2) {
		bwt_t *bwt = bwt_restore_bwt(argv[1]);
		bwt_bwtupdate_core(bwt);
		bwt_dump_bwt(argv[1], bwt);
		bwt_destroy(bwt);
		return 0;
	}
	fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
	return 1;
}
/* Command-line entry point: load <in.bwt>, compute the sampled suffix
 * array with the interval given by -i (default 32) and write it to
 * <out.sa>. Returns 0 on success and 1 on a usage error. */
int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
{
	int opt, sa_intv = 32;
	bwt_t *bwt;

	while ((opt = getopt(argc, argv, "i:")) >= 0) {
		if (opt != 'i') return 1; // unknown option
		sa_intv = atoi(optarg);
	}
	if (argc < optind + 2) {
		fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
		return 1;
	}
	bwt = bwt_restore_bwt(argv[optind]);
	bwt_cal_sa(bwt, sa_intv);
	bwt_dump_sa(argv[optind+1], bwt);
	bwt_destroy(bwt);
	return 0;
}
/* Command-line entry point for building all index files of a FASTA file.
 *
 * Options: -a selects the BWT construction algorithm, -p sets the output
 * prefix, -6 appends ".64" to the default prefix, -b sets the bwtsw block
 * size and accepts a K/M/G suffix (powers of 1024).
 *
 * Returns 0 on success and 1 on a usage error. An unknown algorithm name
 * or a block size that does not fit in an int is reported via err_fatal().
 */
int bwa_index(int argc, char *argv[]) // the "index" command
{
	int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
	char *prefix = 0, *str;
	while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
		switch (c) {
		case 'a': // if -a is not set, algo_type will be determined later
			if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2;
			else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW;
			else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS;
			else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
			break;
		case 'p':
			free(prefix); // a repeated -p replaces the earlier value instead of leaking it
			prefix = strdup(optarg);
			break;
		case '6': is_64 = 1; break;
		case 'b': {
			long n = strtol(optarg, &str, 10);
			int shift = 0;
			int64_t scaled;
			if (*str == 'G' || *str == 'g') shift = 30;
			else if (*str == 'M' || *str == 'm') shift = 20;
			else if (*str == 'K' || *str == 'k') shift = 10;
			// check the bare number first so the 64-bit product below cannot overflow
			if (n != (int)n) err_fatal(__func__, "block size '%s' is out of range.", optarg);
			scaled = (int64_t)n * ((int64_t)1 << shift);
			if (scaled != (int)scaled) err_fatal(__func__, "block size '%s' is out of range.", optarg);
			block_size = (int)scaled;
			break;
		}
		default:
			free(prefix);
			return 1;
		}
	}
	if (optind + 1 > argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
		fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
		fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
		fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
		fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
		fprintf(stderr, "\n");
		fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
		fprintf(stderr, " `-a div' do not work for long genomes.\n\n");
		free(prefix);
		return 1;
	}
	if (prefix == 0) {
		prefix = malloc(strlen(argv[optind]) + 4); // room for ".64" and the terminator
		strcpy(prefix, argv[optind]);
		if (is_64) strcat(prefix, ".64");
	}
	bwa_idx_build(argv[optind], prefix, algo_type, block_size);
	free(prefix);
	return 0;
}
/* Build the index files for FASTA file `fa` under `prefix`, in this order:
 *   1. pack the FASTA (bns_fasta2bntseq with last argument 0);
 *   2. construct <prefix>.bwt from <prefix>.pac;
 *   3. add the occurrence checkpoints to <prefix>.bwt in place;
 *   4. pack the FASTA again with last argument 1 (forward-only);
 *   5. compute the sampled suffix array (interval 32) into <prefix>.sa.
 * Progress and timings go to stderr when bwa_verbose >= 3.
 *
 * algo_type: 0 picks automatically from the packed length; 2 uses
 *   bwt_bwtgen2(); 1 and 3 use bwt_pac2bwt() without / with is_bwt().
 *   NOTE(review): these literals presumably mirror the BWTALGO_* constants
 *   used by bwa_index() -- confirm against their definition.
 * block_size: forwarded to bwt_bwtgen2() only.
 * Always returns 0.
 */
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size)
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *str, *str2, *str3;
clock_t t;
int64_t l_pac;
// scratch buffers for "<prefix>.<ext>"; 10 extra bytes cover the extensions used below
str = (char*)calloc(strlen(prefix) + 10, 1);
str2 = (char*)calloc(strlen(prefix) + 10, 1);
str3 = (char*)calloc(strlen(prefix) + 10, 1);
{ // nucleotide indexing
gzFile fp = xzopen(fa, "r");
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
{
strcpy(str, prefix); strcat(str, ".pac");
strcpy(str2, prefix); strcat(str2, ".bwt");
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
else if (algo_type == 1 || algo_type == 3) {
bwt_t *bwt;
bwt = bwt_pac2bwt(str, algo_type == 3);
bwt_dump_bwt(str2, bwt);
bwt_destroy(bwt);
}
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
// reload the raw BWT, interleave the occurrence counts and overwrite the file
bwt_t *bwt;
strcpy(str, prefix); strcat(str, ".bwt");
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... ");
bwt = bwt_restore_bwt(str);
bwt_bwtupdate_core(bwt);
bwt_dump_bwt(str, bwt);
bwt_destroy(bwt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
// second packing pass; the returned length is stored but not used afterwards
gzFile fp = xzopen(fa, "r");
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 1);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
{
bwt_t *bwt;
strcpy(str, prefix); strcat(str, ".bwt");
strcpy(str3, prefix); strcat(str3, ".sa");
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
bwt = bwt_restore_bwt(str);
bwt_cal_sa(bwt, 32);
bwt_dump_sa(str3, bwt);
bwt_destroy(bwt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
free(str3); free(str2); free(str);
return 0;
}

69
bwtsw2.h 100644
View File

@ -0,0 +1,69 @@
#ifndef LH3_BWTSW2_H
#define LH3_BWTSW2_H
#include <stdint.h>
#include "bntseq.h"
#include "bwt_lite.h"
#include "bwt.h"
#define BSW2_FLAG_MATESW 0x100
#define BSW2_FLAG_TANDEM 0x200
#define BSW2_FLAG_MOVED 0x400
#define BSW2_FLAG_RESCUED 0x800
/* Options of the bwtsw2 aligner; bsw2_init_opt() supplies the defaults quoted below. */
typedef struct {
int skip_sw:8, cpy_cmt:8, hard_clip:16; // all default to 0
int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; // a=1 and b=3 fill the scoring matrix; q=5, r=2; t=30; qr=q+r; bw=50; max_ins=20000; max_chain_gap=10000
int z, is, t_seeds, multi_2nd; // z=1, is=3, t_seeds=5
float mask_level, coef; // mask_level=0.50 is passed to bsw2_resolve_query_overlaps(); coef=5.5
int n_threads, chunk_size; // 1 thread, chunk_size=10000000
} bsw2opt_t;
/* One candidate hit of the bwtsw2 aligner. */
typedef struct {
bwtint_t k, l; // k is used as the reference start once l == 0; hits with l != 0 are skipped by the extension and CIGAR code
uint32_t flag:18, n_seeds:13, is_rev:1; // flag: 0x10 set by merge_hits() for the reverse strand, 0x10000/0x20000 set by flag_fr(); n_seeds saturates below 1<<13
int len, G, G2; // len: length on the reference; G: score (G2 presumably a second-best score -- confirm)
int beg, end; // query interval, printed as [beg,end) by bsw2_debug_hits()
} bsw2hit_t;
typedef struct {
int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm;
uint32_t *cigar;
} bsw2aux_t;
typedef struct {
int n, max;
bsw2hit_t *hits;
bsw2aux_t *aux;
} bwtsw2_t;
typedef struct {
void *stack;
int max_l;
uint8_t *aln_mem;
} bsw2global_t;
typedef struct {
int l, tid;
char *name, *seq, *qual, *sam, *comment;
} bsw2seq1_t;
#ifdef __cplusplus
extern "C" {
#endif
bsw2opt_t *bsw2_init_opt();
bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool);
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2);
void bsw2_destroy(bwtsw2_t *b);
bsw2global_t *bsw2_global_init();
void bsw2_global_destroy(bsw2global_t *_pool);
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit);
#ifdef __cplusplus
}
#endif
#endif

776
bwtsw2_aux.c 100644
View File

@ -0,0 +1,776 @@
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif
#include "bntseq.h"
#include "bwt_lite.h"
#include "utils.h"
#include "bwtsw2.h"
#include "kstring.h"
#include "bwa.h"
#include "ksw.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
#include "ksort.h"
#define __left_lt(a, b) ((a).end > (b).end)
KSORT_INIT(hit, bsw2hit_t, __left_lt)
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
extern unsigned char nst_nt4_table[256];
/* Complement lookup indexed by character code. IUPAC nucleotide letters map
 * to their complement in the same case (e.g. 'A'->'T', 'r'->'y'); all other
 * codes map to 'N', or to 'n' within the lowercase row. */
unsigned char nt_comp_table[256] = {
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n',
'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
};
extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level);
/* Allocate a bsw2opt_t populated with the default settings. The caller
 * owns the returned object and releases it with free(). */
bsw2opt_t *bsw2_init_opt()
{
	bsw2opt_t *opt = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t));

	// scoring
	opt->a = 1;
	opt->b = 3;
	opt->q = 5;
	opt->r = 2;
	opt->t = 30;
	opt->qr = opt->q + opt->r;
	opt->bw = 50;

	// search and chaining
	opt->max_ins = 20000;
	opt->max_chain_gap = 10000;
	opt->z = 1;
	opt->is = 3;
	opt->t_seeds = 5;
	opt->mask_level = 0.50f;
	opt->coef = 5.5f;

	// output flags
	opt->hard_clip = 0;
	opt->skip_sw = 0;
	opt->cpy_cmt = 0;

	// execution
	opt->n_threads = 1;
	opt->chunk_size = 10000000;
	return opt;
}
/* Release a hit set: every per-hit CIGAR, the aux and hits arrays and the
 * container itself. A NULL argument is ignored. */
void bsw2_destroy(bwtsw2_t *b)
{
	if (b != 0) {
		if (b->aux != 0) {
			int idx = 0;
			while (idx < b->n) {
				free(b->aux[idx].cigar);
				++idx;
			}
		}
		free(b->aux);
		free(b->hits);
		free(b);
	}
}
/* Duplicate a hit set without its aux/CIGAR data: only the hits array is
 * copied, with its capacity rounded up by kroundup32(). An empty input
 * yields an empty container with no hits array. */
bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
{
	bwtsw2_t *dup = calloc(1, sizeof(bwtsw2_t));
	dup->n = b->n;
	dup->max = b->n;
	if (b->n == 0) return dup;
	kroundup32(dup->max);
	dup->hits = calloc(dup->max, sizeof(bsw2hit_t));
	memcpy(dup->hits, b->hits, dup->n * sizeof(bsw2hit_t));
	return dup;
}
/* Fill alignment parameters `par` from options `opt`: a 5x5 matrix with
 * (opt)->a on the first four diagonal cells and -(opt)->b elsewhere, gap
 * open/extension/end penalties, row count and band width. `opt` is a
 * pointer expression and is parenthesised at every use; the loop counter
 * has a macro-specific name so it cannot capture an `i` in the arguments. */
#define __gen_ap(par, opt) do { \
		int gen_ap_i_; \
		for (gen_ap_i_ = 0; gen_ap_i_ < 25; ++gen_ap_i_) (par).matrix[gen_ap_i_] = -(opt)->b; \
		for (gen_ap_i_ = 0; gen_ap_i_ < 4; ++gen_ap_i_) (par).matrix[gen_ap_i_*5+gen_ap_i_] = (opt)->a; \
		(par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \
		(par).gap_end = (opt)->r; \
		(par).row = 5; (par).band_width = (opt)->bw; \
	} while (0)
/* Try to extend each hit of b leftwards (towards lower query and reference
 * coordinates) with ksw_extend().
 *
 * Hits are first sorted by descending query end. A hit is skipped when
 * l != 0 or k == 0, or when an earlier hit in that order already covers it
 * on both query start and reference span; the covering hit's seed count is
 * incremented instead. On a successful extension G, k, len and beg are
 * updated in place. l_pac and _mem are not used here.
 */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
int i;
bwtint_t k;
uint8_t *target = 0, *query;
int8_t mat[25];
bwa_fill_scmat(opt->a, opt->b, mat);
query = calloc(lq, 1);
// sort according to the descending order of query end
ks_introsort(hit, b->n, b->hits);
// sized for the largest target any hit can request (p->beg is at most lq)
target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
// reverse _query
for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i];
// core loop
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum reference length to fetch
int score, j, qle, tle;
p->n_seeds = 1;
if (p->l || p->k == 0) continue;
// count earlier hits that contain this one; `score` doubles as that counter
for (j = score = 0; j < i; ++j) {
bsw2hit_t *q = b->hits + j;
if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
if (q->n_seeds < (1<<13) - 2) ++q->n_seeds;
++score;
}
}
if (score) continue;
if (lt > p->k) lt = p->k;
// fetch the reference bases left of k, nearest first
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
// extend the reversed query prefix against the reversed reference, starting from the current score
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0);
if (score > p->G) { // extensible
p->G = score;
p->k -= tle;
p->len += tle;
p->beg -= qle;
}
}
free(query); free(target);
}
/* Re-extend each hit of b rightwards from its start (query beg, reference
 * k) with ksw_extend(). Hits with l != 0 are skipped. When the resulting
 * score is at least the current G, then G, len and end are replaced by the
 * new extension. _mem is not used here.
 */
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
int i;
bwtint_t k;
uint8_t *target;
int8_t mat[25];
bwa_fill_scmat(opt->a, opt->b, mat);
// sized for the largest target any hit can request
target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum reference length to fetch
int j, score, qle, tle;
if (p->l) continue;
// fetch up to lt reference bases starting at k, stopping at l_pac
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
// the extension starts from an initial score of 1, which is subtracted again
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1;
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
if (score >= p->G) {
p->G = score;
p->len = tle;
p->end = p->beg + qle;
}
}
free(target);
}
/* Generate the CIGAR of every hit with l == 0 into b->aux[i].cigar.
 *
 * seq[0] or seq[1] is chosen according to flag bit 0x10, and the query
 * interval is mirrored for the latter. When the aligned interval does not
 * cover the whole query, soft-clip operations (op code 4, length in the
 * upper bits) are added at the front and/or back. `name` is only used by
 * the disabled debugging block.
 */
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name)
{
int i;
int8_t mat[25];
bwa_fill_scmat(opt->a, opt->b, mat);
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
bsw2aux_t *q = b->aux + i;
uint8_t *query;
int beg, end, score;
if (p->l) continue;
// map [beg,end) onto the sequence that will actually be aligned
beg = (p->flag & 0x10)? lq - p->end : p->beg;
end = (p->flag & 0x10)? lq - p->beg : p->end;
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm);
#if 0
if (name && score != p->G) { // debugging only
int j, glen = 0;
for (j = 0; j < q->n_cigar; ++j)
if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2)
glen += q->cigar[j]>>4;
fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n",
__func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw);
}
#endif
if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping
// room for up to two extra 4-byte operations
q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2));
if (beg != 0) {
// shift the existing operations right and prepend the leading clip
memmove(q->cigar + 1, q->cigar, q->n_cigar * 4);
q->cigar[0] = beg<<4 | 4;
++q->n_cigar;
}
if (end < lq) {
q->cigar[q->n_cigar] = (lq - end)<<4 | 4;
++q->n_cigar;
}
}
}
}
/* Debugging aid: print the number of hits and, for every hit with a
 * positive score G, its fields on one line to stdout. */
void bsw2_debug_hits(const bwtsw2_t *b)
{
	int i;
	printf("# raw hits: %d\n", b->n);
	for (i = 0; i < b->n; ++i) {
		bsw2hit_t *p = b->hits + i;
		if (p->G > 0) // %lu takes an unsigned long, so cast k and l accordingly
			printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (unsigned long)p->k, (unsigned long)p->l, p->n_seeds, p->is_rev);
	}
}
/* Append all hits of b[1] to b[0], then destroy b[1] and set it to NULL.
 * When is_reverse is set, each appended hit has its query interval mirrored
 * within a query of length l and flag bit 0x10 set. */
static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse)
{
	bwtsw2_t *dst = b[0], *src = b[1];
	int total = dst->n + src->n, j;

	if (total > dst->max) { // grow to exactly the combined size
		dst->max = total;
		dst->hits = realloc(dst->hits, dst->max * sizeof(bsw2hit_t));
	}
	for (j = 0; j < src->n; ++j) {
		bsw2hit_t *h = &dst->hits[dst->n + j];
		*h = src->hits[j];
		if (!is_reverse) continue;
		{
			int old_beg = h->beg;
			h->beg = l - h->end;
			h->end = l - old_beg;
			h->flag |= 0x10;
		}
	}
	dst->n = total;
	bsw2_destroy(src);
	b[1] = 0;
}
/* seq[0] is the forward sequence and seq[1] is the reverse complement. */
/* Align one query of length l: run bsw2_core() on a BWT built from seq[0],
 * split the raw hits by strand, chain-filter, extend left and right, merge
 * both strands and resolve overlapping hits. Returns the merged hit set,
 * which the caller releases with bsw2_destroy(). */
static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
int l, uint8_t *seq[2], bsw2global_t *pool)
{
extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
bwtsw2_t *b[2], **bb[2], **_b, *p;
int k, j;
bwtl_t *query;
query = bwtl_seq2bwtl(l, seq[0]);
_b = bsw2_core(bns, opt, query, target, pool);
bwtl_destroy(query);
// bb[strand][k]: hits of _b[k] that fall on the given strand
for (k = 0; k < 2; ++k) {
bb[k] = calloc(2, sizeof(void*));
bb[k][0] = calloc(1, sizeof(bwtsw2_t));
bb[k][1] = calloc(1, sizeof(bwtsw2_t));
}
for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand
for (j = 0; j < _b[k]->n; ++j) {
bsw2hit_t *q;
p = bb[_b[k]->hits[j].is_rev][k];
if (p->n == p->max) {
p->max = p->max? p->max<<1 : 8;
p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t));
}
q = &p->hits[p->n++];
*q = _b[k]->hits[j];
if (_b[k]->hits[j].is_rev) {
// express reverse-strand hits in the coordinates of seq[1]
int x = q->beg;
q->beg = l - q->end;
q->end = l - x;
}
}
}
b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained
for (k = 0; k < 2; ++k) {
bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem);
merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
bsw2_resolve_duphits(0, 0, bb[k][0], 0);
bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem);
bsw2_resolve_duphits(0, 0, bb[k][0], 0);
b[k] = bb[k][0];
free(bb[k]);
}
merge_hits(b, l, 1); // again, b[1] is merged to b[0]
bsw2_resolve_query_overlaps(b[0], opt->mask_level);
bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b);
return b[0];
}
/* Record in ->flag where each hit came from: hits of b[0] get 0x10000 and
 * hits of b[1] get 0x20000. A hit of b[0] that has an identical partner in
 * b[1] (same beg, end, k, len and G) is marked 0x30000 together with the
 * first such partner. */
static void flag_fr(bwtsw2_t *b[2])
{
	bwtsw2_t *first = b[0], *second = b[1];
	int a, c;

	for (a = 0; a < first->n; ++a) first->hits[a].flag |= 0x10000;
	for (c = 0; c < second->n; ++c) second->hits[c].flag |= 0x20000;
	for (a = 0; a < first->n; ++a) {
		bsw2hit_t *x = &first->hits[a];
		for (c = 0; c < second->n; ++c) {
			bsw2hit_t *y = &second->hits[c];
			if (y->beg != x->beg || y->end != x->end || y->k != x->k || y->len != x->len || y->G != x->G)
				continue;
			y->flag |= 0x30000;
			x->flag |= 0x30000;
			break; // only the first identical partner is marked
		}
	}
}
/* A batch of reads: seq[] holds n entries in use out of max allocated. */
typedef struct {
int n, max;
bsw2seq1_t *seq;
} bsw2seq_t;
/* Trim an alignment that runs past the end of its reference sequence.
 * cigar[] holds n_cigar operations encoded as length<<4|op (op 0=M, 1=I, 2=D,
 * 4=S, 5=H as printed by print_hits()). When the reference span exceeds the
 * sequence containing p->k, the CIGAR is split at the boundary, the side with
 * more matched bases is kept and the other side is turned into soft clipping.
 * cigar[] is rewritten in place, p->k and p->len are updated as needed, and
 * the new number of operations is returned. */
static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
{
// FIXME: this routine does not work if the query bridges three reference sequences
int32_t coor, refl, lq;
int x, y, i, seqid;
bns_cnt_ambi(bns, p->k, p->len, &seqid);
coor = p->k - bns->anns[seqid].offset; // start relative to the containing sequence
refl = bns->anns[seqid].len;
x = coor; y = 0; // x walks the reference, y walks the query
// test if the alignment goes beyond the boundary
for (i = 0; i < n_cigar; ++i) {
int op = cigar[i]&0xf, ln = cigar[i]>>4;
if (op == 1 || op == 4 || op == 5) y += ln;
else if (op == 2) x += ln;
else x += ln, y += ln;
}
lq = y; // length of the query sequence
if (x > refl) { // then fix it
// cn[] receives two back-to-back CIGARs: cn[0..nc) for the part before the
// boundary and cn[nc..j) for the part after it. mq[] counts matched bases
// and nlen[] the reference length of each part; kk is the start of part two.
int j, nc, mq[2], nlen[2];
uint32_t *cn;
bwtint_t kk = 0;
nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0;
cn = calloc(n_cigar + 3, 4);
x = coor; y = 0;
for (i = j = 0; i < n_cigar; ++i) {
int op = cigar[i]&0xf, ln = cigar[i]>>4;
if (op == 4 || op == 5 || op == 1) { // ins or clipping
y += ln;
cn[j++] = cigar[i];
} else if (op == 2) { // del
if (x + ln >= refl && nc == 0) { // the deletion crosses the boundary: drop it and split here
cn[j++] = (uint32_t)(lq - y)<<4 | 4;
nc = j;
cn[j++] = (uint32_t)y<<4 | 4;
kk = p->k + (x + ln - refl);
nlen[0] = x - coor;
nlen[1] = p->len - nlen[0] - ln;
} else cn[j++] = cigar[i];
x += ln;
} else if (op == 0) { // match
if (x + ln >= refl && nc == 0) { // the match crosses the boundary: split it in two
// FIXME: not consider a special case where a split right between M and I
cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M
cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S
nc = j;
mq[0] += refl - x;
cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4;
if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0;
mq[1] += x + ln - refl;
kk = bns->anns[seqid].offset + refl;
nlen[0] = refl - coor;
nlen[1] = p->len - nlen[0];
} else {
cn[j++] = cigar[i];
mq[nc?1:0] += ln;
}
x += ln; y += ln;
}
}
if (mq[0] > mq[1]) { // then take the first alignment
n_cigar = nc;
memcpy(cigar, cn, 4 * nc);
p->len = nlen[0];
} else { // otherwise keep the part after the boundary
p->k = kk; p->len = nlen[1];
n_cigar = j - nc;
memcpy(cigar, cn + nc, 4 * (j - nc));
}
free(cn);
}
return n_cigar;
}
/* Allocate and fill b->aux, one bsw2aux_t per hit of b: CIGAR (through
 * gen_cigar() and fix_cigar()), mapping quality, and the sequence index and
 * offset of the hit. qlen is the read length, seq[] the encoded read that is
 * handed to gen_cigar(), and name the read name passed along to it. */
static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
{
int i;
// allocate for b->aux
if (b->n<<1 < b->max) { // shrink hits[] when less than half of it is in use
b->max = b->n;
kroundup32(b->max);
b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t));
}
b->aux = calloc(b->n, sizeof(bsw2aux_t));
// generate CIGAR
gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name);
// fix CIGAR, generate mapQ, and write chromosomal position
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = &b->hits[i];
bsw2aux_t *q = &b->aux[i];
q->flag = p->flag & 0xfe; // keep bits 1..7 of the hit flag, clear bit 0
q->isize = 0;
if (p->l == 0) { // unique hit
float c = 1.0;
int subo;
// fix out-of-boundary CIGAR
q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar);
// compute mapQ
subo = p->G2 > opt->t? p->G2 : opt->t; // sub-optimal score, at least the threshold t
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; // origin bits (see flag_fr) show only one source
if (p->n_seeds < 2) c *= .2;
q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
if (q->qual > 250) q->qual = 250;
if (q->qual < 0) q->qual = 0;
if (p->flag&1) q->qual = 0; // this is a random hit
q->pqual = q->qual; // set the paired qual as qual
// get the chromosomal position
q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr);
q->pos = p->k - bns->anns[q->chr].offset;
} else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0;
}
}
/* Fill the mate-related fields of b->aux from the mate's hit list m and then
 * adjust the paired mapping quality. Does nothing when m is 0. Mate position
 * and insert size are only set when the mate has exactly one hit. */
static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m)
{
int i;
if (m == 0) return;
// update flag, mchr and mpos
for (i = 0; i < b->n; ++i) {
bsw2aux_t *q = &b->aux[i];
q->flag |= 1; // paired
if (m->n == 0) q->flag |= 8; // mate unmapped
if (m->n == 1) {
q->mchr = m->aux[0].chr;
q->mpos = m->aux[0].pos;
if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand
if (q->chr == q->mchr) { // set insert size
if (q->mpos + m->hits[0].len > q->pos)
q->isize = q->mpos + m->hits[0].len - q->pos;
// NOTE(review): this uses b->hits[0].len although the loop is over hit i - confirm intended
else q->isize = q->mpos - q->pos - b->hits[0].len;
} else q->isize = 0;
} else q->mchr = q->mpos = -1;
}
// update mapping quality
if (b->n == 1 && m->n == 1) {
bsw2hit_t *p = &b->hits[0];
if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman
// raise pqual to at least 20 unless tandem, then cap it at the mate's qual
if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20)
b->aux[0].pqual = 20;
if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
} else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired
if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual]
b->aux[0].pqual += 20;
if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual;
}
}
}
}
/* Generate SAM lines for the sequence in ks with the alignments stored in b
 * and keep the resulting text in ks->sam. A read without hits (b is 0 or
 * empty) yields a single unmapped record. ks->name, ks->seq and ks->qual are
 * freed and set to NULL at the end. is_pe selects whether mate columns are
 * filled from b->aux; bmate is not referenced in the body. */
static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate)
{
int i, k;
kstring_t str;
memset(&str, 0, sizeof(kstring_t));
if (b == 0 || b->n == 0) { // no hits
ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name);
for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str);
if (ks->qual) {
kputc('\t', &str);
for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str);
} else kputs("\t*", &str);
kputc('\n', &str);
}
for (i = 0; b && i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
bsw2aux_t *q = b->aux + i;
int j, beg, end, type = 0;
// print mandatory fields before SEQ
if (q->cigar == 0) q->flag |= 0x4; // no CIGAR: report as unmapped
// with opt->multi_2nd every hit after the first is flagged 0x100
ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0));
ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1);
if (p->l == 0 && q->cigar) { // not a repetitive hit
ksprintf(&str, "\t%d\t", q->pqual);
// with opt->hard_clip, op code 4 is printed as 'H' instead of 'S'
for (k = 0; k < q->n_cigar; ++k)
ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]);
} else ksprintf(&str, "\t0\t*");
if (!is_pe) kputs("\t*\t0\t0\t", &str);
else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize);
// get the sequence begin and end
beg = 0; end = ks->l;
if (opt->hard_clip && q->cigar) { // leave out the clipped ends of the read
if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4;
if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4;
}
for (j = beg; j < end; ++j) { // flag 0x10: print the reverse complement
if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
else kputc(ks->seq[j], &str);
}
// print base quality if present
if (ks->qual) {
kputc('\t', &str);
for (j = beg; j < end; ++j) { // qualities are reversed along with the bases
if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
else kputc(ks->qual[j], &str);
}
} else kputs("\t*", &str);
// print optional tags
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm);
if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn);
if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
if (p->flag&BSW2_FLAG_MATESW) type |= 1;
if (p->flag&BSW2_FLAG_TANDEM) type |= 2;
if (type) ksprintf(&str, "\tXT:i:%d", type);
if (opt->cpy_cmt && ks->comment) {
int l = strlen(ks->comment);
// the comment is appended only when it looks like a tag ("XX:Y:...")
if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') {
kputc('\t', &str); kputs(ks->comment, &str);
}
}
kputc('\n', &str);
}
ks->sam = str.s; // ownership of the text moves to ks
free(ks->seq); ks->seq = 0;
free(ks->qual); ks->qual = 0;
free(ks->name); ks->name = 0;
}
/* Copy src into dst and adapt two settings to a query of length qlen:
 * the score threshold t is raised to log(qlen)*coef (rounded) when that is
 * larger, and the band width bw is capped by a bound derived from qlen and
 * the scoring parameters (never below 1). */
static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen)
{
	const double log_len = log(qlen);
	int gap_bound, score_bound, band;

	*dst = *src;
	if (dst->t < log_len * dst->coef)
		dst->t = (int)(log_len * dst->coef + .499);
	// set band width: the query length sets a boundary on the maximum band width
	gap_bound = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a);
	score_bound = (qlen * dst->a - dst->a - dst->t) / dst->r;
	band = gap_bound > score_bound ? score_bound : gap_bound;
	if (band < 1) band = 1; // keep the band positive; whether 0 would be harmful is unknown
	dst->bw = src->bw < band ? src->bw : band;
}
/* Core routine to align reads in _seq. It is separated from
 * process_seqs() to realize multi-threading.
 * Three passes are made over the batch: (1) align each read and keep a
 * CIGAR-less copy of its hits in buf[], (2) optionally pair the hits, then
 * build the aux records, (3) produce the SAM text through print_hits().
 * A private bsw2global_t is created on entry and destroyed on exit. */
static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
{
int x;
bsw2opt_t opt;
bsw2global_t *pool = bsw2_global_init();
bwtsw2_t **buf;
buf = calloc(_seq->n, sizeof(void*));
for (x = 0; x < _seq->n; ++x) {
bsw2seq1_t *p = _seq->seq + x;
uint8_t *seq[2], *rseq[2];
int i, l, k;
bwtsw2_t *b[2];
l = p->l;
update_opt(&opt, _opt, p->l); // per-read threshold and band width
if (pool->max_l < l) { // then enlarge working space for aln_extend_core()
int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l;
pool->max_l = l;
pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24);
}
// set seq[2] and rseq[2]
// one allocation of 4*l bytes backs all four arrays; only seq[0] is freed
seq[0] = calloc(l * 4, 1);
seq[1] = seq[0] + l;
rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l;
// convert sequences to 2-bit representation
for (i = k = 0; i < l; ++i) {
int c = nst_nt4_table[(int)p->seq[i]];
if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled
seq[0][i] = c;
seq[1][l-1-i] = 3 - c;
rseq[0][l-1-i] = 3 - c;
rseq[1][i] = c;
}
if (l - k < opt.t) { // too few unambiguous bases
buf[x] = calloc(1, sizeof(bwtsw2_t)); // empty hit list, printed as unmapped
free(seq[0]); continue;
}
// alignment
b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool);
// k stops at the first hit supported by fewer than t_seeds seeds
for (k = 0; k < b[0]->n; ++k)
if (b[0]->hits[k].n_seeds < opt.t_seeds) break;
if (k < b[0]->n) { // a weak hit exists: align again with rseq[] and merge both results
b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool);
for (i = 0; i < b[1]->n; ++i) {
// note: p and x below shadow the outer p and x
bsw2hit_t *p = &b[1]->hits[i];
int x = p->beg;
p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand
p->beg = l - p->end;
p->end = l - x;
}
flag_fr(b);
merge_hits(b, l, 0); // destroys b[1]
bsw2_resolve_duphits(0, 0, b[0], 0);
bsw2_resolve_query_overlaps(b[0], opt.mask_level);
} else b[1] = 0;
// generate CIGAR and print SAM
buf[x] = bsw2_dup_no_cigar(b[0]);
// free
free(seq[0]);
bsw2_destroy(b[0]);
}
// NOTE(review): opt holds the values computed for the last read of the first loop
// (and is never written when _seq->n is 0) - confirm bsw2_pair() only needs read-independent fields
if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf);
for (x = 0; x < _seq->n; ++x) {
bsw2seq1_t *p = _seq->seq + x;
uint8_t *seq[2];
int i;
// re-encode the read (forward and reverse complement) for write_aux()
seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l;
for (i = 0; i < p->l; ++i) {
int c = nst_nt4_table[(int)p->seq[i]];
if (c >= 4) c = (int)(drand48() * 4);
seq[0][i] = c;
seq[1][p->l-1-i] = 3 - c;
}
update_opt(&opt, _opt, p->l);
write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name);
free(seq[0]);
}
for (x = 0; x < _seq->n; ++x) {
// buf[x^1] is the neighbouring entry, used as the mate in paired-end mode
if (is_pe) update_mate_aux(buf[x], buf[x^1]);
print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]);
}
for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]);
free(buf);
bsw2_global_destroy(pool);
}
#ifdef HAVE_PTHREAD
/* Arguments handed to one worker thread; the fields mirror the parameters of
 * bsw2_aln_core(), plus the thread index tid set by process_seqs(). */
typedef struct {
int tid, is_pe;
bsw2seq_t *_seq;
const bsw2opt_t *_opt;
const bntseq_t *bns;
uint8_t *pac;
const bwt_t *target;
} thread_aux_t;
/* Thread entry point for pthread_create(): unpack the thread_aux_t passed in
 * data and forward its fields to bsw2_aln_core(). Always returns 0. */
static void *worker(void *data)
{
	const thread_aux_t *job = (const thread_aux_t*)data;

	bsw2_aln_core(job->_seq, job->_opt, job->bns, job->pac, job->target, job->is_pe);
	return 0;
}
#endif
/* process sequences stored in _seq, generate SAM lines for these
 * sequences and reset _seq afterwards.
 *
 * With HAVE_PTHREAD and opt->n_threads > 1 the reads are dealt round-robin
 * to per-thread batches (both ends of a pair go to the same thread), aligned
 * in parallel by worker(), and copied back in the original order. The SAM
 * text of every read is then written with err_printf() and all per-read
 * buffers are released; on return _seq->n is 0. */
static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
{
	int i;
	is_pe = is_pe? 1 : 0; // normalised because it is used as a shift count below
#ifdef HAVE_PTHREAD
	if (opt->n_threads <= 1) {
		bsw2_aln_core(_seq, opt, bns, pac, target, is_pe);
	} else {
		pthread_t *tid;
		pthread_attr_t attr;
		thread_aux_t *data;
		int j;
		pthread_attr_init(&attr);
		pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
		data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
		tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
		for (j = 0; j < opt->n_threads; ++j) {
			thread_aux_t *p = data + j;
			p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe;
			p->pac = pac; p->target = target;
			p->_seq = calloc(1, sizeof(bsw2seq_t));
			p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1;
			p->_seq->n = 0;
			p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t));
		}
		for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread
			bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq;
			p->seq[p->n++] = _seq->seq[i];
		}
		for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]);
		// the attribute object is no longer needed once the threads exist
		pthread_attr_destroy(&attr);
		for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
		// rewind the per-thread counters and replay the same assignment order
		for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0;
		for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back
			bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq;
			_seq->seq[i] = p->seq[p->n++];
		}
		for (j = 0; j < opt->n_threads; ++j) {
			thread_aux_t *p = data + j;
			free(p->_seq->seq);
			free(p->_seq);
		}
		free(data); free(tid);
	}
#else
	bsw2_aln_core(_seq, opt, bns, pac, target, is_pe);
#endif
	// print and reset
	for (i = 0; i < _seq->n; ++i) {
		bsw2seq1_t *p = _seq->seq + i;
		if (p->sam) err_printf("%s", p->sam);
		free(p->name); free(p->seq); free(p->qual); free(p->sam);
		p->tid = -1; p->l = 0;
		p->name = p->seq = p->qual = p->sam = 0;
	}
	err_fflush(stdout);
	_seq->n = 0;
}
/* Top-level driver: print the @SQ header lines, load the packed reference
 * from bns->fp_pac, then read the query file(s) chunk by chunk and pass each
 * chunk to process_seqs(). fn is the first read file; a non-NULL fn2 is
 * opened as a second file and switches on paired-end mode. */
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
{
gzFile fp, fp2;
kseq_t *ks, *ks2;
int l, is_pe = 0, i, n;
uint8_t *pac;
bsw2seq_t *_seq;
bseq1_t *bseq;
pac = calloc(bns->l_pac/4+1, 1);
for (l = 0; l < bns->n_seqs; ++l)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
fp = xzopen(fn, "r");
ks = kseq_init(fp);
_seq = calloc(1, sizeof(bsw2seq_t));
if (fn2) {
fp2 = xzopen(fn2, "r");
ks2 = kseq_init(fp2);
is_pe = 1;
} else fp2 = 0, ks2 = 0, is_pe = 0;
// each chunk is requested with a size of chunk_size * n_threads
while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
int size = 0;
if (n > _seq->max) { // grow the batch array to hold n reads
_seq->max = n;
kroundup32(_seq->max);
_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
}
_seq->n = n;
for (i = 0; i < n; ++i) {
bseq1_t *b = &bseq[i];
bsw2seq1_t *p = &_seq->seq[i];
p->tid = -1; p->l = b->l_seq;
// the string pointers are taken over as-is; only the bseq array itself is freed below
p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
size += p->l;
}
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
free(bseq);
process_seqs(_seq, opt, bns, pac, target, is_pe);
}
// free
free(pac);
free(_seq->seq); free(_seq);
kseq_destroy(ks);
err_gzclose(fp);
if (fn2) {
kseq_destroy(ks2);
err_gzclose(fp2);
}
}

112
bwtsw2_chain.c 100644
View File

@ -0,0 +1,112 @@
#include <stdio.h>
#include "bwtsw2.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* One seed, or one chain of seeds, on the query and on the target.
 *   tbeg,tend  target interval (filled from hit k and k+len)
 *   qbeg,qend  query interval
 *   flag       which of the two input hit lists the seed came from (0 or 1)
 *   idx        for a seed: its index in that hit list; for a chain: the chain id
 *   chain      for a seed: id of the chain it was put in; for a chain: number of seeds
 * NOTE(review): tbeg/tend are 32-bit while hit coordinates are bwtint_t - confirm no truncation. */
typedef struct {
uint32_t tbeg, tend;
int qbeg, qend;
uint32_t flag:1, idx:31;
int chain; // also reuse as a counter
} hsaip_t;
// order records by their query start; instantiates ks_introsort(hsaip, ...)
#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg)
#include "ksort.h"
KSORT_INIT(hsaip, hsaip_t, _hsaip_lt)
/* Group the n seeds in z[] into chains written to chain[].
 * z[] is first sorted by query start. Each seed is compared with the existing
 * chains from the newest to the oldest and joins the first one that lies
 * before it on the target, within max_chain_gap on both axes and within the
 * band width bw of the same diagonal; otherwise it opens a new chain.
 * Chain ids stored in z[].chain and chain[].idx are offset by shift.
 * Returns the number of chains created. */
static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
{
int j, k, m = 0;
ks_introsort(hsaip, n, z);
for (j = 0; j < n; ++j) {
hsaip_t *p = z + j;
for (k = m - 1; k >= 0; --k) {
hsaip_t *q = chain + k;
int x = p->qbeg - q->qbeg; // always positive
int y = p->tbeg - q->tbeg;
if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained
// extend the chain to cover the seed and count it
if (p->qend > q->qend) q->qend = p->qend;
if (p->tend > q->tend) q->tend = p->tend;
++q->chain;
p->chain = shift + k;
break;
// setting k to 0 makes the loop end with k == -1, so the seed starts a new chain
} else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains
}
if (k < 0) { // not added to any previous chains
chain[m] = *p;
chain[m].chain = 1;
chain[m].idx = p->chain = shift + m;
++m;
}
}
return m;
}
/* Chain the seeds of b[0] and b[1] separately, discard weak chains that are
 * covered on the query by a much stronger chain, and remove the seeds of the
 * discarded chains from b[0] and b[1] in place. len is the query length, used
 * to mirror the query coordinates of the chains built from b[1]. */
void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
{
hsaip_t *z[2], *chain[2];
int i, j, k, n[2], m[2], thres = opt->t_seeds * 2;
char *flag;
// initialization
// z[0] and z[1] share one allocation, as do chain[0] and chain[1]
n[0] = b[0]->n; n[1] = b[1]->n;
z[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
z[1] = z[0] + n[0];
chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
for (k = j = 0; k < 2; ++k) {
for (i = 0; i < b[k]->n; ++i) {
bsw2hit_t *p = b[k]->hits + i;
hsaip_t *q = z[k] + i;
q->flag = k; q->idx = i;
q->tbeg = p->k; q->tend = p->k + p->len;
q->chain = -1;
q->qbeg = p->beg; q->qend = p->end;
}
}
// chaining
m[0] = chaining(opt, 0, n[0], z[0], chain[0]);
chain[1] = chain[0] + m[0];
m[1] = chaining(opt, m[0], n[1], z[1], chain[1]);
// change query coordinate on the reverse strand
for (k = 0; k < m[1]; ++k) {
hsaip_t *p = chain[1] + k;
int tmp = p->qbeg;
p->qbeg = len - p->qend; p->qend = len - tmp;
}
//for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend);
// filtering
// flag[] is indexed by chain id; 1 means the chain is dropped
flag = calloc(m[0] + m[1], 1);
ks_introsort(hsaip, m[0] + m[1], chain[0]);
for (k = 1; k < m[0] + m[1]; ++k) {
hsaip_t *p = chain[0] + k;
for (j = 0; j < k; ++j) {
hsaip_t *q = chain[0] + j;
if (flag[q->idx]) continue;
// drop p when an earlier kept chain q ends at or after p on the query,
// has more than thres times as many seeds, and p itself has fewer than thres
if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) {
flag[p->idx] = 1;
break;
}
}
}
// zero the score of every seed that belongs to a dropped chain
for (k = 0; k < n[0] + n[1]; ++k) {
hsaip_t *p = z[0] + k;
if (flag[p->chain])
b[p->flag]->hits[p->idx].G = 0;
}
free(flag);
// squeeze out filtered elements in b[2]
for (k = 0; k < 2; ++k) {
for (j = i = 0; j < n[k]; ++j) {
bsw2hit_t *p = b[k]->hits + j;
if (p->G) {
if (i != j) b[k]->hits[i++] = *p;
else ++i;
}
}
b[k]->n = i;
}
// free
free(z[0]); free(chain[0]);
}

619
bwtsw2_core.c 100644
View File

@ -0,0 +1,619 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <sys/resource.h>
#include <assert.h>
#include "bwt_lite.h"
#include "bwtsw2.h"
#include "bwt.h"
#include "kvec.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* An SA interval [k,l], used as the key of the "qintv" hash. */
typedef struct {
bwtint_t k, l;
} qintv_t;
#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l)
#define qintv_hash(a) ((a).k>>7^(a).l<<17)
#include "khash.h"
// "qintv": interval -> 64-bit value; remove_duplicate() stores (cell index)<<32 | score in it
KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq)
// "64": 64-bit key -> 64-bit value; bsw2_connectivity() stores k<<32|l -> counter in it
KHASH_MAP_INIT_INT64(64, uint64_t)
// score assigned to states that cannot be reached
#define MINUS_INF -0x3fffffff
// overlap fraction above which bsw2_resolve_duphits() treats two hits as duplicates
#define MASK_LEVEL 0.90f
// forward declarations: stack_destroy() calls mp_destroy() before the pool type is defined
struct __mempool_t;
static void mp_destroy(struct __mempool_t*);
/* One dynamic-programming cell stored in a bsw2entry_t.
 *   qk,ql     SA interval on the query index; ql == 0 marks a deleted cell
 *   I,D,G     scores updated by fill_cell(); G is the best of the three states
 *   pj        slot of this cell in its parent's cpos[] (0..3)
 *   qlen,tlen lengths on the query side and on the target side
 *   ppos      index of the parent cell in the same array, negative if none
 *   upos      index of the matching cell in the entry being built, or -1
 *   cpos[4]   indices of the child cells; negative values are status codes
 *             (-1, -2, -3 and -5 are written in this file) */
typedef struct {
bwtint_t qk, ql;
int I, D, G;
uint32_t pj:2, qlen:30;
int tlen;
int ppos, upos;
int cpos[4];
} bsw2cell_t;
#include "ksort.h"
// generic int sorting helpers; cut_tail() and bsw2_core() use ks_ksmall/ks_heapadjust on int
KSORT_INIT_GENERIC(int)
// hit ordering for ks_introsort(hitG, ...): larger G + 4*n_seeds comes first
#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2))
KSORT_INIT(hitG, bsw2hit_t, __hitG_lt)
// template for a fresh cell: empty interval, all scores MINUS_INF, no parent, no children
static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} };
/* A growable array of cells attached to one interval [tk,tl] of the
 * bwtl_t index that is traversed; n cells are in use out of max allocated. */
typedef struct {
int n, max;
uint32_t tk, tl; // this is fine
bsw2cell_t *array;
} bsw2entry_t, *bsw2entry_p;
/* --- BEGIN: Stack operations --- */
/* Work stack of entries waiting to be processed, plus the pool that
 * recycles entries. stack0 is the stack that is pushed to and popped from;
 * pending and n_pending are only inspected, never filled, in the code shown here. */
typedef struct {
int n_pending;
kvec_t(bsw2entry_p) stack0, pending;
struct __mempool_t *pool;
} bsw2stack_t;
// true when nothing is left on stack0 and nothing is pending
#define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0)
/* Release a stack: its memory pool, both entry vectors and the stack object. */
static void stack_destroy(bsw2stack_t *s)
{
	mp_destroy(s->pool);
	kv_destroy(s->stack0);
	kv_destroy(s->pending);
	free(s);
}
/* Push entry e onto the main stack (stack0) of s. */
static inline void stack_push0(bsw2stack_t *s, bsw2entry_p e)
{
	kv_push(bsw2entry_p, s->stack0, e);
}
/* Pop and return the top entry of stack0. Asserts that stack0 is not empty
 * while entries are still counted as pending. */
static inline bsw2entry_p stack_pop(bsw2stack_t *s)
{
	bsw2entry_p top;

	assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0));
	top = kv_pop(s->stack0);
	return top;
}
/* --- END: Stack operations --- */
/* --- BEGIN: memory pool --- */
/* Free list of bsw2entry_t objects. pool holds the entries available for
 * reuse; cnt is incremented by mp_alloc() and decremented by mp_free(). */
typedef struct __mempool_t {
int cnt; // if cnt!=0, then there must be memory leak
kvec_t(bsw2entry_p) pool;
} mempool_t;
/* Hand out an entry: reuse one from the free list when available, otherwise
 * allocate a zeroed one. The outstanding-entry counter is incremented. */
static inline bsw2entry_p mp_alloc(mempool_t *mp)
{
	++mp->cnt;
	if (kv_size(mp->pool) != 0)
		return kv_pop(mp->pool);
	return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t));
}
/* Return entry e to the free list. Its cell count is reset to 0 but its
 * array stays allocated for reuse; the outstanding-entry counter is decremented. */
static inline void mp_free(mempool_t *mp, bsw2entry_p e)
{
	e->n = 0;
	--mp->cnt;
	kv_push(bsw2entry_p, mp->pool, e);
}
static void mp_destroy(struct __mempool_t *mp)
{
int i;
for (i = 0; i != kv_size(mp->pool); ++i) {
free(kv_A(mp->pool, i)->array);
free(kv_A(mp->pool, i));
}
kv_destroy(mp->pool);
free(mp);
}
/* --- END: memory pool --- */
/* --- BEGIN: utilities --- */
/* Walk every interval reachable in the index b, starting from the value
 * b->seq_len (i.e. k = 0, l = seq_len), and count for each interval k<<32|l
 * how many times it is reached. Returns a newly created hash keyed by
 * k<<32|l with that count as value; the caller owns the hash. */
static khash_t(64) *bsw2_connectivity(const bwtl_t *b)
{
khash_t(64) *h;
uint32_t k, l, cntk[4], cntl[4]; // this is fine
uint64_t x;
khiter_t iter;
int j, ret;
kvec_t(uint64_t) stack;
kv_init(stack);
h = kh_init(64);
kh_resize(64, h, b->seq_len * 4);
x = b->seq_len;
kv_push(uint64_t, stack, x);
while (kv_size(stack)) {
x = kv_pop(stack);
k = x>>32; l = (uint32_t)x; // unpack the interval
bwtl_2occ4(b, k-1, l, cntk, cntl);
for (j = 0; j != 4; ++j) { // one candidate child per base
k = b->L2[j] + cntk[j] + 1;
l = b->L2[j] + cntl[j];
if (k > l) continue; // empty interval
x = (uint64_t)k << 32 | l;
iter = kh_put(64, h, x, &ret);
if (ret) { // if not present
kh_value(h, iter) = 1;
kv_push(uint64_t, stack, x); // expand each interval only once
} else ++kh_value(h, iter);
}
}
kv_destroy(stack);
//fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h));
return h;
}
// pick up top T matches at a node
// Cells of u scoring below a cutoff are deleted (qk = ql = G = 0) and unlinked
// from their parent. The cutoff comes from ks_ksmall() on the negated positive
// scores of the live cells; among cells equal to the cutoff only the first
// T-1 are kept. aux only lends its array as scratch space.
static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux)
{
int i, *a, n, x;
if (u->n <= T) return;
if (aux->max < u->n) {
aux->max = u->n;
aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t));
}
a = (int*)aux->array; // reuse the cell buffer as an int array
for (i = n = 0; i != u->n; ++i)
if (u->array[i].ql && u->array[i].G > 0)
a[n++] = -u->array[i].G;
if (n <= T) return;
x = -ks_ksmall(int, n, a, T); // cutoff score
n = 0; // from here on n counts the cells whose score equals the cutoff
for (i = 0; i < u->n; ++i) {
bsw2cell_t *p = u->array + i;
if (p->G == x) ++n;
if (p->G < x || (p->G == x && n >= T)) {
p->qk = p->ql = 0; p->G = 0;
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1;
}
}
}
// remove duplicated cells
// Two live cells of u are duplicates when they share the same interval (qk,ql).
// Of each such group only one cell survives: the one with the highest G, or the
// earliest one on ties. hash is scratch space and is cleared on entry; its
// values pack (cell index)<<32 | G of the best cell seen so far.
static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash)
{
int i, ret, j;
khiter_t k;
qintv_t key;
kh_clear(qintv, hash);
for (i = 0; i != u->n; ++i) {
bsw2cell_t *p = u->array + i;
if (p->ql == 0) continue; // already deleted
key.k = p->qk; key.l = p->ql;
k = kh_put(qintv, hash, key, &ret);
j = -1; // index of the cell to delete, if any
if (ret == 0) { // interval seen before
if ((uint32_t)kh_value(hash, k) >= p->G) j = i; // the stored cell is at least as good
else { // the current cell is better: delete the stored one and remember this one
j = kh_value(hash, k)>>32;
kh_value(hash, k) = (uint64_t)i<<32 | p->G;
}
} else kh_value(hash, k) = (uint64_t)i<<32 | p->G;
if (j >= 0) {
p = u->array + j;
p->qk = p->ql = 0; p->G = 0;
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
}
}
}
// merge two entries
// The cells of v are appended to u. Because they move to positions u->n and
// up, every non-negative parent/child index stored in v's cells is shifted by
// u->n first (v's cells are modified in place). opt and b are not used.
static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b)
{
	const int shift = u->n;
	int idx, c;

	if (u->n + v->n >= u->max) {
		u->max = u->n + v->n;
		u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t));
	}
	for (idx = 0; idx != v->n; ++idx) {
		bsw2cell_t *cell = &v->array[idx];
		if (cell->ppos >= 0) cell->ppos += shift;
		for (c = 0; c < 4; ++c)
			if (cell->cpos[c] >= 0) cell->cpos[c] += shift;
	}
	memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t));
	u->n += v->n;
}
/* Make sure e has room for one more cell (capacity starts at 256 and doubles)
 * and return a pointer to the next free slot. e->n is NOT advanced here; the
 * caller increments it once the slot is really used. */
static inline bsw2cell_t *push_array_p(bsw2entry_t *e)
{
	if (e->n == e->max) {
		if (e->max) e->max <<= 1;
		else e->max = 256;
		e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max);
	}
	return &e->array[e->n];
}
/* CPU time in seconds (user plus system) consumed between two getrusage()
 * snapshots, last being the earlier one. */
static inline double time_elapse(const struct rusage *curr, const struct rusage *last)
{
	const long user_sec = curr->ru_utime.tv_sec - last->ru_utime.tv_sec;
	const long sys_sec = curr->ru_stime.tv_sec - last->ru_stime.tv_sec;
	const long user_usec = curr->ru_utime.tv_usec - last->ru_utime.tv_usec;
	const long sys_usec = curr->ru_stime.tv_usec - last->ru_stime.tv_usec;
	const long sec = user_sec + sys_sec;
	const long usec = user_usec + sys_usec;

	return (double)sec + usec * 1e-6;
}
/* --- END: utilities --- */
/* --- BEGIN: processing partial hits --- */
/* Record the cells of u that score at least thres. hits[] keeps two slots per
 * start position beg taken from bwt->sa[]: hits[2*beg] holds the best-scoring
 * hit starting there and hits[2*beg+1] the second best. Every suffix-array
 * row in [u->tk, u->tl] contributes one start position. */
static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u)
{
int i;
uint32_t k; // this is fine
for (i = 0; i < u->n; ++i) {
bsw2cell_t *p = u->array + i;
if (p->G < thres) continue;
for (k = u->tk; k <= u->tl; ++k) {
int beg, end;
bsw2hit_t *q = 0;
beg = bwt->sa[k]; end = beg + p->tlen;
if (p->G > hits[beg*2].G) { // new best: the old best becomes second best
hits[beg*2+1] = hits[beg*2];
q = hits + beg * 2;
} else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1;
if (q) {
q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G;
// G2 is 0 when the interval holds a single row, otherwise equal to G
q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G;
q->flag = q->n_seeds = 0;
}
}
}
}
/* "narrow hits" are node-to-node hits that have a high score and
 * are not so repetitive (|SA interval|<=IS).
 *
 * Every cell of u with score >= t and an interval of at most IS rows is
 * appended to b1 as a hit and then deleted from u (interval and score set to
 * 0, parent link marked -3). The hit's query start is read from
 * bwtl->sa[u->tk]. */
static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS)
{
	int i;
	for (i = 0; i < u->n; ++i) {
		bsw2hit_t *q;
		bsw2cell_t *p = u->array + i;
		if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit
			if (b1->max == b1->n) { // grow geometrically, starting at 4 slots
				b1->max = b1->max? b1->max<<1 : 4;
				b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t));
			}
			q = &b1->hits[b1->n++];
			q->k = p->qk; q->l = p->ql;
			q->len = p->qlen;
			q->G = p->G; q->G2 = 0;
			q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen;
			// The slot comes from realloc() and is not zeroed. n_seeds is part of
			// the hitG sort key, so clear it together with flag, as save_hits() does.
			q->flag = q->n_seeds = 0;
			// delete p
			p->qk = p->ql = 0; p->G = 0;
			if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
		}
	}
}
/* after this, "narrow SA hits" will be expanded and the coordinates
 * will be obtained and stored in b->hits[*].k.
 *
 * Step 1 (only when both bns and bwt are given): every hit whose interval has
 * at most IS rows is expanded into one hit per row; a wider hit with positive
 * score is kept once, located through its first row and flagged with bit 1.
 * In both cases k becomes a coordinate obtained through bwt_sa() and
 * bns_depos(), l is set to 0 and is_rev is filled in.
 * Step 2 (always): hits with G == 0 are dropped, the rest are sorted with the
 * hitG order, and a hit is removed when it overlaps a better-ranked hit on the
 * same strand by more than MASK_LEVEL on both the query and the target.
 * Returns the number of hits left in b. */
int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS)
{
int i, j, n, is_rev;
if (b->n == 0) return 0;
if (bwt && bns) { // convert to chromosomal coordinates if requested
int old_n = b->n;
bsw2hit_t *old_hits = b->hits;
for (i = n = 0; i < b->n; ++i) { // compute the memory to allocate
bsw2hit_t *p = old_hits + i;
if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1;
else if (p->G > 0) ++n;
}
b->n = b->max = n;
b->hits = calloc(b->max, sizeof(bsw2hit_t));
for (i = j = 0; i < old_n; ++i) {
bsw2hit_t *p = old_hits + i;
if (p->l - p->k + 1 <= IS) { // the hit is not so repetitive
bwtint_t k;
if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; // empty slot
for (k = p->k; k <= p->l; ++k) { // one output hit per row of the interval
b->hits[j] = *p;
b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev);
b->hits[j].l = 0;
b->hits[j].is_rev = is_rev;
if (is_rev) b->hits[j].k -= p->len - 1;
++j;
}
} else if (p->G > 0) { // repetitive hit: keep only the first row and mark it
b->hits[j] = *p;
b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev);
b->hits[j].l = 0;
b->hits[j].flag |= 1;
b->hits[j].is_rev = is_rev;
if (is_rev) b->hits[j].k -= p->len - 1;
++j;
}
}
free(old_hits);
}
for (i = j = 0; i < b->n; ++i) // squeeze out empty elements
if (b->hits[i].G) b->hits[j++] = b->hits[i];
b->n = j;
ks_introsort(hitG, b->n, b->hits);
for (i = 1; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
for (j = 0; j < i; ++j) {
bsw2hit_t *q = b->hits + j;
int compatible = 1;
if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates
if (p->l == 0 && q->l == 0) {
int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap
if (qol < 0) qol = 0;
if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) {
int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
- (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap
if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL)
compatible = 0;
}
}
if (!compatible) { // p duplicates q: drop p and let q inherit the larger G2
p->G = 0;
if (q->G2 < p->G2) q->G2 = p->G2;
break;
}
}
}
n = i;
for (i = j = 0; i < n; ++i) { // compact the surviving hits
if (b->hits[i].G == 0) continue;
if (i != j) b->hits[j++] = b->hits[i];
else ++j;
}
b->n = j;
return b->n;
}
/* Remove hits that overlap a better hit on the query.
 * The hits are sorted with the hitG order; one of the leading hits that share
 * the top score G is moved to the front at random. Each later hit is then
 * compared with all earlier surviving hits and dropped when it is
 * incompatible with any of them; the earlier hit records the dropped score in
 * G2 when it is larger. Returns the number of hits left in b. */
int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level)
{
int i, j, n;
if (b->n == 0) return 0;
ks_introsort(hitG, b->n, b->hits);
{ // choose a random one
int G0 = b->hits[0].G;
for (i = 1; i < b->n; ++i)
if (b->hits[i].G != G0) break;
j = (int)(i * drand48());
if (j) {
bsw2hit_t tmp;
tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp;
}
}
for (i = 1; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
int all_compatible = 1;
if (p->G == 0) break;
for (j = 0; j < i; ++j) {
bsw2hit_t *q = b->hits + j;
int64_t tol = 0;
int qol, compatible = 0;
float fol;
if (q->G == 0) continue; // q was dropped earlier
qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // query overlap
if (qol < 0) qol = 0;
if (p->l == 0 && q->l == 0) { // target overlap, only when both hits carry coordinates
tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
- (p->k > q->k? p->k : q->k);
if (tol < 0) tol = 0;
}
// fol: query overlap as a fraction of the shorter of the two query intervals
fol = (float)qol / (p->end - p->beg < q->end - q->beg? p->end - p->beg : q->end - q->beg);
// compatible when the overlap is small, or when the hits also overlap on the
// target and neither query interval is fully contained in the other
if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1;
if (!compatible) {
if (q->G2 < p->G) q->G2 = p->G;
all_compatible = 0;
}
}
if (!all_compatible) p->G = 0;
}
n = i; // hits from position n on (reached only after a G == 0 break) are discarded
for (i = j = 0; i < n; ++i) {
if (b->hits[i].G == 0) continue;
if (i != j) b->hits[j++] = b->hits[i];
else ++j;
}
b->n = j;
return j;
}
/* --- END: processing partial hits --- */
/* --- BEGIN: global mem pool --- */
/* Create the per-thread working state: a zeroed bsw2global_t whose stack
 * member points to a fresh, empty bsw2stack_t with its own entry pool.
 * Release it with bsw2_global_destroy(). */
bsw2global_t *bsw2_global_init()
{
	bsw2global_t *global = calloc(1, sizeof(bsw2global_t));
	bsw2stack_t *work = calloc(1, sizeof(bsw2stack_t));

	work->pool = (mempool_t*)calloc(1, sizeof(mempool_t));
	global->stack = (void*)work;
	return global;
}
/* Release everything owned by pool: its stack (including the entry pool),
 * the alignment working memory and the object itself. */
void bsw2_global_destroy(bsw2global_t *pool)
{
	bsw2stack_t *work = (bsw2stack_t*)pool->stack;

	stack_destroy(work);
	free(pool->aln_mem);
	free(pool);
}
/* --- END: global mem pool --- */
/* Compute the I, D and G scores of the current cell c[0] from its three
 * neighbours: c[1] feeds the insertion state, c[2] the deletion state and
 * c[3] the match state (score c[3]->G + match_score). A NULL neighbour makes
 * the corresponding state unreachable (MINUS_INF). Gap scores use o->q, o->r
 * and o->qr. Returns the new c[0]->G, the best of the three states. */
static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4])
{
	bsw2cell_t *cur = c[0];
	int best = MINUS_INF;

	if (c[3]) best = c[3]->G + match_score;
	if (!c[1]) cur->I = MINUS_INF;
	else {
		// extend an open gap or open a new one, whichever scores higher
		cur->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr;
		if (cur->I > best) best = cur->I;
	}
	if (!c[2]) cur->D = MINUS_INF;
	else {
		cur->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr;
		if (cur->D > best) best = cur->D;
	}
	cur->G = best;
	return best;
}
/* Seed the traversal: push onto s one entry that spans the whole target
 * interval [0, target->seq_len] and holds a single cell with score 0 whose
 * query interval is [0, query->seq_len]. */
static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s)
{
	bsw2entry_t *root = mp_alloc(s->pool);
	bsw2cell_t *cell;

	root->tk = 0;
	root->tl = target->seq_len;
	cell = push_array_p(root);
	*cell = g_default_cell;
	cell->qk = 0;
	cell->ql = query->seq_len;
	cell->G = 0;
	root->n++;
	stack_push0(s, root);
}
/*
 * Core search: traverse the DAG of `target` and, for every target node, extend
 * the cells of the `query` prefix trie with the gapped scoring done by
 * fill_cell().
 *
 * bns    - passed through to bsw2_resolve_duphits()
 * opt    - scoring and heuristic options (a, b, q, r, qr, bw, z, t, is are read here)
 * target - index of the target, walked with bwtl_2occ4()
 * query  - index of the query, walked with bwt_2occ4()
 * pool   - provides the bsw2stack_t (and its memory pool) used by the traversal
 *
 * Returns a calloc'ed array of two pointers. ret[0] is filled by save_hits();
 * ret[1] is filled by save_narrow_hits() (original note: "not-so-repetitive
 * hits (narrow SA hits)"). Both go through bsw2_resolve_duphits() before the
 * function returns. NOTE(review): the caller presumably owns and frees the
 * returned structures -- confirm against the call sites.
 */
bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool)
{
bsw2stack_t *stack = (bsw2stack_t*)pool->stack;
bwtsw2_t *b, *b1, **b_ret;
int i, j, score_mat[16], *heap, heap_size, n_tot = 0;
struct rusage curr, last; // only consumed by the commented-out stats line near the end
khash_t(qintv) *rhash;
khash_t(64) *chash;
// initialize connectivity hash (chash)
// key: (k<<32 | l) of a target interval; value: low 32 bits = visit counter that is
// decremented below, high 32 bits = 1-based slot in stack->pending (0 = not pending)
chash = bsw2_connectivity(target);
// calculate score matrix
// 4x4 table indexed by (i<<2|j): +a on the diagonal, -b elsewhere
for (i = 0; i != 4; ++i)
for (j = 0; j != 4; ++j)
score_mat[i<<2|j] = (i == j)? opt->a : -opt->b;
// initialize other variables
rhash = kh_init(qintv);
init_bwtsw2(target, query, stack);
// heap holds opt->z ints; scores are stored negated (see heap[0] = -x->G below)
heap_size = opt->z;
heap = calloc(heap_size, sizeof(int));
// initialize the return struct
b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
// two hit slots per target position are pre-allocated and counted as used
b->n = b->max = target->seq_len * 2;
b->hits = calloc(b->max, sizeof(bsw2hit_t));
b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
b_ret = calloc(2, sizeof(void*));
b_ret[0] = b; b_ret[1] = b1;
// initialize timer
getrusage(0, &last);
// the main loop: traversal of the DAG
while (!stack_isempty(stack)) {
int old_n, tj;
bsw2entry_t *v;
uint32_t tcntk[4], tcntl[4];
bwtint_t k, l;
// old_n remembers how many cells v had before this iteration appends children to it
v = stack_pop(stack); old_n = v->n;
n_tot += v->n;
for (i = 0; i < v->n; ++i) { // test max depth and band width
bsw2cell_t *p = v->array + i;
if (p->ql == 0) continue; // already deleted
if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) {
// outside the band: delete the cell and mark its slot in the parent with -5
p->qk = p->ql = 0;
if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5;
}
}
// get Occ for the DAG
bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl);
for (tj = 0; tj != 4; ++tj) { // descend to the children
bwtint_t qcntk[4], qcntl[4];
int qj, *curr_score_mat = score_mat + tj * 4; // row of score_mat for target base tj
khiter_t iter;
bsw2entry_t *u;
// target interval of the child reached with base tj; empty when k > l
k = target->L2[tj] + tcntk[tj] + 1;
l = target->L2[tj] + tcntl[tj];
if (k > l) continue;
// update counter
// NOTE(review): iter is not compared against kh_end(chash); this assumes every
// non-empty interval was registered by bsw2_connectivity() -- confirm
iter = kh_get(64, chash, (uint64_t)k<<32 | l);
--kh_value(chash, iter);
// initialization
u = mp_alloc(stack->pool);
u->tk = k; u->tl = l;
memset(heap, 0, sizeof(int) * opt->z);
// loop through all the nodes in v
// v->n may grow inside this loop (cells appended by push_array_p(v) below)
for (i = 0; i < v->n; ++i) {
bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G
int is_added = 0;
if (p->ql == 0) continue; // deleted node
// x is a tentative cell in u; it is only kept when u->n is incremented below
c[0] = x = push_array_p(u);
x->G = MINUS_INF;
p->upos = x->upos = -1;
if (p->ppos >= 0) { // parent has been visited
c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0;
c[3] = v->array + p->ppos; c[2] = p;
if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x
x->ppos = v->array[p->ppos].upos; // the parent pos in u
p->upos = u->n++; // the current pos in u
if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u
is_added = 1;
}
} else {
// cell without a parent: only the D state can be extended (open or extend a gap)
x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr;
if (x->D > 0) {
x->G = x->D;
x->I = MINUS_INF; x->ppos = -1;
p->upos = u->n++;
is_added = 1;
}
}
if (is_added) { // x has been added to u->array. fill the remaining variables
x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1;
// replace the heap root when x scores higher than the (negated) value stored there
if (x->G > -heap[0]) {
heap[0] = -x->G;
ks_heapadjust(int, 0, heap_size, heap);
}
}
if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v
// -1 in cpos[] means "child not created yet"
if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) {
bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl);
for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie
if (p->cpos[qj] != -1) continue; // this node will be visited later
k = query->L2[qj] + qcntk[qj] + 1;
l = query->L2[qj] + qcntl[qj];
if (k > l) { p->cpos[qj] = -2; continue; } // -2: no such child in the query
x = push_array_p(v);
p = v->array + i; // p may not point to the correct position after realloc
x->G = x->I = x->D = MINUS_INF;
x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen;
x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
p->cpos[qj] = v->n++;
} // ~for(qj)
} // ~if(p->cpos[])
} // ~if
} // ~for(i)
if (u->n) save_hits(target, opt->t, b->hits, u);
{ // push u to the stack (or to the pending array)
uint32_t cnt, pos;
cnt = (uint32_t)kh_value(chash, iter); // remaining visit counter
pos = kh_value(chash, iter)>>32; // 1-based slot in stack->pending, 0 if none
if (pos) { // something in the pending array, then merge
bsw2entry_t *w = kv_A(stack->pending, pos-1);
if (u->n) {
if (w->n < u->n) { // swap
w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w;
}
merge_entry(opt, w, u, b);
}
if (cnt == 0) { // move from pending to stack0
remove_duplicate(w, rhash);
save_narrow_hits(target, w, b1, opt->t, opt->is);
cut_tail(w, opt->z, u); // u is passed as the third argument -- presumably scratch space, TODO confirm
stack_push0(stack, w);
kv_A(stack->pending, pos-1) = 0;
--stack->n_pending;
}
mp_free(stack->pool, u);
} else if (cnt) { // the first time
if (u->n) { // push to the pending queue
++stack->n_pending;
kv_push(bsw2entry_p, stack->pending, u);
// record the 1-based pending slot in the high 32 bits, keep the counter in the low 32 bits
kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt;
} else mp_free(stack->pool, u);
} else { // cnt == 0, then push to the stack
bsw2entry_t *w = mp_alloc(stack->pool); // temporary entry handed to cut_tail(), released right after
save_narrow_hits(target, u, b1, opt->t, opt->is);
cut_tail(u, opt->z, w);
mp_free(stack->pool, w);
stack_push0(stack, u);
}
}
} // ~for(tj)
mp_free(stack->pool, v);
} // while(top)
getrusage(0, &curr);
// clear the seed counters of every returned hit before duplicates are resolved
for (i = 0; i < 2; ++i)
for (j = 0; j < b_ret[i]->n; ++j)
b_ret[i]->hits[j].n_seeds = 0;
bsw2_resolve_duphits(bns, query, b, opt->is);
bsw2_resolve_duphits(bns, query, b1, opt->is);
//fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);
// free
free(heap);
kh_destroy(qintv, rhash);
kh_destroy(64, chash);
// empty both vectors of the pooled stack; their storage is kept
stack->pending.n = stack->stack0.n = 0;
return b_ret;
}

89
bwtsw2_main.c 100644
View File

@ -0,0 +1,89 @@
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "bwt.h"
#include "bwtsw2.h"
#include "utils.h"
#include "bwa.h"
/**
 * Entry point of the "bwasw" sub-command.
 *
 * Parses the command line into a bsw2opt_t, loads the BWT and BNS parts of the
 * index named by the first positional argument and runs bsw2_aln() on one or
 * two query files.
 *
 * @param argc  number of arguments
 * @param argv  argument vector (options followed by <target.prefix> <query.fa> [query2.fa])
 * @return 0 on success; 1 on an unrecognised option, on missing positional
 *         arguments (the usage text is printed to stderr) or when the index
 *         cannot be loaded.
 *
 * The option block is released on every exit path, not only on success.
 */
int bwa_bwtsw2(int argc, char *argv[])
{
	bsw2opt_t *opt;
	bwaidx_t *idx;
	int c, ret = 1;

	opt = bsw2_init_opt();
	srand48(11);
	while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) {
		switch (c) {
		case 'q': opt->q = atoi(optarg); break;
		case 'r': opt->r = atoi(optarg); break;
		case 'a': opt->a = atoi(optarg); break;
		case 'b': opt->b = atoi(optarg); break;
		case 'w': opt->bw = atoi(optarg); break;
		case 'T': opt->t = atoi(optarg); break;
		case 't': opt->n_threads = atoi(optarg); break;
		case 'z': opt->z = atoi(optarg); break;
		case 's': opt->is = atoi(optarg); break;
		case 'm': opt->mask_level = atof(optarg); break;
		case 'c': opt->coef = atof(optarg); break;
		case 'N': opt->t_seeds = atoi(optarg); break;
		case 'M': opt->multi_2nd = 1; break;
		case 'H': opt->hard_clip = 1; break;
		case 'f': xreopen(optarg, "w", stdout); break;
		case 'I': opt->max_ins = atoi(optarg); break;
		case 'S': opt->skip_sw = 1; break;
		case 'C': opt->cpy_cmt = 1; break;
		case 'G': opt->max_chain_gap = atoi(optarg); break;
		default: goto end; // unknown option (this also covers '-d', which has no handler)
		}
	}
	// combined cost of opening a gap and extending it by one
	opt->qr = opt->q + opt->r;

	if (optind + 2 > argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
		fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
		fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
		fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
		fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r);
		fprintf(stderr, " -w INT band width [%d]\n", opt->bw);
		fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level);
		fprintf(stderr, "\n");
		fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
		fprintf(stderr, " -f FILE file to output results to instead of stdout\n");
		fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n");
		fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n");
		fprintf(stderr, " -M mark multi-part alignments as secondary\n");
		fprintf(stderr, " -S skip Smith-Waterman read pairing\n");
		fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins);
		fprintf(stderr, "\n");
		fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t);
		fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
		fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
		fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is);
		fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds);
		fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap);
		fprintf(stderr, "\n");
		fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
		fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n");
		fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n");
		fprintf(stderr, " increase '-z' for better sensitivity.\n");
		fprintf(stderr, "\n");
		goto end;
	}

	// adjust opt for opt->a
	opt->t *= opt->a;
	opt->coef *= opt->a;

	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) goto end;
	bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
	bwa_idx_destroy(idx);
	ret = 0;

end:
	free(opt);
	return ret;
}

274
bwtsw2_pair.c 100644
View File

@ -0,0 +1,274 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "utils.h"
#include "bwt.h"
#include "bntseq.h"
#include "bwtsw2.h"
#include "kstring.h"
#include "ksw.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
#define MIN_RATIO 0.8
#define OUTLIER_BOUND 2.0
#define MAX_STDDEV 4.0
#define EXT_STDDEV 4.0
/* Insert-size statistics produced by bsw2_stat(). */
typedef struct {
int low, high, failed; // low/high: boundaries for proper pairs; failed: set to 1 when the distribution could not be inferred
double avg, std; // mean and standard deviation of the insert sizes that fall within the outlier boundaries
} bsw2pestat_t;
/**
 * Infer the insert-size distribution from pairs in which both mates have
 * exactly one hit.
 *
 * @param n        number of entries in buf; entries 2*i and 2*i+1 form a pair
 * @param buf      per-read hit sets; an entry may be NULL (the caller,
 *                 bsw2_pair(), treats either mate as possibly NULL)
 * @param msg      progress and diagnostic messages are appended here
 * @param max_ins  pairs whose insert size is >= max_ins are ignored
 * @return the statistics; `failed` is 1 when fewer than 8 usable pairs were
 *         found, when the lower boundary exceeds the upper one, or when no
 *         pair lies within the boundaries.
 *
 * Changes relative to the previous version: the second mate is NULL-checked
 * before it is dereferenced, the loop never reads buf[i+1] past the end of the
 * array, and the percentiles are read only after the "too few pairs" check, so
 * an empty input no longer reads isize[0].
 */
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
{
	int i, k, x, p25, p50, p75, tmp, max_len = 0;
	uint64_t *isize;
	bsw2pestat_t r;

	memset(&r, 0, sizeof(bsw2pestat_t));
	isize = calloc(n, 8);

	// collect the insert size of every usable pair; k counts them
	for (i = k = 0; i + 1 < n; i += 2) {
		bsw2hit_t *t[2];
		int l;
		if (buf[i] == 0 || buf[i+1] == 0) continue; // one end has no hit set at all
		if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
		t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0];
		if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough
		if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough
		// distance between the two start coordinates plus the length of the hit with the smaller one
		l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len;
		if (l >= max_ins) continue; // skip pairs with excessively large insert
		// track the longest aligned query span seen among the accepted hits
		max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg;
		max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
		isize[k++] = l;
	}

	// order the sizes so that percentiles can be read by index
	ks_introsort_64(k, isize);
	ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k);
	if (k < 8) {
		ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__);
		free(isize);
		r.failed = 1;
		return r;
	}
	p25 = isize[(int)(.25 * k + .499)];
	p50 = isize[(int)(.50 * k + .499)];
	p75 = isize[(int)(.75 * k + .499)];

	// first pass boundaries: quartiles widened by OUTLIER_BOUND inter-quartile ranges
	tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
	r.low = tmp > max_len? tmp : max_len;
	if (r.low < 1) r.low = 1;
	r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
	if (r.low > r.high) {
		ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__);
		free(isize);
		r.failed = 1;
		return r;
	}
	ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
	ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high);

	// mean over the pairs that lie within the first-pass boundaries
	for (i = x = 0, r.avg = 0; i < k; ++i)
		if (isize[i] >= r.low && isize[i] <= r.high)
			r.avg += isize[i], ++x;
	if (x == 0) {
		ksprintf(msg, "[%s] fail to infer the insert size distribution: no pairs within boundaries.\n", __func__);
		free(isize);
		r.failed = 1;
		return r;
	}
	r.avg /= x;

	// standard deviation over the same subset
	for (i = 0, r.std = 0; i < k; ++i)
		if (isize[i] >= r.low && isize[i] <= r.high)
			r.std += (isize[i] - r.avg) * (isize[i] - r.avg);
	r.std = sqrt(r.std / x);
	ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std);

	// final boundaries for proper pairs: 3 inter-quartile ranges, widened to MAX_STDDEV deviations
	tmp = (int)(p25 - 3. * (p75 - p25) + .499);
	r.low = tmp > max_len? tmp : max_len;
	if (r.low < 1) r.low = 1;
	r.high = (int)(p75 + 3. * (p75 - p25) + .499);
	if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
	// NOTE(review): the next assignment overrides both the "< 1" clamp and the
	// std.dev adjustment made just above, so r.low ends up as max(tmp, max_len).
	// Kept as-is to preserve existing results; confirm whether that is intended.
	r.low = tmp > max_len? tmp : max_len;
	if (r.high < r.avg + MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
	ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);

	free(isize);
	return r;
}
/*
 * Auxiliary alignment record.
 * NOTE(review): no function in this file references this type; the field
 * meanings below are presumed from their names -- confirm before relying on them.
 */
typedef struct {
int n_cigar, beg, end, len; // presumably: number of CIGAR operations, begin/end offsets and a length
int64_t pos; // presumably a coordinate
uint32_t *cigar; // presumably an array of n_cigar CIGAR operations
} pairaux_t;
extern unsigned char nst_nt4_table[256];
/**
 * Look for an alignment of the mate next to an existing hit.
 *
 * A window on the packed reference is derived from the hit `h` and the
 * insert-size statistics `st` (mean +/- EXT_STDDEV standard deviations); the
 * mate sequence is then aligned inside that window with ksw_align() and the
 * result is written to `*a`.
 *
 * @param opt     scoring options; q, r and t are read here
 * @param l_pac   upper limit applied to the window end
 * @param pac     2-bit packed reference, four bases per byte
 * @param st      insert-size statistics (avg and std are read)
 * @param h       the hit of the already placed read
 * @param l_mseq  length of the mate sequence
 * @param mseq    mate sequence as text; converted through nst_nt4_table
 * @param a       output hit. Before calling this routine, *a has been cleared
 *                with memset(0) and its flag has been set with 1<<6 or 1<<7.
 *                When the window is shorter than the mate, only n_seeds,
 *                is_rev and flag are updated and a->G stays 0.
 * @param g_mat   5x5 scoring matrix handed to ksw_align()
 *
 * Sequence bytes are converted to unsigned char before they index
 * nst_nt4_table, so a byte >= 0x80 on a platform where char is signed can no
 * longer produce a negative index.
 */
void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25])
{
	int64_t k, beg, end;
	uint8_t *seq, *ref;
	int i;

	// compute the region start and end
	a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW;
	if (h->is_rev == 0) {
		// hit on the forward strand: search from the hit onwards, mate on the reverse strand
		beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
		if (beg < h->k) beg = h->k;
		end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
		a->is_rev = 1; a->flag |= 16;
	} else {
		// hit on the reverse strand: search before the end of the hit, mate on the forward strand
		beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499);
		end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
		if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg);
		a->is_rev = 0;
	}
	if (beg < 1) beg = 1;
	if (end > l_pac) end = l_pac;
	if (end - beg < l_mseq) return; // window too short to hold the mate

	// generate the sequence: one buffer, query first, reference window right after it
	seq = malloc(l_mseq + (end - beg));
	ref = seq + l_mseq;
	for (k = beg; k < end; ++k)
		ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
	if (h->is_rev == 0) {
		for (i = 0; i < l_mseq; ++i) { // on the reverse strand
			int c = nst_nt4_table[(unsigned char)mseq[i]];
			seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; // reverse-complement; codes above 3 become 4
		}
	} else {
		for (i = 0; i < l_mseq; ++i) // on the forward strand
			seq[i] = nst_nt4_table[(unsigned char)mseq[i]];
	}

	{
		// opt->t is OR-ed into the ksw flag word; byte-sized scores are requested when
		// l_mseq * g_mat[0] stays below 250
		int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
		kswr_t aln;
		aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
		a->G = aln.score;
		a->G2 = aln.score2;
		// scores below the threshold are treated as "no alignment"
		if (a->G < opt->t) a->G = 0;
		if (a->G2 < opt->t) a->G2 = 0;
		if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
		a->k = beg + aln.tb;
		a->len = aln.te - aln.tb + 1;
		a->beg = aln.qb;
		a->end = aln.qe + 1;
	}
	// the query was aligned reverse-complemented: map its interval back to the original orientation
	if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
	free(seq);
}
/*
 * Pair up the hits of paired-end reads.
 *
 * opt   - options; a, b, max_ins and skip_sw are read here
 * l_pac - passed through to bsw2_pair1()
 * pac   - packed reference, passed through to bsw2_pair1()
 * n     - number of reads; reads 2*i and 2*i+1 form a pair
 * seq   - the reads (length and bases are passed to bsw2_pair1())
 * hits  - per-read hit sets, updated in place; an entry may be NULL
 *
 * The insert-size distribution is estimated first with bsw2_stat(). Then, for
 * each pair where at least one end has exactly one hit and no end has more
 * than one, the mate is searched for next to each single hit with
 * bsw2_pair1() (unless opt->skip_sw is set) and the outcome is used to rescue
 * an unmapped end, to replace a worse hit, or to move one end. The collected
 * messages and the #fixed/#rescued/#moved counters are written to stderr.
 */
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
{
extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
bsw2pestat_t pes;
int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
int8_t g_mat[25];
kstring_t msg;
memset(&msg, 0, sizeof(kstring_t));
pes = bsw2_stat(n, hits, &msg, opt->max_ins);
// build the 5x5 scoring matrix: +a on the diagonal of the first four columns, -b elsewhere, 0 in the fifth column
for (i = k = 0; i < 5; ++i) {
for (j = 0; j < 4; ++j)
g_mat[k++] = i == j? opt->a : -opt->b;
g_mat[k++] = 0;
}
for (i = 0; i < n; i += 2) {
// a[j] receives the mate-search result for read i+j
bsw2hit_t a[2];
memset(&a, 0, sizeof(bsw2hit_t) * 2);
a[0].flag = 1<<6; a[1].flag = 1<<7;
for (j = 0; j < 2; ++j) { // set the read1/2 flag
if (hits[i+j] == 0) continue;
for (k = 0; k < hits[i+j]->n; ++k) {
bsw2hit_t *p = &hits[i+j]->hits[k];
p->flag |= 1<<(6+j);
}
}
if (pes.failed) continue; // no usable insert-size statistics: nothing beyond the read1/2 flags is done
if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit
if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit
// from here on the hit counts are (1,0), (0,1) or (1,1)
if (!opt->skip_sw) {
// the single hit of one end anchors the search for the other end
if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat);
if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat);
} // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0
// the following enumerate all possibilities. It is tedious but necessary...
if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not;
bwtsw2_t *p[2]; // p[0]: the mapped end; p[1]: the unmapped end
int which; // index (0 or 1) of the unmapped read within the pair
if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
else p[0] = hits[i+1], p[1] = hits[i], which = 0;
if (a[which].G == 0) continue; // the mate search found nothing
a[which].flag |= BSW2_FLAG_RESCUED;
if (p[1]->max == 0) { // make room for one hit
p[1]->max = 1;
p[1]->hits = malloc(sizeof(bsw2hit_t));
}
p[1]->hits[0] = a[which];
p[1]->n = 1;
// bit 2 is set on both ends -- presumably the "properly paired" flag, TODO confirm
p[0]->hits[0].flag |= 2;
p[1]->hits[0].flag |= 2;
++n_rescued;
} else { // then both ends mapped
int is_fixed = 0;
//fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score
bsw2hit_t *p = &hits[i+j]->hits[0];
if (p->G < a[j].G) { // the original mapping is suboptimal
a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM?
*p = a[j];
++n_fixed;
is_fixed = 1;
} else if (p->k != a[j].k && p->G2 < a[j].G) {
// a different position was found: its score becomes the suboptimal score if higher
p->G2 = a[j].G;
} else if (p->k == a[j].k && p->G2 < a[j].G2) {
// same position found again: take over the higher suboptimal score
p->G2 = a[j].G2;
}
}
if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
for (j = 0; j < 2; ++j)
hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM);
} else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
for (j = 0; j < 2; ++j) {
hits[i+j]->hits[0].flag |= 2;
if (hits[i+j]->hits[0].k != a[j].k)
hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
}
} else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end
if (a[0].G && a[1].G) { // now we have two "proper pairs"
int G[2];
double diff;
// G[0]: keep read i and move its mate; G[1]: keep read i+1 and move its mate
G[0] = hits[i]->hits[0].G + a[1].G;
G[1] = hits[i+1]->hits[0].G + a[0].G;
// score difference normalised by (a+b) and by half of the summed hit lengths
diff = fabs((double)(G[0] - G[1])) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
// when the two candidates differ clearly, zero the G of one of them so that a single candidate remains
if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
}
if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved
int which, isize;
double dev, diff;
if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0;
else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1;
// insert size the pair would have after the move
isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k;
// dev: distance from the mean insert size in standard deviations
dev = fabs(isize - pes.avg) / pes.std;
// diff: score lost by moving, normalised by (a+b) and the aligned query span, scaled by 100
diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
if (diff < dev * 2.) { // then move (heuristic)
a[which].G2 = a[which].G;
p[1][0] = a[which];
p[1]->flag |= BSW2_FLAG_MOVED | 2;
p[0]->flag |= 2;
++n_moved;
}
}
} else if (is_fixed) {
hits[i+0]->hits[0].flag |= 2;
hits[i+1]->hits[0].flag |= 2;
}
}
}
ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
fputs(msg.s, stderr);
free(msg.s);
}

30
code_of_conduct.md 100644
View File

@ -0,0 +1,30 @@
## Contributor Code of Conduct
As contributors and maintainers of this project, we pledge to respect all
people who contribute through reporting issues, posting feature requests,
updating documentation, submitting pull requests or patches, and other
activities.
We are committed to making participation in this project a harassment-free
experience for everyone, regardless of level of experience, gender, gender
identity and expression, sexual orientation, disability, personal appearance,
body size, race, age, or religion.
Examples of unacceptable behavior by participants include the use of sexual
language or imagery, derogatory comments or personal attacks, trolling, public
or private harassment, insults, or other unprofessional conduct.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct. Project maintainers or
contributors who do not follow the Code of Conduct may be removed from the
project team.
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by opening an issue or contacting the maintainer via email.
This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
1.0.0][v1].
[cc]: http://contributor-covenant.org/
[v1]: http://contributor-covenant.org/version/1/0/0/

60
example.c 100644
View File

@ -0,0 +1,60 @@
#include <stdio.h>
#include <zlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include "bwamem.h"
#include "kseq.h" // for the FASTA/Q parser
KSEQ_DECLARE(gzFile)
/*
 * Minimal BWA-MEM client: load the index named by argv[1], read sequences
 * from argv[2] ("-" selects stdin) and print one tab-separated line per
 * non-secondary alignment: read name, strand, reference name, position,
 * mapping quality, CIGAR and edit distance.
 */
int main(int argc, char *argv[])
{
	bwaidx_t *index;
	gzFile reads;
	kseq_t *parser;
	mem_opt_t *params;
	int from_stdin;

	if (argc < 3) {
		fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
		return 1;
	}

	// load the BWA index
	index = bwa_idx_load(argv[1], BWA_IDX_ALL);
	if (index == NULL) {
		fprintf(stderr, "Index load failed.\n");
		exit(EXIT_FAILURE);
	}

	// open the input, either a named file or the standard input
	from_stdin = (strcmp(argv[2], "-") == 0);
	if (from_stdin)
		reads = gzdopen(fileno(stdin), "r");
	else
		reads = gzopen(argv[2], "r");
	if (reads == NULL) {
		fprintf(stderr, "Couldn't open %s : %s\n",
				from_stdin ? "stdin" : argv[2],
				errno ? strerror(errno) : "Out of memory");
		exit(EXIT_FAILURE);
	}

	parser = kseq_init(reads); // FASTA/Q parser
	params = mem_opt_init();   // BWA-MEM parameters at their default values

	while (kseq_read(parser) >= 0) { // one sequence per iteration
		mem_alnreg_v regs;
		int r;

		// collect all the hits of this read
		regs = mem_align1(params, index->bwt, index->bns, index->pac, parser->seq.l, parser->seq.s);
		for (r = 0; r < regs.n; ++r) {
			mem_aln_t aln;
			int op;

			if (regs.a[r].secondary >= 0) continue; // secondary alignments are not reported
			// forward-strand position and CIGAR
			aln = mem_reg2aln(params, index->bns, index->pac, parser->seq.l, parser->seq.s, &regs.a[r]);
			printf("%s\t%c\t%s\t%ld\t%d\t", parser->name.s, "+-"[aln.is_rev], index->bns->anns[aln.rid].name, (long)aln.pos, aln.mapq);
			for (op = 0; op < aln.n_cigar; ++op)
				printf("%d%c", aln.cigar[op]>>4, "MIDSH"[aln.cigar[op]&0xf]);
			printf("\t%d\n", aln.NM); // edit distance
			free(aln.cigar); // the CIGAR is owned by the caller
		}
		free(regs.a); // release the hit list
	}

	free(params);
	kseq_destroy(parser);
	gzclose(reads);
	bwa_idx_destroy(index);
	return 0;
}

483
fastmap.c 100644
View File

@ -0,0 +1,483 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <zlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <ctype.h>
#include <math.h>
#include "bwa.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
#include "bntseq.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
void *kopen(const char *fn, int *_fd);
int kclose(void *a);
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
/* State shared by all steps of the pipeline driven by process(). */
typedef struct {
kseq_t *ks, *ks2; // input parsers handed to bseq_read()
mem_opt_t *opt; // alignment options handed to mem_process_seqs()
mem_pestat_t *pes0; // insert-size statistics handed to mem_process_seqs(); set by the -I option in main_mem(), otherwise left zero
int64_t n_processed; // number of sequences processed by step 1 so far
int copy_comment, actual_chunk_size; // copy_comment: when zero, comments are dropped right after reading; actual_chunk_size: first argument of bseq_read()
bwaidx_t *idx; // index whose bwt, bns and pac members are handed to mem_process_seqs()
} ktp_aux_t;
/* One batch of sequences travelling through the pipeline steps of process(). */
typedef struct {
ktp_aux_t *aux; // NOTE(review): not accessed by process(); presumably a back-pointer to the shared state
int n_seqs; // number of entries in seqs, as reported by bseq_read()
bseq1_t *seqs; // the sequences of this batch; released in step 2
} ktp_data_t;
static void *process(void *shared, int step, void *_data)
{
ktp_aux_t *aux = (ktp_aux_t*)shared;
ktp_data_t *data = (ktp_data_t*)_data;
int i;
if (step == 0) {
ktp_data_t *ret;
int64_t size = 0;
ret = calloc(1, sizeof(ktp_data_t));
ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
if (ret->seqs == 0) {
free(ret);
return 0;
}
if (!aux->copy_comment)
for (i = 0; i < ret->n_seqs; ++i) {
free(ret->seqs[i].comment);
ret->seqs[i].comment = 0;
}
for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
return ret;
} else if (step == 1) {
const mem_opt_t *opt = aux->opt;
const bwaidx_t *idx = aux->idx;
if (opt->flag & MEM_F_SMARTPE) {
bseq1_t *sep[2];
int n_sep[2];
mem_opt_t tmp_opt = *opt;
bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
if (n_sep[0]) {
tmp_opt.flag &= ~MEM_F_PE;
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
for (i = 0; i < n_sep[0]; ++i)
data->seqs[sep[0][i].id].sam = sep[0][i].sam;
}
if (n_sep[1]) {
tmp_opt.flag |= MEM_F_PE;
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
for (i = 0; i < n_sep[1]; ++i)
data->seqs[sep[1][i].id].sam = sep[1][i].sam;
}
free(sep[0]); free(sep[1]);
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
aux->n_processed += data->n_seqs;
return data;
} else if (step == 2) {
for (i = 0; i < data->n_seqs; ++i) {
if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
free(data->seqs[i].name); free(data->seqs[i].comment);
free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
}
free(data->seqs); free(data);
return 0;
}
return 0;
}
/*
 * Rescale the score-related options by the matching score.
 * opt0 records which options were given explicitly (non-zero = given). When
 * the matching score was given, every option of the list below that was NOT
 * given explicitly is multiplied by opt->a.
 */
static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
{
	const int scale = opt->a;

	if (!opt0->a) return; // matching score is unchanged: nothing to rescale

	if (opt0->b == 0) opt->b *= scale;
	if (opt0->T == 0) opt->T *= scale;
	if (opt0->o_del == 0) opt->o_del *= scale;
	if (opt0->e_del == 0) opt->e_del *= scale;
	if (opt0->o_ins == 0) opt->o_ins *= scale;
	if (opt0->e_ins == 0) opt->e_ins *= scale;
	if (opt0->zdrop == 0) opt->zdrop *= scale;
	if (opt0->pen_clip5 == 0) opt->pen_clip5 *= scale;
	if (opt0->pen_clip3 == 0) opt->pen_clip3 *= scale;
	if (opt0->pen_unpaired == 0) opt->pen_unpaired *= scale;
}
int main_mem(int argc, char *argv[])
{
mem_opt_t *opt, opt0;
int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
int fixed_chunk_size = -1;
gzFile fp, fp2 = 0;
char *p, *rg_line = 0, *hdr_line = 0;
const char *mode = 0;
void *ko = 0, *ko2 = 0;
mem_pestat_t pes[4];
ktp_aux_t aux;
memset(&aux, 0, sizeof(ktp_aux_t));
memset(pes, 0, 4 * sizeof(mem_pestat_t));
for (i = 0; i < 4; ++i) pes[i].failed = 1;
aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1;
else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'a') opt->flag |= MEM_F_ALL;
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
else if (c == '5') opt->flag |= MEM_F_PRIMARY5 | MEM_F_KEEP_SUPP_MAPQ; // always apply MEM_F_KEEP_SUPP_MAPQ with -5
else if (c == 'q') opt->flag |= MEM_F_KEEP_SUPP_MAPQ;
else if (c == 'u') opt->flag |= MEM_F_XB;
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
else if (c == 'v') bwa_verbose = atoi(optarg);
else if (c == 'j') ignore_alt = 1;
else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
else if (c == 'o' || c == 'f') xreopen(optarg, "wb", stdout);
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
else if (c == 'C') aux.copy_comment = 1;
else if (c == 'K') fixed_chunk_size = atoi(optarg);
else if (c == 'X') opt->mask_level = atof(optarg);
else if (c == 'F') bwa_dbg = atoi(optarg);
else if (c == 'h') {
opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
opt->max_XA_hits_alt = strtol(p+1, &p, 10);
}
else if (c == 'z') opt->XA_drop_ratio = atof(optarg);
else if (c == 'Q') {
opt0.mapQ_coef_len = 1;
opt->mapQ_coef_len = atoi(optarg);
opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
} else if (c == 'O') {
opt0.o_del = opt0.o_ins = 1;
opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
opt->o_ins = strtol(p+1, &p, 10);
} else if (c == 'E') {
opt0.e_del = opt0.e_ins = 1;
opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
opt->e_ins = strtol(p+1, &p, 10);
} else if (c == 'L') {
opt0.pen_clip5 = opt0.pen_clip3 = 1;
opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
opt->pen_clip3 = strtol(p+1, &p, 10);
} else if (c == 'R') {
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
} else if (c == 'H') {
if (optarg[0] != '@') {
FILE *fp;
if ((fp = fopen(optarg, "r")) != 0) {
char *buf;
buf = calloc(1, 0x10000);
while (fgets(buf, 0xffff, fp)) {
i = strlen(buf);
assert(buf[i-1] == '\n'); // a long line
buf[i-1] = 0;
hdr_line = bwa_insert_header(buf, hdr_line);
}
free(buf);
fclose(fp);
}
} else hdr_line = bwa_insert_header(optarg, hdr_line);
} else if (c == 'I') { // specify the insert size distribution
aux.pes0 = pes;
pes[1].failed = 0;
pes[1].avg = strtod(optarg, &p);
pes[1].std = pes[1].avg * .1;
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
pes[1].std = strtod(p+1, &p);
pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499);
if (pes[1].low < 1) pes[1].low = 1;
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
pes[1].high = (int)(strtod(p+1, &p) + .499);
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
pes[1].low = (int)(strtod(p+1, &p) + .499);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
}
else return 1;
}
if (rg_line) {
hdr_line = bwa_insert_header(rg_line, hdr_line);
free(rg_line);
}
if (opt->n_threads < 1) opt->n_threads = 1;
if (optind + 1 >= argc || optind + 3 < argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Algorithm options:\n\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n");
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, "\nScoring options:\n\n");
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overridden [null]\n");
fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n");
fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n");
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n");
fprintf(stderr, " -o FILE sam file to output results to [stdout]\n");
fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
fprintf(stderr, " -5 for split alignment, take the alignment with the smallest query (not genomic) coordinate as primary\n");
fprintf(stderr, " -q don't modify mapQ of supplementary alignments\n");
fprintf(stderr, " -K INT process INT input bases in each batch regardless of nThreads (for reproducibility) []\n");
fprintf(stderr, "\n");
fprintf(stderr, " -v INT verbosity level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >%.2f%% of the max score, output all in XA [%d,%d]\n",
opt->XA_drop_ratio * 100.0,
opt->max_XA_hits, opt->max_XA_hits_alt);
fprintf(stderr, " A second value may be given for alternate sequences.\n");
fprintf(stderr, " -z FLOAT The fraction of the max score to use with -h [%f].\n", opt->XA_drop_ratio);
fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, "\n");
free(opt);
return 1;
}
if (mode) {
if (strcmp(mode, "intractg") == 0) {
if (!opt0.o_del) opt->o_del = 16;
if (!opt0.o_ins) opt->o_ins = 16;
if (!opt0.b) opt->b = 9;
if (!opt0.pen_clip5) opt->pen_clip5 = 5;
if (!opt0.pen_clip3) opt->pen_clip3 = 5;
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) {
if (!opt0.o_del) opt->o_del = 1;
if (!opt0.e_del) opt->e_del = 1;
if (!opt0.o_ins) opt->o_ins = 1;
if (!opt0.e_ins) opt->e_ins = 1;
if (!opt0.b) opt->b = 1;
if (opt0.split_factor == 0.) opt->split_factor = 10.;
if (strcmp(mode, "ont2d") == 0) {
if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
if (!opt0.min_seed_len) opt->min_seed_len = 14;
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
} else {
if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
if (!opt0.min_seed_len) opt->min_seed_len = 17;
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
}
} else {
fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode);
return 1; // FIXME memory leak
}
} else update_a(opt, &opt0);
bwa_fill_scmat(opt->a, opt->b, opt->mat);
aux.idx = bwa_idx_load_from_shm(argv[optind]);
if (aux.idx == 0) {
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
} else if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
if (ignore_alt)
for (i = 0; i < aux.idx->bns->n_seqs; ++i)
aux.idx->bns->anns[i].is_alt = 0;
ko = kopen(argv[optind + 1], &fd);
if (ko == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
return 1;
}
fp = gzdopen(fd, "r");
aux.ks = kseq_init(fp);
if (optind + 2 < argc) {
if (opt->flag&MEM_F_PE) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
} else {
ko2 = kopen(argv[optind + 2], &fd2);
if (ko2 == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]);
return 1;
}
fp2 = gzdopen(fd2, "r");
aux.ks2 = kseq_init(fp2);
opt->flag |= MEM_F_PE;
}
}
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
free(hdr_line);
free(opt);
bwa_idx_destroy(aux.idx);
kseq_destroy(aux.ks);
err_gzclose(fp); kclose(ko);
if (aux.ks2) {
kseq_destroy(aux.ks2);
err_gzclose(fp2); kclose(ko2);
}
return 0;
}
/**
 * Entry point of the "fastmap" sub-command.
 *
 * Parses the options, loads the BWT and sequence-name parts of the index
 * named by the first positional argument, then reads every sequence from the
 * second positional argument and reports the matches returned by the SMEM
 * iterator. For each read the output is:
 *   - an "SQ" line with the read name and length (plus the bases with -p),
 *   - one "EM" line per match of at least `-l` bases, giving query start,
 *     query end and the interval size, followed by the reference
 *     coordinates when the interval size does not exceed `-w`, or "*"
 *     otherwise,
 *   - a terminating "//" line.
 *
 * @param argc  argument count, as passed by the command dispatcher
 * @param argv  argument vector, as passed by the command dispatcher
 * @return 0 on success; 1 on a bad option, missing arguments or when the
 *         index cannot be loaded
 */
int main_fastmap(int argc, char *argv[])
{
	int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
	uint64_t max_intv = 0;
	kseq_t *seq;
	bwtint_t k;
	gzFile fp;
	smem_i *itr;
	const bwtintv_v *a;
	bwaidx_t *idx;

	while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
		switch (c) {
			case 'p': print_seq = 1; break;
			case 'w': min_iwidth = atoi(optarg); break;
			case 'l': min_len = atoi(optarg); break;
			case 'i': min_intv = atoi(optarg); break;
			case 'I': max_intv = atol(optarg); break;
			case 'L': max_len = atoi(optarg); break;
			default: return 1;
		}
	}
	if (optind + 1 >= argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
		fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
		fprintf(stderr, " -w INT max interval size to find coordinates [%d]\n", min_iwidth);
		fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
		fprintf(stderr, " -L INT max MEM length [%d]\n", max_len);
		fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
		fprintf(stderr, "\n");
		return 1;
	}

	fp = xzopen(argv[optind + 1], "r");
	seq = kseq_init(fp);
	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) {
		/* do not leak the already opened read file on a failed index load */
		kseq_destroy(seq);
		err_gzclose(fp);
		return 1;
	}
	itr = smem_itr_init(idx->bwt);
	smem_config(itr, min_intv, max_len, max_intv);

	while (kseq_read(seq) >= 0) {
		/* seq.l is cast explicitly so the argument matches the %ld conversion */
		err_printf("SQ\t%s\t%ld", seq->name.s, (long)seq->seq.l);
		if (print_seq) {
			err_putchar('\t');
			err_puts(seq->seq.s);
		} else err_putchar('\n');
		/* translate bases through nst_nt4_table in place; index with an unsigned
		 * byte so that bytes >= 0x80 cannot produce a negative subscript */
		for (i = 0; i < seq->seq.l; ++i)
			seq->seq.s[i] = nst_nt4_table[(uint8_t)seq->seq.s[i]];
		smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
		while ((a = smem_next(itr)) != 0) {
			for (i = 0; i < a->n; ++i) {
				bwtintv_t *p = &a->a[i];
				/* info packs the query start in the high 32 bits and the end in the low 32 bits */
				if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
				err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
				if (p->x[2] <= min_iwidth) {
					/* small interval: resolve every occurrence to a reference coordinate */
					for (k = 0; k < p->x[2]; ++k) {
						bwtint_t pos;
						int len, is_rev, ref_id;
						len = (uint32_t)p->info - (p->info>>32);
						pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
						if (is_rev) pos -= len - 1;
						bns_cnt_ambi(idx->bns, pos, len, &ref_id);
						err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
					}
				} else err_puts("\t*");
				err_putchar('\n');
			}
		}
		err_puts("//");
	}

	smem_itr_destroy(itr);
	bwa_idx_destroy(idx);
	kseq_destroy(seq);
	err_gzclose(fp);
	return 0;
}

223
is.c 100644
View File

@ -0,0 +1,223 @@
/*
* sais.c for sais-lite
* Copyright (c) 2008 Yuta Mori All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <stdlib.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
typedef unsigned char ubyte_t;
/* Symbol i of the text T: read as an int when the character size `cs` equals
 * sizeof(int), otherwise as an unsigned byte. Expects T and cs in scope. */
#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])
/* Tally how often each of the k symbols occurs in T[0..n-1]; the tallies are
 * written to C[0..k-1] and later turned into bucket boundaries. */
static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
{
	int sym = 0, pos = 0;

	/* reset all k counters */
	while (sym < k) C[sym++] = 0;
	/* one increment per text position */
	while (pos < n) {
		C[chr(pos)] += 1;
		++pos;
	}
}
/* Convert the symbol counts C[0..k-1] into bucket boundaries B[0..k-1]:
 * the exclusive end offset of every bucket when `end` is non-zero, the start
 * offset otherwise. B may be the same array as C. */
static void getBuckets(const int *C, int *B, int k, int end)
{
	int sym, total = 0;

	for (sym = 0; sym < k; ++sym) {
		const int cnt = C[sym]; /* fetched before the store because B may alias C */
		total += cnt;
		B[sym] = end ? total : total - cnt;
	}
}
/*
 * compute SA
 *
 * Induced-sorting step: starting from the suffixes already placed in SA,
 * fill in the remaining entries with two scans over SA[0..n-1].
 *   T, cs  text and its character size (read through the chr() macro)
 *   SA     suffix array being completed in place
 *   C, B   count array and bucket-boundary array of k entries; they may be
 *          the same array, in which case counts are recomputed before each
 *          bucket computation
 *   n, k   text length and alphabet size
 * Entries are temporarily stored bit-complemented (~j) as a marker; the
 * complement is undone as the scans pass over them.
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
int *b, i, j;
int c0, c1;
/* compute SAl */
/* C aliasing B means the counts were overwritten by offsets: recount first */
if (C == B) getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 0); /* find starts of buckets */
/* seed the scan with the last suffix, placed at the head of its bucket */
j = n - 1;
b = SA + B[c1 = chr(j)];
*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
/* left-to-right scan; b caches the write cursor of the current bucket c1 */
for (i = 0; i < n; ++i) {
j = SA[i], SA[i] = ~j;
if (0 < j) {
--j;
if ((c0 = chr(j)) != c1) {
/* switching buckets: save the cursor of c1, load the one of c0 */
B[c1] = b - SA;
b = SA + B[c1 = c0];
}
*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
}
}
/* compute SAs */
if (C == B) getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
/* right-to-left scan; entries are written downwards from the bucket ends */
for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
if (0 < (j = SA[i])) {
--j;
if ((c0 = chr(j)) != c1) {
B[c1] = b - SA;
b = SA + B[c1 = c0];
}
*--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
} else SA[i] = ~j; /* non-positive entry: undo the complement marker */
}
}
/*
 * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working
 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
 *
 *   T   input text; each symbol is read through chr(), i.e. as an int when
 *       cs == sizeof(int) and as an unsigned byte otherwise
 *   SA  output array; SA[n .. n+fs-1] is extra space that may be used for
 *       the bucket arrays and for the reduced problem
 *   fs  number of free ints available after SA[n-1]
 *   n   length of T
 *   k   alphabet size
 *   cs  character size of T
 * Returns 0 on success and -2 when a bucket array cannot be allocated or the
 * recursive call fails.
 */
static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
{
int *C, *B, *RA;
int i, j, c, m, p, q, plen, qlen, name;
int c0, c1;
int diff;
/* stage 1: reduce the problem by at least 1/2 sort all the
 * S-substrings */
/* take the bucket arrays from the free space behind SA when they fit,
 * otherwise allocate one shared array (C == B) on the heap */
if (k <= fs) {
C = SA + n;
B = (k <= (fs - k)) ? C + k : C;
} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = 0; i < n; ++i) SA[i] = 0;
/* right-to-left scan of T; c tracks whether the scan is inside a
 * non-increasing run, and a position is stored when the run ends */
for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < (c1 + c)) c = 1;
else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
}
induceSA(T, SA, C, B, n, k, cs);
if (fs < k) free(C); /* only the heap-allocated variant is released */
/* compact all the sorted substrings into the first m items of SA
 * 2*m must be not larger than n (proveable) */
for (i = 0, m = 0; i < n; ++i) {
p = SA[i];
if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
if ((j < n) && (c0 < c1)) SA[m++] = p;
}
}
for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */
/* store the length of all substrings */
for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < (c1 + c)) c = 1;
else if (c != 0) {
SA[m + ((i + 1) >> 1)] = j - i - 1;
j = i + 1;
c = 0;
}
}
/* find the lexicographic names of all substrings */
/* a new name is issued whenever the substring differs from the previous
 * one (q, qlen) in length or content */
for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
if (plen == qlen) {
for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++);
if (j == plen) diff = 0;
}
if (diff != 0) ++name, q = p, qlen = plen;
SA[m + (p >> 1)] = name;
}
/* stage 2: solve the reduced problem recurse if names are not yet
 * unique */
if (name < m) {
/* the reduced string RA occupies the last m ints of the SA + free space area */
RA = SA + n + fs - m;
for (i = n - 1, j = m - 1; m <= i; --i) {
if (SA[i] != 0) RA[j--] = SA[i] - 1;
}
if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < (c1 + c)) c = 1;
else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */
}
for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
}
/* stage 3: induce the result for the original problem */
if (k <= fs) {
C = SA + n;
B = (k <= (fs - k)) ? C + k : C;
} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
/* put all left-most S characters into their buckets */
getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
/* move the m sorted entries to the ends of their buckets, last one first */
for (i = m - 1; 0 <= i; --i) {
j = SA[i], SA[i] = 0;
SA[--B[chr(j)]] = j;
}
induceSA(T, SA, C, B, n, k, cs);
if (fs < k) free(C);
return 0;
}
/**
 * Builds the suffix array of a byte string.
 * SA[0] always receives n; the suffixes of T follow in SA[1..n].
 * @param T[0..n-1] The input string.
 * @param SA[0..n] The output array of suffixes (n + 1 entries).
 * @param n The length of the given string.
 * @return 0 on success, -1 on invalid arguments, otherwise the error code
 * of sais_main()
 */
int is_sa(const ubyte_t *T, int *SA, int n)
{
	if (T == NULL || SA == NULL || n < 0) return -1;
	SA[0] = n;
	/* general case: sort with a 256-symbol byte alphabet and no spare space */
	if (n > 1) return sais_main(T, SA+1, 0, n, 256, 1);
	/* trivial inputs: a single character has exactly one suffix */
	if (n == 1) SA[1] = 0;
	return 0;
}
/**
 * Constructs the burrows-wheeler transformed string of a given string.
 * The transform overwrites T in place; a temporary array of n + 1 ints is
 * allocated for the suffix array and released before returning.
 * @param T[0..n-1] The input string.
 * @param n The length of the given string.
 * @return The primary index if no error occurred, -1 otherwise (invalid
 * arguments, allocation failure or suffix sorting failure).
 */
int is_bwt(ubyte_t *T, int n)
{
	int *SA, i, primary = 0;

	/* reject bad arguments before sizing the allocation from n */
	if (T == NULL || n < 0) return -1;
	SA = (int*)calloc(n+1, sizeof(int));
	if (is_sa(T, SA, n)) { /* is_sa() also rejects SA == NULL */
		free(SA); /* previously leaked on this path; free(NULL) is a no-op */
		return -1;
	}
	/* replace every suffix start by the character preceding it; the suffix
	 * starting at 0 has no predecessor and marks the primary index */
	for (i = 0; i <= n; ++i) {
		if (SA[i] == 0) primary = i;
		else SA[i] = T[SA[i] - 1];
	}
	/* copy the n characters back into T, skipping the primary slot */
	for (i = 0; i < primary; ++i) T[i] = SA[i];
	for (; i < n; ++i) T[i] = SA[i + 1];
	free(SA);
	return primary;
}

388
kbtree.h 100644
View File

@ -0,0 +1,388 @@
/*-
* Copyright 1997-1999, 2001, John-Mark Gurney.
* 2008-2009, Attractive Chaos <attractor@live.co.uk>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef __AC_KBTREE_H
#define __AC_KBTREE_H
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Header at the start of every B-tree node: a one-bit flag telling whether
 * the node is internal (has child pointers) and the number of keys stored in
 * it. Keys, and for internal nodes child pointers, follow the header in the
 * same allocation (see __KB_KEY / __KB_PTR). */
typedef struct {
int32_t is_internal:1, n:31;
} kbnode_t;
/* Key array of node x: starts 4 bytes into the node (presumably
 * sizeof(kbnode_t) -- the offset is hard-coded). */
#define __KB_KEY(type, x) ((type*)((char*)x + 4))
/* Child-pointer array of node x: starts btr->off_ptr bytes into the node. */
#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr))
/* Declares the tree handle type kbtree_<name>_t.
 *   root             root node
 *   off_ptr          byte offset of the child-pointer array inside a node
 *   ilen, elen       allocation size of a node with / without the pointer array
 *   t, n             t as computed by kb_init_<name>(), and n = 2*t - 1, the
 *                    key count at which a node is split
 *   n_keys, n_nodes  running totals maintained by init/put/del
 *   off_key          not assigned by any code visible in this header */
#define __KB_TREE_T(name) \
typedef struct { \
kbnode_t *root; \
int off_key, off_ptr, ilen, elen; \
int n, t; \
int n_keys, n_nodes; \
} kbtree_##name##_t;
/* Defines kb_init_<name>(size): creates an empty tree whose nodes occupy
 * roughly `size` bytes each. t is derived from how many (pointer, key) pairs
 * fit into `size` after the 4-byte header and one extra pointer; if t < 2
 * the function returns 0. Node sizes are rounded up to a multiple of 4. The
 * root is allocated zero-filled with the internal-node size.
 * NOTE(review): neither calloc() result is checked for NULL. */
#define __KB_INIT(name, key_t) \
kbtree_##name##_t *kb_init_##name(int size) \
{ \
kbtree_##name##_t *b; \
b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \
b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
if (b->t < 2) { \
free(b); return 0; \
} \
b->n = 2 * b->t - 1; \
b->off_ptr = 4 + b->n * sizeof(key_t); \
b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
b->elen = (b->off_ptr + 3) >> 2 << 2; \
b->root = (kbnode_t*)calloc(1, b->ilen); \
++b->n_nodes; \
return b; \
}
/* Frees every node of tree b and then b itself; a null b is accepted.
 * Nodes are visited with an explicit stack that starts with room for 8
 * entries and doubles when full: a leaf (or null entry) is freed directly,
 * an internal node first pushes its non-null children and is then freed.
 * NOTE(review): the calloc()/realloc() results are not checked for NULL. */
#define __kb_destroy(b) do { \
int i, max = 8; \
kbnode_t *x, **top, **stack = 0; \
if (b) { \
top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \
*top++ = (b)->root; \
while (top != stack) { \
x = *--top; \
if (x == 0 || x->is_internal == 0) { free(x); continue; } \
for (i = 0; i <= x->n; ++i) \
if (__KB_PTR(b, x)[i]) { \
if (top - stack == max) { \
max <<= 1; \
stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \
top = stack + (max>>1); \
} \
*top++ = __KB_PTR(b, x)[i]; \
} \
free(x); \
} \
} \
free(b); free(stack); \
} while (0)
/* Stores the first key of the left-most leaf of tree b into ret.
 * The descent is driven by the is_internal flag rather than by reading
 * child slot 0: leaves created by a split are allocated with only `elen`
 * bytes and have no child-pointer array, so reading their slot 0 would be
 * an out-of-bounds access.
 * NOTE(review): for an empty tree the value stored in ret is whatever the
 * zero-filled root holds at key slot 0. */
#define __kb_get_first(key_t, b, ret) do { \
		kbnode_t *__x = (b)->root; \
		while (__x->is_internal) /* only internal nodes own child pointers */ \
			__x = __KB_PTR(b, __x)[0]; \
		(ret) = __KB_KEY(key_t, __x)[0]; \
	} while (0)
/* Defines __kb_get_aux_<name>(x, k, r): in-node search by linear scan.
 * The middle key selects the half of the node to examine, which is then
 * scanned from its high end downwards while *k compares less than the key.
 * Returns the index where the scan stopped, or -1 for an empty node; the
 * result of the last comparison is stored through r when r is non-null.
 * This variant is not instantiated by KBTREE_INIT, which uses
 * __KB_GET_AUX1 instead. */
#define __KB_GET_AUX0(name, key_t, __cmp) \
static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
{ \
int tr, *rr, begin, end, n = x->n >> 1; \
if (x->n == 0) return -1; \
if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \
begin = 0; end = n; \
} else { begin = n; end = x->n - 1; } \
rr = r? r : &tr; \
n = end; \
while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \
return n; \
}
/* Defines __kb_getp_aux_<name>(x, k, r): in-node search by binary search.
 * Finds the first key that does not compare less than *k, then:
 *   - every key is less than *k: stores 1 and returns x->n - 1;
 *   - that key equals *k: stores 0 and returns its index;
 *   - otherwise: stores the negative comparison result and returns the
 *     index just before it (possibly -1).
 * The value is stored through r when r is non-null. An empty node returns
 * -1 without storing anything. */
#define __KB_GET_AUX1(name, key_t, __cmp) \
static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
{ \
int tr, *rr, begin = 0, end = x->n; \
if (x->n == 0) return -1; \
rr = r? r : &tr; \
while (begin < end) { \
int mid = (begin + end) >> 1; \
if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
else end = mid; \
} \
if (begin == x->n) { *rr = 1; return x->n - 1; } \
if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \
return begin; \
}
/* Defines the lookup functions.
 * kb_getp_<name>(b, k) walks down from the root: in each node it searches
 * for *k, returns a pointer to the stored key on an exact match, and
 * otherwise descends into child i + 1; reaching a leaf without a match
 * returns 0.
 * kb_get_<name>(b, k) is the by-value convenience wrapper. */
#define __KB_GET(name, key_t) \
static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
int i, r = 0; \
kbnode_t *x = b->root; \
while (x) { \
i = __kb_getp_aux_##name(x, k, &r); \
if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \
if (x->is_internal == 0) return 0; \
x = __KB_PTR(b, x)[i + 1]; \
} \
return 0; \
} \
static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
{ \
return kb_getp_##name(b, &k); \
}
/* Defines the neighbour query functions.
 * kb_intervalp_<name>(b, k, lower, upper) sets both outputs to 0, then walks
 * down from the root. On an exact match both outputs point at the stored
 * key. Otherwise, in every visited node, *lower is updated to the key at the
 * search index (when there is one) and *upper to the key right after it
 * (when there is one), before descending into child i + 1. After the walk
 * *lower / *upper therefore hold the closest stored keys seen on either side
 * of *k, or 0 when none was seen on that side.
 * kb_interval_<name>() is the by-value convenience wrapper. */
#define __KB_INTERVAL(name, key_t) \
static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
{ \
int i, r = 0; \
kbnode_t *x = b->root; \
*lower = *upper = 0; \
while (x) { \
i = __kb_getp_aux_##name(x, k, &r); \
if (i >= 0 && r == 0) { \
*lower = *upper = &__KB_KEY(key_t, x)[i]; \
return; \
} \
if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \
if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \
if (x->is_internal == 0) return; \
x = __KB_PTR(b, x)[i + 1]; \
} \
} \
static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
{ \
kb_intervalp_##name(b, &k, lower, upper); \
}
/* Defines the insertion functions.
 * __kb_split_<name>(b, x, i, y): y is child i of x and holds 2*t - 1 keys.
 *   A new sibling z receives the upper t - 1 keys (and, for an internal y,
 *   the upper t child pointers); y keeps the lower t - 1 keys; the middle
 *   key moves up into x at position i and z becomes child i + 1 of x.
 * __kb_putp_aux_<name>(b, x, k): inserts *k below x, which must not be
 *   full. In a leaf the key is inserted after the search index. In an
 *   internal node the target child is split first when it is full, then the
 *   insertion recurses into the proper child.
 * kb_putp_<name>(b, k): public entry; when the root is full a new internal
 *   root is created above it and the old root is split before inserting.
 * kb_put_<name>(b, k): by-value convenience wrapper.
 * NOTE(review): no check for an already present key is made here, and
 * calloc() results are not checked for NULL. */
#define __KB_PUT(name, key_t, __cmp) \
/* x must be an internal node */ \
static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
{ \
kbnode_t *z; \
z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \
++b->n_nodes; \
z->is_internal = y->is_internal; \
z->n = b->t - 1; \
memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
y->n = b->t - 1; \
memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
__KB_PTR(b, x)[i + 1] = z; \
memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \
++x->n; \
} \
static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
{ \
int i = x->n - 1; \
if (x->is_internal == 0) { \
i = __kb_getp_aux_##name(x, k, 0); \
if (i != x->n - 1) \
memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
__KB_KEY(key_t, x)[i + 1] = *k; \
++x->n; \
} else { \
i = __kb_getp_aux_##name(x, k, 0) + 1; \
if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \
__kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \
if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \
} \
__kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \
} \
} \
static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
kbnode_t *r, *s; \
++b->n_keys; \
r = b->root; \
if (r->n == 2 * b->t - 1) { \
++b->n_nodes; \
s = (kbnode_t*)calloc(1, b->ilen); \
b->root = s; s->is_internal = 1; s->n = 0; \
__KB_PTR(b, s)[0] = r; \
__kb_split_##name(b, s, 0, r); \
r = s; \
} \
__kb_putp_aux_##name(b, r, k); \
} \
static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
{ \
kb_putp_##name(b, &k); \
}
/* Defines the deletion functions.
 * __kb_delp_aux_<name>(b, x, k, s) removes one key from the subtree rooted
 * at x and returns the removed key. The mode s selects which key:
 *   s == 0  the key equal to *k,
 *   s == 1  the last key of the right-most leaf (k may be null),
 *   s == 2  the first key of the left-most leaf (k may be null).
 * In an internal node holding the key (r == 0): if the left or right child
 * has at least t keys, the key is replaced by the neighbouring key removed
 * from that child (modes 1 / 2); if both children have t - 1 keys they are
 * merged around the key and the deletion continues in the merged node.
 * Otherwise the deletion descends into child i + 1; a child holding only
 * t - 1 keys is first refilled by borrowing a key from a sibling with at
 * least t keys, or else merged with a sibling.
 * kb_delp_<name>(b, k) is the public entry; it also drops an internal root
 * that has become empty and promotes its single child.
 * kb_del_<name>(b, k) is the by-value convenience wrapper.
 * NOTE(review): the leaf case removes the key at the search index without
 * checking that it equals *k, and n_keys / n_nodes are not adjusted when
 * nodes are merged -- callers presumably must only delete existing keys;
 * confirm before relying on the counters. */
#define __KB_DEL(name, key_t) \
static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
{ \
int yn, zn, i, r = 0; \
kbnode_t *xp, *y, *z; \
key_t kp; \
if (x == 0) return *k; \
if (s) { /* s can only be 0, 1 or 2 */ \
r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
i = s == 1? x->n - 1 : -1; \
} else i = __kb_getp_aux_##name(x, k, &r); \
if (x->is_internal == 0) { \
if (s == 2) ++i; \
kp = __KB_KEY(key_t, x)[i]; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
--x->n; \
return kp; \
} \
if (r == 0) { \
if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \
xp = __KB_PTR(b, x)[i]; \
kp = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
return kp; \
} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \
xp = __KB_PTR(b, x)[i + 1]; \
kp = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
return kp; \
} else if (yn == b->t - 1 && zn == b->t - 1) { \
y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
__KB_KEY(key_t, y)[y->n++] = *k; \
memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
y->n += z->n; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
--x->n; \
free(z); \
return __kb_delp_aux_##name(b, y, k, s); \
} \
} \
++i; \
if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \
if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \
memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
--y->n; ++xp->n; \
} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
--y->n; \
memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
y->n += xp->n; \
memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
--x->n; \
free(xp); \
xp = y; \
} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
xp->n += y->n; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
--x->n; \
free(y); \
} \
} \
return __kb_delp_aux_##name(b, xp, k, s); \
} \
static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
kbnode_t *x; \
key_t ret; \
ret = __kb_delp_aux_##name(b, b->root, k, 0); \
--b->n_keys; \
if (b->root->n == 0 && b->root->is_internal) { \
--b->n_nodes; \
x = b->root; \
b->root = __KB_PTR(b, x)[0]; \
free(x); \
} \
return ret; \
} \
static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
{ \
return kb_delp_##name(b, &k); \
}
/* One frame of the explicit stack used by __kb_traverse: a node and the
 * index of the child/key currently being visited in it. */
typedef struct {
kbnode_t *x;
int i;
} __kbstack_t;
/* Visits every key of tree b, calling __func(&key) on each one, without
 * recursion. The inner loop descends through child __kp->i of the node on
 * top of the stack, pushing a null node below leaves; after popping, the
 * key at the current index of the parent frame is reported and the index
 * advanced, which yields keys in stored (in-order) sequence. The stack
 * starts with 8 frames and doubles when it is about to overflow.
 * NOTE(review): the calloc()/realloc() results are not checked for NULL. */
#define __kb_traverse(key_t, b, __func) do { \
int __kmax = 8; \
__kbstack_t *__kstack, *__kp; \
__kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
__kp->x = (b)->root; __kp->i = 0; \
for (;;) { \
while (__kp->x && __kp->i <= __kp->x->n) { \
if (__kp - __kstack == __kmax - 1) { \
__kmax <<= 1; \
__kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
__kp = __kstack + (__kmax>>1) - 1; \
} \
(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
++__kp; \
} \
--__kp; \
if (__kp >= __kstack) { \
if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
++__kp->i; \
} else break; \
} \
free(__kstack); \
} while (0)
/* Instantiates a complete B-tree for key type key_t under the suffix `name`:
 * the handle type, kb_init, the binary-search node lookup, get, interval,
 * put and del functions. __cmp(a, b) must return a negative, zero or
 * positive value like strcmp(). */
#define KBTREE_INIT(name, key_t, __cmp) \
__KB_TREE_T(name) \
__KB_INIT(name, key_t) \
__KB_GET_AUX1(name, key_t, __cmp) \
__KB_GET(name, key_t) \
__KB_INTERVAL(name, key_t) \
__KB_PUT(name, key_t, __cmp) \
__KB_DEL(name, key_t)
/* Node size in bytes offered as a default argument for kb_init(). */
#define KB_DEFAULT_SIZE 512
/* Name-dispatching front ends: each expands to the function or type that
 * KBTREE_INIT generated for the given `name`. The kb_*p variants take the
 * key by pointer, the others by value. */
#define kbtree_t(name) kbtree_##name##_t
#define kb_init(name, s) kb_init_##name(s)
#define kb_destroy(name, b) __kb_destroy(b)
#define kb_get(name, b, k) kb_get_##name(b, k)
#define kb_put(name, b, k) kb_put_##name(b, k)
#define kb_del(name, b, k) kb_del_##name(b, k)
#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
#define kb_getp(name, b, k) kb_getp_##name(b, k)
#define kb_putp(name, b, k) kb_putp_##name(b, k)
#define kb_delp(name, b, k) kb_delp_##name(b, k)
#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
/* Number of keys currently counted in the tree. */
#define kb_size(b) ((b)->n_keys)
/* Ready-made comparators: three-way compare via operator< (yields -1, 0 or
 * 1), and strcmp() for C strings. */
#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
#define kb_str_cmp(a, b) strcmp(a, b)
#endif

614
khash.h 100644
View File

@ -0,0 +1,614 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
int ret, is_missing;
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
k = kh_get(32, h, 5);
kh_del(32, h, k);
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) kh_value(h, k) = 1;
kh_destroy(32, h);
return 0;
}
*/
/*
2011-12-29 (0.2.7):
* Minor code clean up; no actual effect.
2011-09-16 (0.2.6):
* The capacity is a power of 2. This seems to dramatically improve the
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- http://code.google.com/p/ulib/
- http://nothings.org/computer/judy/
* Allow to optionally use linear probing which usually has better
performance for random input. Double hashing is still the default as it
is more robust to certain non-random input.
* Added Wang's integer hash function (not used by default). This hash
function is more robust to certain non-random input.
2011-02-14 (0.2.5):
* Allow to declare global functions.
2009-09-26 (0.2.4):
* Improve portability
2008-09-19 (0.2.3):
* Corrected the example
* Improved interfaces
2008-09-11 (0.2.2):
* Improved speed a little in kh_put()
2008-09-10 (0.2.1):
* Added kh_clear()
* Fixed a compiling error
2008-09-02 (0.2.0):
* Changed to token concatenation which increases flexibility.
2008-08-31 (0.1.2):
* Fixed a bug in kh_get(), which has not been tested previously.
2008-08-31 (0.1.1):
* Added destructor
*/
#ifndef __AC_KHASH_H
#define __AC_KHASH_H
/*!
@header
Generic hash table library.
*/
#define AC_VERSION_KHASH_H "0.2.6"
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* compiler specific configuration */
/* khint32_t: an unsigned type that is exactly 32 bits wide, chosen from
 * unsigned int or unsigned long according to <limits.h>.
 * NOTE(review): left undefined when neither type is 32 bits wide. */
#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#endif
/* khint64_t: unsigned long when it is as wide as unsigned long long,
 * unsigned long long otherwise. */
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef unsigned long long khint64_t;
#endif
/* kh_inline: spelling of the inline keyword for the compiler in use. */
#ifdef _MSC_VER
#define kh_inline __inline
#else
#define kh_inline inline
#endif
/* khint_t is the integer type of bucket counts and indices; khiter_t is the
 * iterator type, an index into the bucket array. */
typedef khint32_t khint_t;
typedef khint_t khiter_t;
/*
 * Bucket state flags: two bits per bucket, sixteen buckets per 32-bit word.
 * Within a bucket's pair, bit 1 (value 2) means "empty" and bit 0 (value 1)
 * means "deleted". The test macros return the masked bits (non-zero when
 * set); the setters modify the flag word in place.
 * Every macro argument is parenthesized so that expressions such as
 * `base + off` can be passed safely.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))
/*
 * Probe step for key hash k under bucket mask m: always 1 with KHASH_LINEAR,
 * otherwise an odd value derived from k and masked by m. The expansion is
 * wrapped in parentheses so it can be embedded in a larger expression.
 */
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m))
#endif
/* Number of 32-bit flag words needed for m buckets (at least one). */
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
/* Load factor used to derive upper_bound from n_buckets when a table is resized. */
static const double __ac_HASH_UPPER = 0.77;
/*
 * Declares the table struct kh_<name>_t:
 *   n_buckets    number of buckets
 *   size         number of live elements
 *   n_occupied   number of buckets that are not empty (live or deleted)
 *   upper_bound  n_occupied threshold at which kh_put triggers a resize
 *   flags        two state bits per bucket
 *   keys, vals   parallel arrays indexed by bucket
 */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t;
/* Declares (extern) prototypes for the functions that __KHASH_IMPL defines. */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
extern kh_##name##_t *kh_init_##name(void); \
extern void kh_destroy_##name(kh_##name##_t *h); \
extern void kh_clear_##name(kh_##name##_t *h); \
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
/*
 * Defines the functions of one hash-table instantiation:
 *   kh_init_<name>     allocate a zeroed, empty table (NULL if allocation fails)
 *   kh_destroy_<name>  free the arrays and the table itself; NULL is accepted
 *   kh_clear_<name>    mark every bucket empty without releasing memory
 *   kh_get_<name>      return the bucket of `key`, or n_buckets when absent
 *                      (0 when the table has no buckets, which is also n_buckets)
 *   kh_resize_<name>   rehash in place into new_n_buckets (rounded up to a
 *                      power of two, minimum 4); returns 0, or -1 when an
 *                      allocation fails, in which case the existing contents
 *                      stay intact and no memory is leaked
 *   kh_put_<name>      return the bucket for `key`; *ret is 0 if the key was
 *                      already present, 1 if an empty bucket was taken, 2 if a
 *                      deleted bucket was reused, -1 if a required resize failed
 *   kh_del_<name>      mark bucket x deleted when it holds live data
 * SCOPE is the storage-class prefix; kh_is_map selects whether vals is used.
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	SCOPE kh_##name##_t *kh_init_##name(void) { \
		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
	} \
	SCOPE void kh_destroy_##name(kh_##name##_t *h) \
	{ \
		if (h) { \
			kfree((void *)h->keys); kfree(h->flags); \
			kfree((void *)h->vals); \
			kfree(h); \
		} \
	} \
	SCOPE void kh_clear_##name(kh_##name##_t *h) \
	{ \
		if (h && h->flags) { \
			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
			h->size = h->n_occupied = 0; \
		} \
	} \
	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	{ \
		if (h->n_buckets) { \
			khint_t inc, k, i, last, mask; \
			mask = h->n_buckets - 1; \
			k = __hash_func(key); i = k & mask; \
			inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				i = (i + inc) & mask; \
				if (i == last) return h->n_buckets; \
			} \
			return __ac_iseither(h->flags, i)? h->n_buckets : i; \
		} else return 0; \
	} \
	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
		khint32_t *new_flags = 0; \
		khint_t j = 1; \
		{ \
			kroundup32(new_n_buckets); \
			if (new_n_buckets < 4) new_n_buckets = 4; \
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
			else { /* hash table size to be changed (shrink or expand); rehash */ \
				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (!new_flags) return -1; \
				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (h->n_buckets < new_n_buckets) {	/* expand */ \
					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (!new_keys) { kfree(new_flags); return -1; } /* do not leak the working space */ \
					h->keys = new_keys; \
					if (kh_is_map) { \
						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
						if (!new_vals) { kfree(new_flags); return -1; } \
						h->vals = new_vals; \
					} \
				} /* otherwise shrink */ \
			} \
		} \
		if (j) { /* rehashing is needed */ \
			for (j = 0; j != h->n_buckets; ++j) { \
				if (__ac_iseither(h->flags, j) == 0) { \
					khkey_t key = h->keys[j]; \
					khval_t val; \
					khint_t new_mask; \
					new_mask = new_n_buckets - 1; \
					if (kh_is_map) val = h->vals[j]; \
					__ac_set_isdel_true(h->flags, j); \
					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
						khint_t inc, k, i; \
						k = __hash_func(key); \
						i = k & new_mask; \
						inc = __ac_inc(k, new_mask); \
						while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
						__ac_set_isempty_false(new_flags, i); \
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
						} else { /* write the element and jump out of the loop */ \
							h->keys[i] = key; \
							if (kh_is_map) h->vals[i] = val; \
							break; \
						} \
					} \
				} \
			} \
			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
				/* if a shrinking realloc fails, the old (larger) array is still valid, so keep it */ \
				khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (shrunk_keys) h->keys = shrunk_keys; \
				if (kh_is_map) { \
					khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
					if (shrunk_vals) h->vals = shrunk_vals; \
				} \
			} \
			kfree(h->flags); /* free the working space */ \
			h->flags = new_flags; \
			h->n_buckets = new_n_buckets; \
			h->n_occupied = h->size; \
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		} \
		return 0; \
	} \
	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{ \
		khint_t x; \
		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
			if (h->n_buckets > (h->size<<1)) { \
				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
					*ret = -1; return h->n_buckets; \
				} \
			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
				*ret = -1; return h->n_buckets; \
			} \
		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
		{ \
			khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
			else { \
				inc = __ac_inc(k, mask); last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i; \
					i = (i + inc) & mask; \
					if (i == last) { x = site; break; } \
				} \
				if (x == h->n_buckets) { \
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i; \
				} \
			} \
		} \
		if (__ac_isempty(h->flags, x)) { /* not present at all */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; ++h->n_occupied; \
			*ret = 1; \
		} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; \
			*ret = 2; \
		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
		return x; \
	} \
	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
	{ \
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
			__ac_set_isdel_true(h->flags, x); \
			--h->size; \
		} \
	}
/* Header-side declaration: the table type plus extern prototypes, no function bodies. */
#define KHASH_DECLARE(name, khkey_t, khval_t) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
/* Full instantiation (type and function bodies) with a caller-chosen storage class SCOPE. */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* Full instantiation with every function declared `static kh_inline`. */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@abstract Integer hash function
@param key The integer [khint32_t]
@return The hash value [khint_t]
*/
#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
@abstract Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract 64-bit integer hash function
@param key The integer [khint64_t]
@return The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
@abstract 64-bit integer comparison function
*/
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract const char* hash function
@param s Pointer to a null terminated string
@return The hash value
*/
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
	/* Seed with the first character; the empty string hashes to 0. */
	khint_t hash = (khint_t)*s;
	if (hash == 0) return hash;
	/* hash = hash * 31 + c for every following character. */
	while (*++s != 0)
		hash = (hash << 5) - hash + (khint_t)*s;
	return hash;
}
/*! @function
@abstract Another interface to const char* hash function
@param key Pointer to a null terminated string [const char*]
@return The hash value [khint_t]
*/
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function
@abstract Const char* comparison function
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
/* Integer mixing function: six add/xor-shift rounds applied to `key`. */
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
	khint_t x = key;
	x = x + ~(x << 15);
	x = x ^ (x >> 10);
	x = x + (x << 3);
	x = x ^ (x >> 6);
	x = x + ~(x << 11);
	x = x ^ (x >> 16);
	return x;
}
/* Alternative integer hash: applies __ac_Wang_hash to the macro argument itself. */
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
/* --- END OF HASH FUNCTIONS --- */
/* Other convenient macros... */
/*!
@abstract Type of the hash table.
@param name Name of the hash table [symbol]
*/
#define khash_t(name) kh_##name##_t
/*! @function
@abstract Initiate a hash table.
@param name Name of the hash table [symbol]
@return Pointer to the hash table [khash_t(name)*]
*/
#define kh_init(name) kh_init_##name()
/*! @function
@abstract Destroy a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_destroy(name, h) kh_destroy_##name(h)
/*! @function
@abstract Reset a hash table without deallocating memory.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
*/
#define kh_clear(name, h) kh_clear_##name(h)
/*! @function
@abstract Resize a hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param s New size [khint_t]
*/
#define kh_resize(name, h, s) kh_resize_##name(h, s)
/*! @function
@abstract Insert a key to the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@param r Extra return code: 0 if the key is present in the hash table;
1 if the bucket is empty (never used); 2 if the element in
the bucket has been deleted [int*]
@return Iterator to the inserted element [khint_t]
*/
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
/*! @function
@abstract Retrieve a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Key [type of keys]
@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
*/
#define kh_get(name, h, k) kh_get_##name(h, k)
/*! @function
@abstract Remove a key from the hash table.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param k Iterator to the element to be deleted [khint_t]
*/
#define kh_del(name, h, k) kh_del_##name(h, k)
/*! @function
@abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return 1 if containing data; 0 otherwise [int]
*/
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
/*! @function
@abstract Get key given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return Key [type of keys]
*/
#define kh_key(h, x) ((h)->keys[x])
/*! @function
@abstract Get value given an iterator
@param h Pointer to the hash table [khash_t(name)*]
@param x Iterator to the bucket [khint_t]
@return Value [type of values]
@discussion For hash sets, calling this results in segfault.
*/
#define kh_val(h, x) ((h)->vals[x])
/*! @function
@abstract Alias of kh_val()
*/
#define kh_value(h, x) ((h)->vals[x])
/*! @function
@abstract Get the start iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The start iterator [khint_t]
*/
#define kh_begin(h) (khint_t)(0)
/*! @function
@abstract Get the end iterator
@param h Pointer to the hash table [khash_t(name)*]
@return The end iterator [khint_t]
*/
#define kh_end(h) ((h)->n_buckets)
/*! @function
@abstract Get the number of elements in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of elements in the hash table [khint_t]
*/
#define kh_size(h) ((h)->size)
/*! @function
@abstract Get the number of buckets in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@return Number of buckets in the hash table [khint_t]
*/
#define kh_n_buckets(h) ((h)->n_buckets)
/*! @function
@abstract Iterate over the entries in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param kvar Variable to which key will be assigned
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(kvar) = kh_key(h,__i); \
(vvar) = kh_val(h,__i); \
code; \
} }
/*! @function
@abstract Iterate over the values in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(vvar) = kh_val(h,__i); \
code; \
} }
/* More convenient interfaces */
/*! @function
@abstract Instantiate a hash set containing integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash set containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
@abstract Instantiate a hash set containing const char* keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_STR(name) \
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
/*! @function
@abstract Instantiate a hash map containing const char* keys
@param name Name of the hash table [symbol]
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_STR(name, khval_t) \
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
#endif /* __AC_KHASH_H */

374
kopen.c 100644
View File

@ -0,0 +1,374 @@
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/wait.h>
#include <sys/types.h>
#ifndef _WIN32
#include <netdb.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#endif
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
#ifdef _WIN32
#define _KO_NO_NET
#endif
#ifndef _KO_NO_NET
/*
 * Wait up to five seconds for `fd` to become readable (is_read != 0) or
 * writable (is_read == 0).
 * Returns the select() result: >0 when ready, 0 on timeout, -1 on error.
 * A descriptor that cannot be stored in an fd_set is reported as an error
 * instead of being passed to FD_SET().
 */
static int socket_wait(int fd, int is_read)
{
	fd_set fds, *fdr = 0, *fdw = 0;
	struct timeval tv;
	int ret;

	if (fd < 0 || fd >= FD_SETSIZE) { /* FD_SET() is undefined outside [0, FD_SETSIZE) */
		errno = EBADF;
		perror("select");
		return -1;
	}
	tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
	FD_ZERO(&fds);
	FD_SET(fd, &fds);
	if (is_read) fdr = &fds;
	else fdw = &fds;
	ret = select(fd+1, fdr, fdw, 0, &tv);
	if (ret == -1) perror("select");
	return ret;
}
/*
 * Resolve host:port and open a connected TCP socket.
 * Every address returned by getaddrinfo() is tried in order until one
 * connects. Returns the connected descriptor, or -1 after printing a
 * diagnostic to stderr. No descriptor is left open on failure.
 */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc, saved_errno = 0;
	const char *failed = 0; /* name of the call that failed most recently */
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0, *ai;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	rc = getaddrinfo(host, port, &hints, &res);
	if (rc != 0) { /* getaddrinfo() reports errors through its return value, not errno */
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1;
	}
	for (ai = res; ai != 0; ai = ai->ai_next) {
		fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
		if (fd == -1) {
			failed = "socket";
			saved_errno = errno;
			continue;
		}
		if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) failed = "setsockopt";
		else if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) failed = "setsockopt";
		else if (connect(fd, ai->ai_addr, ai->ai_addrlen) != 0) failed = "connect";
		else break; /* connected */
		saved_errno = errno; /* close() below may overwrite errno */
		close(fd);
		fd = -1;
	}
	freeaddrinfo(res);
	if (fd == -1) {
		errno = saved_errno;
		perror(failed? failed : "connect");
	}
	return fd;
}
/*
 * Write exactly `len` bytes from `buf` to `fd`, retrying after short writes
 * and after EAGAIN / EWOULDBLOCK / EINTR.
 * Returns 0 when everything was written, -1 on any other write() error.
 */
static int write_bytes(int fd, const char *buf, size_t len)
{
	ssize_t bytes;
	do {
		bytes = write(fd, buf, len);
		if (bytes >= 0) {
			buf += bytes; /* continue after the part that was already written */
			len -= bytes;
		} else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
			return -1;
		}
	} while (len > 0);
	return 0;
}
/*
 * Open an "http://" URL with a plain HTTP/1.0 GET request, honouring the
 * http_proxy environment variable.
 * Returns 0 when `fn` does not start with "http://", -1 on any failure
 * (allocation, connect, send, truncated header, status other than 200), and
 * otherwise a descriptor positioned just after the response header.
 */
static int http_open(const char *fn)
{
	char *p, *proxy, *q, *http_host = 0, *host = 0, *port = 0, *path = 0, *buf = 0;
	int fd = -1, ret, l;
	ssize_t bytes = 0, bufsz = 0x10000;

	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
	if (strstr(fn, "http://") != fn) return 0;
	// set ->http_host
	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
	l = p - fn - 7;
	http_host = calloc(l + 1, 1);
	if (http_host == 0) goto out;
	memcpy(http_host, fn + 7, l); /* calloc() already supplied the terminating NUL */
	for (q = http_host; *q && *q != ':'; ++q);
	if (*q == ':') *q++ = 0;
	// get http_proxy
	proxy = getenv("http_proxy");
	// set host, port and path
	if (proxy == 0) {
		host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
		port = strdup(*q? q : "80");
		path = strdup(*p? p : "/");
	} else {
		host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
		if (host == 0) goto out;
		for (q = host; *q && *q != ':'; ++q);
		if (*q == ':') *q++ = 0;
		port = strdup(*q? q : "80");
		path = strdup(fn);
	}
	if (host == 0 || port == 0 || path == 0) goto out;

	/* connect; adapted from khttp_connect() in knetfile.c */
	fd = socket_connect(host, port);
	if (fd == -1) goto out;
	buf = calloc(bufsz, 1);
	if (buf == 0) goto fail;
	l = snprintf(buf, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
	if (l < 0 || l >= bufsz) goto fail; /* the request did not fit into the buffer */
	if (write_bytes(fd, buf, l) != 0) goto fail;
	l = 0;
retry:
	/* read the header one byte at a time; leave room for the terminating NUL */
	while (l < bufsz - 1 && (bytes = read(fd, buf + l, 1)) > 0) {
		if (buf[l] == '\n' && l >= 3)
			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
		++l;
	}
	if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry;
	buf[l] = 0;
	if (bytes < 0 || l < 14) goto fail; // prematured header
	ret = strtol(buf + 8, &p, 0); // HTTP return code
	if (ret != 200) goto fail;
	goto out;
fail:
	close(fd);
	fd = -1;
out:
	free(buf); free(http_host); free(host); free(port); free(path);
	return fd;
}
/*
 * State of one FTP control connection:
 *   max_response  allocated size of `response` in bytes
 *   ctrl_fd       descriptor of the control connection
 *   response      buffer holding the most recent reply read from the server
 */
typedef struct {
int max_response, ctrl_fd;
char *response;
} ftpaux_t;
/*
 * Read one reply from the FTP control connection into aux->response.
 * Lines are consumed until one starts with three digits that are not
 * followed by '-'; that line is kept, with its last two characters cut off.
 * Returns the numeric reply code, 0 when the socket did not become readable,
 * or -1 when no complete reply could be read or memory ran out.
 */
static int kftp_get_response(ftpaux_t *aux)
{
	unsigned char c;
	int n = 0;
	char *p;
	ssize_t got;

	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
	for (;;) { // FIXME: this is *VERY BAD* for unbuffered I/O
		got = read(aux->ctrl_fd, &c, 1);
		if (got < 0 && errno == EINTR) continue;
		if (got <= 0) break; /* end of stream or hard error: stop instead of using a stale byte */
		if (n >= aux->max_response) {
			int new_max = aux->max_response? aux->max_response<<1 : 256;
			char *grown = realloc(aux->response, new_max);
			if (grown == 0) return -1; /* the old buffer is still owned by aux */
			aux->response = grown;
			aux->max_response = new_max;
		}
		aux->response[n++] = c;
		if (c == '\n') {
			if (n >= 4 && isdigit((unsigned char)aux->response[0]) && isdigit((unsigned char)aux->response[1])
				&& isdigit((unsigned char)aux->response[2]) && aux->response[3] != '-') break;
			n = 0; /* continuation line: discard it and read the next one */
		}
	}
	if (n < 2) return -1;
	aux->response[n-2] = 0;
	return strtol(aux->response, &p, 0);
}
/*
 * Send `cmd` on the control connection. With is_get set, the reply is then
 * read and its code returned; otherwise 0 is returned after a successful send.
 * Returns -1 when the socket is not writable or the write fails.
 */
static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
{
	int ready = socket_wait(aux->ctrl_fd, 0);
	if (ready <= 0) return -1; // socket is not ready for writing
	if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1;
	if (!is_get) return 0;
	return kftp_get_response(aux);
}
/*
 * Open an "ftp://host/path" URL: anonymous login on port 21, binary type,
 * passive mode, then RETR of the path.
 * Returns 0 when `fn` does not start with "ftp://", -1 on any failure
 * (including a URL without a path), and otherwise the descriptor of the data
 * connection. The control connection is always closed before returning.
 */
static int ftp_open(const char *fn)
{
	char *p, *host = 0, *port = 0, *retr = 0;
	char host2[80], port2[10];
	int v[6], l, fd = -1, ret, pasv_port;
	ftpaux_t aux;

	/* parse URL */
	if (strstr(fn, "ftp://") != fn) return 0;
	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
	if (*p != '/') return -1; /* no path: report failure rather than descriptor 0 */
	l = p - fn - 6;
	memset(&aux, 0, sizeof(ftpaux_t));
	aux.ctrl_fd = -1;
	port = strdup("21");
	host = calloc(l + 1, 1);
	retr = calloc(strlen(p) + 8, 1);
	if (port == 0 || host == 0 || retr == 0) goto ftp_open_end;
	memcpy(host, fn + 6, l); /* calloc() already supplied the terminating NUL */
	sprintf(retr, "RETR %s\r\n", p);
	/* connect to ctrl */
	aux.ctrl_fd = socket_connect(host, port);
	if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
	/* connect to the data stream */
	kftp_get_response(&aux);
	kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
	kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
	kftp_send_cmd(&aux, "TYPE I\r\n", 1);
	ret = kftp_send_cmd(&aux, "PASV\r\n", 1);
	/* only a positive code guarantees a NUL-terminated reply to parse */
	if (ret <= 0 || aux.response == 0) goto ftp_open_end;
	for (p = aux.response; *p && *p != '('; ++p);
	if (*p != '(') goto ftp_open_end;
	++p;
	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) goto ftp_open_end;
	for (l = 0; l < 6; ++l) /* each field is one byte of the address or port */
		if (v[l] < 0 || v[l] > 255) goto ftp_open_end;
	pasv_port = (v[4]<<8) + v[5];
	kftp_send_cmd(&aux, retr, 0);
	snprintf(host2, sizeof(host2), "%d.%d.%d.%d", v[0], v[1], v[2], v[3]);
	snprintf(port2, sizeof(port2), "%d", pasv_port);
	fd = socket_connect(host2, port2);
	if (fd == -1) goto ftp_open_end;
	ret = kftp_get_response(&aux);
	if (ret != 150) {
		close(fd);
		fd = -1;
	}
ftp_open_end:
	if (aux.ctrl_fd != -1) close(aux.ctrl_fd);
	free(host); free(port); free(retr); free(aux.response);
	return fd;
}
#endif /* !defined(_KO_NO_NET) */
/*
 * Split a command line on whitespace into a NULL-terminated argv array.
 * All words live in one buffer that argv[0] points to, so the caller releases
 * the result with free(argv[0]); free(argv);.
 * Returns 0 when `cmd` is empty or all whitespace, or when memory runs out.
 */
static char **cmd2argv(const char *cmd)
{
	int i, beg, end, argc;
	char **argv, *str;

	/* trim trailing, then leading whitespace */
	end = strlen(cmd);
	for (i = end - 1; i >= 0; --i)
		if (!isspace((unsigned char)cmd[i])) break;
	end = i + 1;
	for (beg = 0; beg < end; ++beg)
		if (!isspace((unsigned char)cmd[beg])) break;
	if (beg == end) return 0;
	/* count word boundaries: one fewer than the number of words */
	for (i = beg + 1, argc = 0; i < end; ++i)
		if (isspace((unsigned char)cmd[i]) && !isspace((unsigned char)cmd[i-1]))
			++argc;
	argv = (char**)calloc(argc + 2, sizeof(void*));
	if (argv == 0) return 0;
	str = (char*)calloc(end - beg + 1, 1);
	if (str == 0) {
		free(argv);
		return 0;
	}
	argv[0] = str;
	memcpy(str, cmd + beg, end - beg);
	/* turn separators into NULs and record where each later word starts */
	for (i = argc = 1; i < end - beg; ++i) {
		if (isspace((unsigned char)str[i])) str[i] = 0;
		else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
	}
	return argv;
}
/* Kind of source behind a kopen() handle; stored in koaux_t.type. */
#define KO_STDIN 1
#define KO_FILE 2
#define KO_PIPE 3
#define KO_HTTP 4
#define KO_FTP 5
/*
 * Handle returned by kopen():
 *   type  one of the KO_* constants above
 *   fd    descriptor the data is read from
 *   pid   child process id; set only for KO_PIPE
 */
typedef struct {
int type, fd;
pid_t pid;
} koaux_t;
/*
 * Open `fn` for reading and store the descriptor in *_fd.
 * `fn` may be an "http://" or "ftp://" URL, "-" for standard input, a command
 * introduced by '<' whose standard output is read through a pipe, or a plain
 * file name.
 * Returns an opaque handle to pass to kclose(), or NULL on failure, in which
 * case *_fd is -1.
 */
void *kopen(const char *fn, int *_fd)
{
	koaux_t *aux;

	*_fd = -1;
	aux = calloc(1, sizeof(koaux_t));
	if (aux == 0) return 0;
	aux->fd = -1;
	if (strstr(fn, "http://") == fn) {
		aux->type = KO_HTTP;
		aux->fd = http_open(fn);
	} else if (strstr(fn, "ftp://") == fn) {
		aux->type = KO_FTP;
		aux->fd = ftp_open(fn);
	} else if (strcmp(fn, "-") == 0) {
		aux->type = KO_STDIN;
		aux->fd = STDIN_FILENO;
	} else {
		const char *p, *q;
		for (p = fn; *p; ++p)
			if (!isspace((unsigned char)*p)) break;
		if (*p == '<') { // pipe open
			int need_shell, pfd[2];
			char **argv = 0;
			pid_t pid;
			// a simple check to see if we need to invoke a shell; not always working
			for (q = p + 1; *q; ++q)
				if (ispunct((unsigned char)*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
					break;
			need_shell = (*q != 0);
			if (!need_shell) {
				/* tokenize before vfork(): the child shares our memory and must not allocate */
				argv = cmd2argv(p + 1);
				if (argv == 0) goto fail; /* empty command or out of memory */
			}
			if (pipe(pfd) != 0) {
				if (argv) { free(argv[0]); free(argv); }
				goto fail;
			}
			pid = vfork();
			if (pid == -1) { /* vfork() error */
				close(pfd[0]); close(pfd[1]);
				if (argv) { free(argv[0]); free(argv); }
				goto fail;
			}
			if (pid == 0) { /* the child process */
				close(pfd[0]);
				dup2(pfd[1], STDOUT_FILENO);
				close(pfd[1]);
				if (!need_shell) execvp(argv[0], argv);
				else execl("/bin/sh", "sh", "-c", p + 1, NULL);
				_exit(1); /* exec failed; _exit() leaves the parent's stdio buffers alone */
			}
			/* parent process */
			close(pfd[1]);
			if (argv) { free(argv[0]); free(argv); }
			aux->type = KO_PIPE;
			aux->fd = pfd[0];
			aux->pid = pid;
		} else {
#ifdef _WIN32
			aux->fd = open(fn, O_RDONLY | O_BINARY);
#else
			aux->fd = open(fn, O_RDONLY);
#endif
			aux->type = KO_FILE;
		}
	}
	if (aux->fd < 0) goto fail; /* nothing was opened: do not hand out a handle */
	*_fd = aux->fd;
	return aux;
fail:
	free(aux);
	return 0;
}
/*
 * Release a handle returned by kopen(). The descriptor itself is not closed
 * here. For a pipe, a child that is still running is sent SIGTERM and then
 * reaped. A NULL handle is accepted. Always returns 0.
 */
int kclose(void *a)
{
	koaux_t *aux = (koaux_t*)a;
	if (aux == 0) return 0;
	if (aux->type == KO_PIPE) {
		int status;
		pid_t pid;
		pid = waitpid(aux->pid, &status, WNOHANG);
		if (pid == 0) { /* the child is still running; on error there is nothing safe to signal */
			kill(aux->pid, SIGTERM);
			while (waitpid(aux->pid, &status, 0) == -1 && errno == EINTR); /* reap it */
		}
	}
	free(aux);
	return 0;
}
#ifdef _KO_MAIN
#define BUF_SIZE 0x10000
/* Test driver: copy the stream named by argv[1] to standard output. */
int main(int argc, char *argv[])
{
	unsigned char buf[BUF_SIZE];
	void *handle;
	FILE *in;
	int fd, got;

	if (argc == 1) {
		fprintf(stderr, "Usage: kopen <file>\n");
		return 1;
	}
	handle = kopen(argv[1], &fd);
	in = fdopen(fd, "r");
	if (in == 0) {
		fprintf(stderr, "ERROR: fail to open the input\n");
		return 1;
	}
	for (;;) {
		got = fread(buf, 1, BUF_SIZE, in);
		if (got != 0)
			fwrite(buf, 1, got, stdout);
		if (got != BUF_SIZE) break; /* a short read ends the copy */
	}
	fclose(in);
	kclose(handle);
	return 0;
}
#endif

239
kseq.h 100644
View File

@ -0,0 +1,239 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Last Modified: 05MAR2012 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Delimiter classes understood by ks_getuntil(); larger values are taken as a literal delimiter byte. */
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX 2
/*
 * Declares kstream_t, a buffered reader over a handle of type type_t:
 *   buf         read buffer
 *   begin, end  unread data is buf[begin..end)
 *   is_eof      set once the underlying read has reported end of input
 *   f           the underlying handle
 */
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
unsigned char *buf; \
int begin, end, is_eof; \
type_t f; \
} kstream_t;
/* True when the source is exhausted and the buffer holds no unread data. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Reset the buffer state; the underlying handle itself is not repositioned here. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
/*
 * Defines ks_init() and ks_destroy().
 * ks_init() allocates a stream with a __bufsize-byte buffer around `f` and
 * returns NULL when either allocation fails. ks_destroy() frees the buffer
 * and the stream (NULL is accepted); the handle `f` is not closed.
 */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == 0) return 0; \
		ks->f = f; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == 0) { free(ks); return 0; } \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
/*
 * Defines ks_getc(): return the next byte of the stream as a non-negative
 * int, or -1 at end of input. A zero or negative result from __read is
 * treated as end of input, so a failed read never exposes stale buffer bytes.
 */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; return -1; } /* EOF or read error */ \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/*
 * Defines ks_getuntil2() and its wrapper ks_getuntil().
 * ks_getuntil2() copies bytes from the stream into `str` up to, but not
 * including, the next delimiter. `delimiter` is KS_SEP_SPACE, KS_SEP_TAB,
 * KS_SEP_LINE, or a literal byte value above KS_SEP_MAX. With `append` set,
 * data is added after the current contents of `str`; otherwise `str` is
 * overwritten. When `dret` is non-NULL it receives the delimiter byte that
 * ended the field, or 0 if input ended first. For KS_SEP_LINE a trailing
 * '\r' is removed.
 * Returns the resulting length of `str`, or -1 when nothing could be read
 * because the stream is at end of input. A zero or negative result from
 * __read is treated as end of input.
 */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, __bufsize); \
					if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; break; } /* EOF or read error */ \
				} else break; \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
/* Instantiates the whole buffered-stream layer: the kstream_t type, ks_init/ks_destroy, ks_getc and ks_getuntil. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
__KS_BASIC(type_t, __bufsize) \
__KS_GETC(__read, __bufsize) \
__KS_GETUNTIL(__read, __bufsize)
/* Reset the parser state of a kseq_t and the buffer state of its stream; the underlying handle is not repositioned here. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
/*
 * Defines kseq_init() and kseq_destroy().
 * kseq_init() allocates a zeroed record reader around `fd` and returns NULL
 * when allocation of the reader or of its stream fails. kseq_destroy() frees
 * the four string buffers, the stream and the reader (NULL is accepted).
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == 0) return 0; \
		s->f = ks_init(fd); \
		if (s->f == 0) { free(s); return 0; } \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
/*
 * Defines kseq_read(): read the next FASTA or FASTQ record into `seq`.
 * A record starts at a '>' or '@' header character; the name is the first
 * whitespace-delimited token and the rest of the header line is the comment.
 * Sequence lines are concatenated until the next '>', '@' or '+'. After a
 * '+' line, quality lines are read until the quality string is at least as
 * long as the sequence.
 * Return value:
 *   >=0 length of the sequence (normal)
 *   -1 end-of-file
 *   -2 truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
SCOPE int kseq_read(kseq_t *seq) \
{ \
int c; \
kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
if (c == -1) return -1; /* end of file */ \
seq->last_char = c; \
} /* else: the first header char has been read in the previous call */ \
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
seq->seq.m = 256; \
seq->seq.s = (char*)malloc(seq->seq.m); \
} \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
if (c == '\n') continue; /* skip empty lines */ \
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
} \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
} \
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
if (c != '+') return seq->seq.l; /* FASTA */ \
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
seq->qual.m = seq->seq.m; \
seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
} \
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
if (c == -1) return -2; /* error: no quality string */ \
while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
seq->last_char = 0; /* we have not come to the next header line */ \
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
return seq->seq.l; \
}
/*
 * Declares kseq_t, one parsed record plus reader state:
 *   name, comment, seq, qual  the fields of the most recently read record
 *   last_char                 header character already consumed for the next record, or 0
 *   f                         the underlying buffered stream
 */
#define __KSEQ_TYPE(type_t) \
typedef struct { \
kstring_t name, comment, seq, qual; \
int last_char; \
kstream_t *f; \
} kseq_t;
/* Instantiates the stream layer (16384-byte buffer) and the record reader with storage class SCOPE. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
KSTREAM_INIT(type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(SCOPE, type_t) \
__KSEQ_READ(SCOPE)
/* Same as KSEQ_INIT2 with static functions. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
/* Header-side declaration: the types and prototypes only, no function bodies. */
#define KSEQ_DECLARE(type_t) \
__KS_TYPE(type_t) \
__KSEQ_TYPE(type_t) \
extern kseq_t *kseq_init(type_t fd); \
void kseq_destroy(kseq_t *ks); \
int kseq_read(kseq_t *seq);
#endif

273
ksort.h 100644
View File

@ -0,0 +1,273 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
2008-11-16 (0.1.4):
* Fixed a bug in introsort() that happens in rare cases.
2008-11-05 (0.1.3):
* Fixed a bug in introsort() for complex comparisons.
* Fixed a bug in mergesort(). The previous version is not stable.
2008-09-15 (0.1.2):
* Accelerated introsort. On my Mac (not on another Linux machine),
my implementation is as fast as std::sort on random input.
* Added combsort and in introsort, switch to combsort if the
recursion is too deep.
2008-09-13 (0.1.1):
* Added k-small algorithm
2008-09-05 (0.1.0):
* Initial version
*/
#ifndef AC_KSORT_H
#define AC_KSORT_H
#include <stdlib.h>
#include <string.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* One pending sub-range on the explicit stack used by ks_introsort_*():
 * left/right point at the first and last element of the range (stored as
 * void* so the struct is independent of the element type) and depth is the
 * remaining depth budget saved for that range. */
typedef struct {
void *left, *right;
int depth;
} ks_isort_stack_t;
/* Swap two lvalues of type type_t through a temporary. Each argument is
 * evaluated twice. The `register` storage class was dropped: it has no effect
 * on modern compilers and is rejected when the header is compiled as C++17. */
#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; }
/* KSORT_INIT(name, type_t, __sort_lt) instantiates sorting routines for arrays
 * of type_t ordered by the strict "less than" predicate __sort_lt(a, b):
 *   ks_mergesort_##name  bottom-up merge sort; on ties the element from the
 *                        left run is taken first. temp may be NULL, in which
 *                        case a scratch array of n elements is malloc'ed and
 *                        freed internally.
 *   ks_heapadjust_##name sift l[i] down in a max-heap of n elements.
 *   ks_heapmake_##name   turn l[0..lsize) into a max-heap.
 *   ks_heapsort_##name   sort an array that is ALREADY a heap (it does not
 *                        call ks_heapmake_##name itself).
 *   ks_combsort_##name   comb sort, finished by insertion sort when needed.
 *   ks_introsort_##name  quicksort with an explicit stack; switches to comb
 *                        sort when the depth budget runs out and finishes
 *                        with one insertion-sort pass.
 *   ks_ksmall_##name     returns the kk-th smallest element (0 <= kk < n);
 *                        arr is partially reordered.
 */
#define KSORT_INIT(name, type_t, __sort_lt) \
void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
{ \
type_t *a2[2], *a, *b; \
int curr, shift; \
\
a2[0] = array; \
a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
a = a2[curr]; b = a2[1-curr]; /* merge from a into b, then swap roles */ \
if (shift == 0) { \
type_t *p = b, *i, *eb = a + n; \
for (i = a; i < eb; i += 2) { \
if (i == eb - 1) *p++ = *i; \
else { \
if (__sort_lt(*(i+1), *i)) { \
*p++ = *(i+1); *p++ = *i; \
} else { \
*p++ = *i; *p++ = *(i+1); \
} \
} \
} \
} else { \
size_t i, step = 1ul<<shift; \
for (i = 0; i < n; i += step<<1) { \
type_t *p, *j, *k, *ea, *eb; \
if (n < i + step) { \
ea = a + n; eb = a; /* no right run: eb = a makes the k-loops below no-ops */ \
} else { \
ea = a + i + step; \
eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
} \
j = a + i; k = a + i + step; p = b + i; \
while (j < ea && k < eb) { \
if (__sort_lt(*k, *j)) *p++ = *k++; \
else *p++ = *j++; \
} \
while (j < ea) *p++ = *j++; \
while (k < eb) *p++ = *k++; \
} \
} \
curr = 1 - curr; \
} \
if (curr == 1) { /* result ended up in the scratch buffer: copy it back */ \
type_t *p = a2[0], *i = a2[1], *eb = array + n; \
for (; p < eb; ++i) *p++ = *i; \
} \
if (temp == 0) free(a2[1]); \
} \
void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
{ \
size_t k = i; \
type_t tmp = l[i]; \
while ((k = (k << 1) + 1) < n) { \
if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; /* pick the larger child */ \
if (__sort_lt(l[k], tmp)) break; \
l[i] = l[k]; i = k; \
} \
l[i] = tmp; \
} \
void ks_heapmake_##name(size_t lsize, type_t l[]) \
{ \
size_t i; \
for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
ks_heapadjust_##name(i, lsize, l); \
} \
void ks_heapsort_##name(size_t lsize, type_t l[]) \
{ \
size_t i; \
if (lsize < 2) return; /* nothing to do; also keeps lsize - 1 from wrapping around when lsize == 0 */ \
for (i = lsize - 1; i > 0; --i) { \
type_t tmp; \
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
} \
} \
static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
{ \
type_t *i, *j, swap_tmp; \
for (i = s + 1; i < t; ++i) \
for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
} \
} \
void ks_combsort_##name(size_t n, type_t a[]) \
{ \
const double shrink_factor = 1.2473309501039786540366528676643; \
int do_swap; \
size_t gap = n; \
type_t tmp, *i, *j; \
do { \
if (gap > 2) { \
gap = (size_t)(gap / shrink_factor); \
if (gap == 9 || gap == 10) gap = 11; \
} \
do_swap = 0; \
for (i = a; i < a + n - gap; ++i) { \
j = i + gap; \
if (__sort_lt(*j, *i)) { \
tmp = *i; *i = *j; *j = tmp; \
do_swap = 1; \
} \
} \
} while (do_swap || gap > 2); \
if (gap != 1) __ks_insertsort_##name(a, a + n); \
} \
void ks_introsort_##name(size_t n, type_t a[]) \
{ \
int d; \
ks_isort_stack_t *top, *stack; \
type_t rp, swap_tmp; \
type_t *s, *t, *i, *j, *k; \
\
if (n < 1) return; \
else if (n == 2) { \
if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
return; \
} \
for (d = 2; 1ul<<d < n; ++d); \
stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
top = stack; s = a; t = a + (n-1); d <<= 1; \
while (1) { \
if (s < t) { \
if (--d == 0) { /* depth budget exhausted: finish this range with comb sort */ \
ks_combsort_##name(t - s + 1, s); \
t = s; \
continue; \
} \
i = s; j = t; k = i + ((j-i)>>1) + 1; \
if (__sort_lt(*k, *i)) { \
if (__sort_lt(*k, *j)) k = j; \
} else k = __sort_lt(*j, *i)? i : j; \
rp = *k; \
if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
for (;;) { \
do ++i; while (__sort_lt(*i, rp)); \
do --j; while (i <= j && __sort_lt(rp, *j)); \
if (j <= i) break; \
swap_tmp = *i; *i = *j; *j = swap_tmp; \
} \
swap_tmp = *i; *i = *t; *t = swap_tmp; \
if (i-s > t-i) { /* push the larger side, continue with the smaller; ranges of <= 17 elements are left to the final insertion sort */ \
if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
s = t-i > 16? i+1 : t; \
} else { \
if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
t = i-s > 16? i-1 : s; \
} \
} else { \
if (top == stack) { \
free(stack); \
__ks_insertsort_##name(a, a+n); \
return; \
} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
} \
} \
} \
/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
/* 0 <= kk < n */ \
type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
{ \
type_t *low, *high, *k, *ll, *hh, *mid; \
low = arr; high = arr + n - 1; k = arr + kk; \
for (;;) { \
if (high <= low) return *k; \
if (high == low + 1) { \
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
return *k; \
} \
mid = low + (high - low) / 2; \
if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
KSORT_SWAP(type_t, *mid, *(low+1)); \
ll = low + 1; hh = high; \
for (;;) { \
do ++ll; while (__sort_lt(*ll, *low)); \
do --hh; while (__sort_lt(*low, *hh)); \
if (hh < ll) break; \
KSORT_SWAP(type_t, *ll, *hh); \
} \
KSORT_SWAP(type_t, *low, *hh); \
if (hh <= k) low = ll; \
if (hh >= k) high = hh - 1; \
} \
}
#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
#define ks_lt_generic(a, b) ((a) < (b))
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
typedef const char *ksstr_t;
#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
#endif

37
kstring.c 100644
View File

@ -0,0 +1,37 @@
#include <stdarg.h>
#include <stdio.h>
#include "kstring.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/**
 * Append printf-style formatted text to the end of a kstring.
 *
 * The text is first formatted into whatever room is left in s; if it does not
 * fit, the buffer is grown and the text is formatted a second time using a
 * copy of the argument list.
 *
 * @param s    destination string; s->l is advanced by the number of bytes written
 * @param fmt  printf-style format string
 * @param ap   arguments for fmt
 * @return number of bytes appended, or the negative value returned by
 *         vsnprintf() on a formatting error, in which case s->l is unchanged
 */
int bwa_kvsprintf(kstring_t *s, const char *fmt, va_list ap)
{
	va_list ap2;
	int l;
	va_copy(ap2, ap); // ap is consumed by the first attempt; keep a copy for the retry
	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
	if (l < 0) { // formatting error: adding a negative value to the unsigned s->l would corrupt the length
		va_end(ap2);
		return l;
	}
	if ((size_t)l + 1 > s->m - s->l) { // output (plus the terminating NUL) did not fit
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap2);
	}
	va_end(ap2);
	s->l += l;
	return l;
}
#ifdef KSTRING_MAIN
#include <stdio.h>
/* Minimal self-test: format into an empty kstring and print the result. */
int main()
{
	kstring_t *s;
	s = (kstring_t*)calloc(1, sizeof(kstring_t));
	ksprintf(s, "abcdefg: %d", 100);
	printf("%s\n", s->s);
	free(s->s); // the character buffer is a separate allocation from the struct
	free(s);
	return 0;
}
#endif

131
kstring.h 100644
View File

@ -0,0 +1,131 @@
#ifndef KSTRING_H
#define KSTRING_H
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Round x up, in place, to the next power of two by smearing the highest set
 * bit rightwards. The shifts stop at 16, so only a 32-bit span is smeared.
 * x must be a modifiable lvalue and is evaluated several times. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable string buffer: l is the number of bytes currently stored, m the
 * allocated capacity of s. A zero-filled struct (s == NULL) is an empty string. */
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
#endif
/* Ensure the buffer of s can hold at least `size` bytes; the new capacity is
 * rounded with kroundup32(). The buffer never shrinks and s->l is untouched. */
static inline void ks_resize(kstring_t *s, size_t size)
{
	if (size <= s->m)
		return; /* already large enough */
	s->m = size;
	kroundup32(s->m);
	s->s = (char*)realloc(s->s, s->m);
}
/* Append the first l bytes of p to s and keep s NUL-terminated.
 * Returns l. */
static inline int kputsn(const char *p, int l, kstring_t *s)
{
	const size_t with_nul = s->l + l + 1; /* bytes needed including the terminator */
	if (with_nul >= s->m) {
		s->m = with_nul + 1;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	memcpy(s->s + s->l, p, l);
	s->l += l;
	s->s[s->l] = 0;
	return l;
}
/* Append the NUL-terminated string p to s; returns the value of kputsn(). */
static inline int kputs(const char *p, kstring_t *s)
{
	int len = strlen(p); /* kputsn() takes the length as int */
	return kputsn(p, len, s);
}
/* Append the single character c to s, keeping s NUL-terminated.
 * Returns c. */
static inline int kputc(int c, kstring_t *s)
{
	const size_t new_len = s->l + 1;
	if (new_len >= s->m) { /* need room for the character and the terminator */
		s->m = new_len + 1;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	s->s[s->l] = c;
	s->l = new_len;
	s->s[new_len] = 0;
	return c;
}
/* Append the decimal representation of the signed integer c to s.
 * Returns 0, except for c == 0 where the return value of kputc('0', s)
 * (the character '0') is passed through, as before. */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16]; /* at most 10 digits plus a sign for a 32-bit int */
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	/* Take the magnitude in unsigned arithmetic: negating INT_MIN as a signed
	 * int overflows, which is undefined behaviour. */
	x = c < 0? 0u - (unsigned)c : (unsigned)c;
	for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* digits, least significant first */
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* emit in reverse */
	s->s[s->l] = 0;
	return 0;
}
/* Append the decimal representation of the unsigned integer c to s.
 * Returns 0, except for c == 0 where the return value of kputc('0', s)
 * is passed through. */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char digits[16];
	int n = 0;
	unsigned rest = c;
	if (c == 0) return kputc('0', s);
	while (rest > 0) { /* collect digits, least significant first */
		digits[n++] = rest%10 + '0';
		rest /= 10;
	}
	if (s->l + n + 1 >= s->m) {
		s->m = s->l + n + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	while (n > 0) s->s[s->l++] = digits[--n]; /* emit most significant first */
	s->s[s->l] = 0;
	return 0;
}
/* Append the decimal representation of the signed long c to s.
 * Returns 0, except for c == 0 where the return value of kputc('0', s)
 * is passed through, as before. */
static inline int kputl(long c, kstring_t *s)
{
	char buf[32]; /* enough for 19 digits plus a sign of a 64-bit long */
	long l, i;
	unsigned long x;
	if (c == 0) return kputc('0', s);
	/* Take the magnitude in unsigned arithmetic: negating LONG_MIN as a signed
	 * long overflows, which is undefined behaviour. */
	x = c < 0? 0ul - (unsigned long)c : (unsigned long)c;
	for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* digits, least significant first */
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* emit in reverse */
	s->s[s->l] = 0;
	return 0;
}
int bwa_kvsprintf(kstring_t *s, const char *fmt, va_list ap);
/* printf-style append to a kstring: packs the variadic arguments into a
 * va_list and forwards to bwa_kvsprintf(), returning its result. */
static inline int ksprintf(kstring_t *s, const char *fmt, ...)
{
	va_list args;
	int appended;

	va_start(args, fmt);
	appended = bwa_kvsprintf(s, fmt, args);
	va_end(args);

	return appended;
}
/* va_list flavour of ksprintf(); a plain alias for bwa_kvsprintf(). */
static inline int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
{
	const int appended = bwa_kvsprintf(s, fmt, ap);
	return appended;
}
#endif

749
ksw.c 100644
View File

@ -0,0 +1,749 @@
/* The MIT License
Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#if defined __SSE2__
#include <emmintrin.h>
#elif defined __ARM_NEON
#include "neon_sse.h"
#else
#include "scalar_sse.h"
#endif
#include "ksw.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Branch-prediction hints: with GCC-compatible compilers they map to
 * __builtin_expect; elsewhere they expand to the bare expression. */
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
/* Default result used to initialise r in ksw_u8()/ksw_i16(): first member 0,
 * the remaining six -1. NOTE(review): kswr_t is declared in ksw.h (not visible
 * here); judging by the members used in this file the order is presumably
 * score, te, qe, score2, te2, tb, qb -- confirm against the header. */
const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
/* Query profile built by ksw_qinit(). The struct header and all vectors live in
 * one malloc'ed block, so a single free() releases everything. */
struct _kswq_t {
int qlen, slen; // query length; number of __m128i segments per row
uint8_t shift, mdiff, max, size; // 256 - min score; max - min score; max score; bytes per score (1 or 2)
__m128i *qp, *H0, *H1, *E, *Hmax; // query profile (slen * m vectors) followed by four work rows of slen vectors each
};
/**
* Initialize the query data structure
*
* The profile and the work rows (H0, H1, E, Hmax) are carved out of a single
* malloc'ed block whose vector area is aligned to 16 bytes.
*
* @param size Number of bytes used to store a score; valid values are 1 or 2
*             (any value greater than 1 is treated as 2)
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet
* @param mat Scoring matrix in a one-dimension array
*
* @return Query data structure; release it with free()
*/
kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
kswq_t *q;
int slen, a, tmp, p;
size = size > 1? 2 : 1;
p = 8 * (3 - size); // # values per __m128i
slen = (qlen + p - 1) / p; // segmented length
q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
q->Hmax = q->E + slen;
q->slen = slen; q->qlen = qlen; q->size = size;
// compute shift
tmp = m * m;
for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
}
q->max = q->mdiff;
q->shift = 256 - q->shift; // NB: q->shift is uint8_t
q->mdiff += q->shift; // this is the difference between the min and max scores
// An example: p=8, qlen=19, slen=3 and segmentation:
// {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
if (size == 1) {
// 8-bit profile: scores are biased by q->shift so that they are stored as non-negative bytes
int8_t *t = (int8_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
}
} else {
// 16-bit profile: signed scores are stored without a bias
int16_t *t = (int16_t*)q->qp;
for (a = 0; a < m; ++a) {
int i, k, nlen = slen * p;
const int8_t *ma = mat + a * m;
for (i = 0; i < slen; ++i)
for (k = i; k < nlen; k += slen) // p iterations
*t++ = (k >= qlen? 0 : ma[query[k]]);
}
}
return q;
}
#if defined __ARM_NEON
// NEON replacement for the byte-wise left shift of a 128-bit vector, built from vextq_u8.
// This macro implicitly uses each function's `zero` local variable
#define _mm_slli_si128(a, n) (vextq_u8(zero, (a), 16 - (n)))
#endif
/**
* Striped local alignment using 16 unsigned 8-bit saturating scores per vector.
*
* @param q       query profile from ksw_qinit(); its work rows are overwritten
* @param tlen    length of the target sequence
* @param target  target sequence; each value selects a row of the profile
* @param _o_del,_e_del  deletion gap open / extension penalties
* @param _o_ins,_e_ins  insertion gap open / extension penalties
* @param xtra    flags plus a threshold in the low 16 bits: with KSW_XSUBO,
*                target rows scoring at least the threshold are recorded for
*                the second-best search; with KSW_XSTOP, the scan stops once
*                the best score reaches the threshold
*
* @return result with score and te always set; score is 255 when the 8-bit
*         range was exhausted, in which case qe/score2/te2 keep their defaults
*/
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b; // recorded row maxima, each packed as score<<32 | target position
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#if defined __SSE2__
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
} while (0)
// Given entries with arbitrary values, return whether they are all 0x00
#define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff)
#elif defined __ARM_NEON
#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
#define allzero_16(xx) (vmaxvq_u8((xx)) == 0)
#else
#define __max_16(ret, xx) (ret) = m128i_max_u8((xx))
#define allzero_16(xx) (m128i_allzero((xx)))
#endif
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; // 0x10000 can never be reached, i.e. the feature is off
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
oe_del = _mm_set1_epi8(_o_del + _e_del);
e_del = _mm_set1_epi8(_e_del);
oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
e_ins = _mm_set1_epi8(_e_ins);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
for (j = 0; LIKELY(j < slen); ++j) {
/* SW cells are computed in the following order:
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
h = _mm_adds_epu8(h, _mm_load_si128(S + j));
h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
e = _mm_load_si128(E + j); // e=E'(i,j)
h = _mm_max_epu8(h, e);
h = _mm_max_epu8(h, f); // h=H'(i,j)
max = _mm_max_epu8(max, h); // set max
_mm_store_si128(H1 + j, h); // save to H'(i,j)
// now compute E'(i+1,j)
e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
e = _mm_max_epu8(e, t); // e=E'(i+1,j)
_mm_store_si128(E + j, e); // save to E'(i+1,j)
// now compute F'(i,j+1)
f = _mm_subs_epu8(f, e_ins);
t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
f = _mm_max_epu8(f, t);
// get H'(i-1,j) and prepare for the next j
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
}
// NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
f = _mm_slli_si128(f, 1);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epu8(h, f); // h=H'(i,j)
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu8(h, oe_ins);
f = _mm_subs_epu8(f, e_ins);
if (UNLIKELY(allzero_16(_mm_subs_epu8(f, h)))) goto end_loop16; // f can no longer improve any cell
}
}
end_loop16:
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
__max_16(imax, max); // imax is the maximum number in max
if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax + q->shift >= 255 || gmax >= endsc) break; // 8-bit range exhausted, or requested score reached
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
r.score = gmax + q->shift < 255? gmax : 255;
r.te = te;
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, tmp, low, high, qlen = slen * 16;
uint8_t *t = (uint8_t*)Hmax;
for (i = 0; i < qlen; ++i, ++t) // i / 16 + i % 16 * slen maps a striped slot back to a query position
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
//printf("%d,%d\n", max, gmax);
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i; // recorded rows inside [low, high] are skipped in the second-best search
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return r;
}
/**
* Striped local alignment using 8 signed 16-bit scores per vector; the 16-bit
* counterpart of ksw_u8(), working on an unbiased profile.
*
* @param q       query profile from ksw_qinit(); its work rows are overwritten
* @param tlen    length of the target sequence
* @param target  target sequence; each value selects a row of the profile
* @param _o_del,_e_del  deletion gap open / extension penalties
* @param _o_ins,_e_ins  insertion gap open / extension penalties
* @param xtra    flags plus a threshold in the low 16 bits: with KSW_XSUBO,
*                target rows scoring at least the threshold are recorded for
*                the second-best search; with KSW_XSTOP, the scan stops once
*                the best score reaches the threshold
*
* @return result with score, te and qe set, plus score2/te2 when a
*         second-best row outside the window around te was recorded
*/
kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b; // recorded row maxima, each packed as score<<32 | target position
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
kswr_t r;
#if defined __SSE2__
#define __max_8(ret, xx) do { \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
(ret) = _mm_extract_epi16((xx), 0); \
} while (0)
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
#elif defined __ARM_NEON
#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
#else
#define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
#define allzero_0f_8(xx) (m128i_allzero((xx)))
#endif
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; // 0x10000 can never be reached, i.e. the feature is off
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
oe_del = _mm_set1_epi16(_o_del + _e_del);
e_del = _mm_set1_epi16(_e_del);
oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
e_ins = _mm_set1_epi16(_e_ins);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 2); // shift by one 16-bit lane
for (j = 0; LIKELY(j < slen); ++j) {
// same H/E/F recurrence as in ksw_u8(), on 16-bit lanes
h = _mm_adds_epi16(h, _mm_load_si128(S++));
e = _mm_load_si128(E + j);
h = _mm_max_epi16(h, e);
h = _mm_max_epi16(h, f);
max = _mm_max_epi16(max, h);
_mm_store_si128(H1 + j, h);
e = _mm_subs_epu16(e, e_del);
t = _mm_subs_epu16(h, oe_del);
e = _mm_max_epi16(e, t);
_mm_store_si128(E + j, e);
f = _mm_subs_epu16(f, e_ins);
t = _mm_subs_epu16(h, oe_ins);
f = _mm_max_epi16(f, t);
h = _mm_load_si128(H0 + j);
}
for (k = 0; LIKELY(k < 16); ++k) { // lazy-F loop
f = _mm_slli_si128(f, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epi16(h, f);
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu16(h, oe_ins);
f = _mm_subs_epu16(f, e_ins);
if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; // f can no longer improve any cell
}
}
end_loop8:
__max_8(imax, max); // imax is the maximum score of this target row
if (imax >= minsc) {
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // not adjacent to the last recorded row: append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
r.score = gmax; r.te = te;
{
int max = -1, tmp, low, high, qlen = slen * 8;
uint16_t *t = (uint16_t*)Hmax;
for (i = 0, r.qe = -1; i < qlen; ++i, ++t) // i / 8 + i % 8 * slen maps a striped slot back to a query position
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i; // recorded rows inside [low, high] are skipped in the second-best search
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return r;
}
/* Reverse the first l bytes of s in place; a length below 2 is a no-op. */
static inline void revseq(int l, uint8_t *s)
{
	int lo, hi;
	if (l < 2) return;
	for (lo = 0, hi = l - 1; lo < hi; ++lo, --hi) {
		const uint8_t tmp = s[lo];
		s[lo] = s[hi];
		s[hi] = tmp;
	}
}
/**
 * Local alignment driver: picks the 8- or 16-bit kernel, runs it forward and,
 * when KSW_XSTART is requested, runs it again on the reversed prefixes to
 * locate the alignment start.
 *
 * query and target are reversed in place for the second pass and restored
 * before returning. If qry is non-NULL, *qry caches the forward query profile
 * (it is created when *qry is NULL) and is left for the caller to free.
 *
 * @return forward result; tb/qb are filled in only when the reverse pass
 *         reproduces the forward score
 */
kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
{
	kswq_t *prof;
	kswr_t fwd, rev;
	kswr_t (*kernel)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
	int width;

	// reuse the cached profile when there is one, otherwise build it
	if (qry && *qry) prof = *qry;
	else prof = ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
	if (qry && *qry == 0) *qry = prof;

	width = prof->size;
	kernel = width == 2? ksw_i16 : ksw_u8;
	fwd = kernel(prof, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
	if (qry == 0) free(prof); // nobody to hand the profile to

	// the start position is only searched on request and for good enough hits
	if ((xtra&KSW_XSTART) == 0) return fwd;
	if ((xtra&KSW_XSUBO) && fwd.score < (xtra&0xffff)) return fwd;

	// +1 because qe/te points to the exact end, not the position after the end
	revseq(fwd.qe + 1, query);
	revseq(fwd.te + 1, target);
	prof = ksw_qinit(width, fwd.qe + 1, query, m, mat);
	rev = kernel(prof, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | fwd.score);
	revseq(fwd.qe + 1, query);
	revseq(fwd.te + 1, target);
	free(prof);

	if (fwd.score == rev.score) {
		fwd.tb = fwd.te - rev.te;
		fwd.qb = fwd.qe - rev.qe;
	}
	return fwd;
}
/* ksw_align2() with a symmetric gap model: insertions and deletions share the
 * same open (gapo) and extension (gape) penalties. */
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
{
	const int o_del = gapo, e_del = gape;
	const int o_ins = gapo, e_ins = gape;
	return ksw_align2(qlen, query, tlen, target, m, mat, o_del, e_del, o_ins, e_ins, xtra, qry);
}
/********************
*** SW extension ***
********************/
/* One DP column cell shared by ksw_extend2() and ksw_global2(): h holds an H
 * score and e the E (gap) score carried to the next target row. */
typedef struct {
int32_t h, e;
} eh_t;
int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
assert(h0 > 0);
// allocate memory
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
eh[j].h = eh[j-1].h - e_ins;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) {
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen];
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
if (end > i + w + 1) end = i + w + 1;
if (end > qlen) end = qlen;
// compute the first column
if (beg == 0) {
h1 = h0 - (o_del + e_del * (i + 1));
if (h1 < 0) h1 = 0;
} else h1 = 0;
for (j = beg; LIKELY(j < end); ++j) {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
p->h = h1; // set H(i,j-1) for the next row
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M"
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
h = h > f? h : f;
h1 = h; // save H(i,j) to h1 for the next column
mj = m > h? mj : j; // record the position where max score is achieved
m = m > h? m : h; // m is stored at eh[mj+1]
t = M - oe_del;
t = t > 0? t : 0;
e -= e_del;
e = e > t? e : t; // computed E(i+1,j)
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
t = t > 0? t : 0;
f -= e_ins;
f = f > t? f : t; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (j == qlen) {
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
if (m == 0) break;
if (m > max) {
max = m, max_i = i, max_j = mj;
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
} else if (zdrop > 0) {
if (i - max_i > mj - max_j) {
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
}
}
// update beg and end for the next round
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j);
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
end = j + 2 < qlen? j + 2 : qlen;
//beg = 0; end = qlen; // uncomment this line for debugging
}
free(eh); free(qp);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}
/* ksw_extend2() with a symmetric gap model: insertions and deletions share the
 * same open (gapo) and extension (gape) penalties. */
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
{
	const int o_del = gapo, e_del = gape;
	const int o_ins = gapo, e_ins = gape;
	return ksw_extend2(qlen, query, tlen, target, m, mat,
	                   o_del, e_del, o_ins, e_ins,
	                   w, end_bonus, zdrop, h0,
	                   qle, tle, gtle, gscore, max_off);
}
/********************
* Global alignment *
********************/
/* "Minus infinity" for the global DP: -2^30, which leaves headroom in an
 * int32_t for penalties to be subtracted without wrapping. */
#define MINUS_INF -0x40000000
/* Append `len` operations of type `op` to a CIGAR array whose entries are
 * encoded as len<<4 | op. A run with the same op as the last entry is merged
 * into it; otherwise a new entry is added, doubling the capacity (starting at
 * 4) when the array is full. Returns the possibly reallocated array;
 * *n_cigar and *m_cigar are updated in place. */
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
{
	const int used = *n_cigar;

	if (used != 0 && op == (cigar[used - 1]&0xf)) {
		cigar[used - 1] += len<<4; /* extend the previous run */
		return cigar;
	}
	if (used == *m_cigar) {
		*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
		cigar = realloc(cigar, (*m_cigar) << 2); /* 4 bytes per entry */
	}
	cigar[used] = len<<4 | op;
	*n_cigar = used + 1;
	return cigar;
}
/**
* Banded global alignment of query against target, optionally with CIGAR.
*
* @param qlen,query    query length and sequence (values index columns of mat)
* @param tlen,target   target length and sequence (values index rows of mat)
* @param m             alphabet size; mat is an m*m matrix in one dimension
* @param mat           scoring matrix
* @param o_del,e_del   deletion gap open / extension penalties
* @param o_ins,e_ins   insertion gap open / extension penalties
* @param w             band width: query position j is only considered for target row i when i-w <= j <= i+w
* @param n_cigar_      out: number of CIGAR entries (set to 0 first when non-NULL)
* @param cigar_        out: CIGAR array built with push_cigar() (entries are len<<4|op; op 0 consumes both
*                      sequences, op 1 only the query, op 2 only the target). The backtrack runs only when
*                      both n_cigar_ and cigar_ are non-NULL. NOTE(review): the array is allocated with
*                      realloc() and not freed here, so presumably the caller frees it -- confirm.
*
* @return the score stored in the last cell, eh[qlen].h
*/
int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
{
eh_t *eh;
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
if (n_cigar_) *n_cigar_ = 0;
// allocate memory
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0;
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = 0; eh[0].e = MINUS_INF;
for (j = 1; j <= qlen && j <= w; ++j)
eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
// DP loop
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
int32_t f = MINUS_INF, h1, beg, end, t;
int8_t *q = &qp[target[i] * qlen];
beg = i > w? i - w : 0;
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
if (n_cigar_ && cigar_) {
uint8_t *zi = &z[(long)i * n_col];
for (j = beg; LIKELY(j < end); ++j) {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Cells are computed in the following order:
// M(i,j) = H(i-1,j-1) + S(i,j)
// H(i,j) = max{M(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape
// We have to separate M(i,j); otherwise the direction may not be recorded correctly.
// However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global().
// Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k.
// In practice, this should happen very rarely given a reasonable scoring system.
eh_t *p = &eh[j];
int32_t h, m = p->h, e = p->e;
uint8_t d; // direction
p->h = h1;
m += q[j];
d = m >= e? 0 : 1;
h = m >= e? m : e;
d = h >= f? d : 2;
h = h >= f? h : f;
h1 = h;
t = m - oe_del;
e -= e_del;
d |= e > t? 1<<2 : 0;
e = e > t? e : t;
p->e = e;
t = m - oe_ins;
f -= e_ins;
d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
f = f > t? f : t;
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
}
} else {
// score-only variant: same recurrence without recording directions
for (j = beg; LIKELY(j < end); ++j) {
eh_t *p = &eh[j];
int32_t h, m = p->h, e = p->e;
p->h = h1;
m += q[j];
h = m >= e? m : e;
h = h >= f? h : f;
h1 = h;
t = m - oe_del;
e -= e_del;
e = e > t? e : t;
p->e = e;
t = m - oe_ins;
f -= e_ins;
f = f > t? f : t;
}
}
eh[end].h = h1; eh[end].e = MINUS_INF;
}
score = eh[qlen].h;
if (n_cigar_ && cigar_) { // backtrack
int n_cigar = 0, m_cigar = 0, which = 0;
uint32_t *cigar = 0, tmp;
i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
while (i >= 0 && k >= 0) {
// `which` from the previous step selects the 2-bit field (h, e or f) to read in this cell
which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
}
// whatever is left of either sequence becomes one leading gap
if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
*n_cigar_ = n_cigar, *cigar_ = cigar;
}
free(eh); free(qp); free(z);
return score;
}
/**
 * Banded global alignment using one gap-open/gap-extension pair.
 *
 * Thin front end to ksw_global2(): the same two penalties are supplied for
 * both the deletion and the insertion arguments; everything else is
 * forwarded unchanged.
 */
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
{
	const int o_del = gapo, e_del = gape; // deletion open/extend
	const int o_ins = gapo, e_ins = gape; // insertion open/extend
	int score = ksw_global2(qlen, query, tlen, target, m, mat, o_del, e_del, o_ins, e_ins, w, n_cigar_, cigar_);
	return score;
}
/*******************************************
* Main function (not compiled by default) *
*******************************************/
#ifdef _KSW_MAIN
#include <unistd.h>
#include <stdio.h>
#include <zlib.h>
#include "kseq.h"
KSEQ_INIT(gzFile, err_gzread)
// Byte -> nucleotide code lookup, indexed by the character value (0..255):
// 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3, every other byte -> 4.
// Index it with an unsigned byte; the table has exactly 256 entries.
unsigned char seq_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
/**
 * Stand-alone test driver (compiled only when _KSW_MAIN is defined).
 *
 * Usage: ksw [-1] [-f] [-a match] [-b mismatch] [-q gapo] [-r gape] [-t minsc] <target.fa> <query.fa>
 *
 * Every query sequence (and, unless -f is given, its reverse complement) is
 * aligned against every target sequence with ksw_align(); alignments whose
 * score is at least minsc are printed as one tab-separated line each.
 *
 * @return 0 on success, 1 when the two file arguments are missing
 */
int main(int argc, char *argv[])
{
	int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
	int8_t mat[25];
	int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
	uint8_t *rseq = 0;
	gzFile fpt, fpq;
	kseq_t *kst, *ksq;

	// parse command line
	while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
		switch (c) {
			case 'a': sa = atoi(optarg); break;
			case 'b': sb = atoi(optarg); break;
			case 'q': gapo = atoi(optarg); break;
			case 'r': gape = atoi(optarg); break;
			case 't': minsc = atoi(optarg); break;
			case 'f': forward_only = 1; break;
			case '1': xtra |= KSW_XBYTE; break;
		}
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
		return 1;
	}
	if (minsc > 0xffff) minsc = 0xffff;
	// Only the low 16 bits of xtra carry the threshold. OR-ing in a negative
	// minsc would switch on every KSW_X* flag, so pass 0 in that case.
	xtra |= KSW_XSUBO | (minsc < 0? 0 : minsc);

	// initialize scoring matrix: 5x5, the fifth row/column is the ambiguous base
	for (i = k = 0; i < 4; ++i) {
		for (j = 0; j < 4; ++j)
			mat[k++] = i == j? sa : -sb;
		mat[k++] = 0; // ambiguous base
	}
	for (j = 0; j < 5; ++j) mat[k++] = 0;

	// open file
	fpt = xzopen(argv[optind],   "r"); kst = kseq_init(fpt);
	fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);

	// all-pair alignment
	while (kseq_read(ksq) > 0) {
		kswq_t *q[2] = {0, 0}; // query profiles (forward, reverse), reused across targets
		kswr_t r;
		// Index the 256-entry table with an unsigned byte: a plain char may be
		// negative for bytes >= 0x80 and would read before the table.
		for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(uint8_t)ksq->seq.s[i]];
		if (!forward_only) { // reverse
			if ((int)ksq->seq.m > max_rseq) {
				max_rseq = ksq->seq.m;
				rseq = (uint8_t*)realloc(rseq, max_rseq);
			}
			for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
				rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
		}
		gzrewind(fpt); kseq_rewind(kst);
		while (kseq_read(kst) > 0) {
			for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(uint8_t)kst->seq.s[i]];
			r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
			if (r.score >= minsc)
				err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
			if (rseq) { // rseq is only allocated when the reverse strand is requested
				r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
				if (r.score >= minsc)
					err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
			}
		}
		free(q[0]); free(q[1]);
	}
	free(rseq);
	kseq_destroy(kst); err_gzclose(fpt);
	kseq_destroy(ksq); err_gzclose(fpq);
	return 0;
}
#endif

114
ksw.h 100644
View File

@ -0,0 +1,114 @@
#ifndef __AC_KSW_H
#define __AC_KSW_H
#include <stdint.h>
#define KSW_XBYTE 0x10000
#define KSW_XSTOP 0x20000
#define KSW_XSUBO 0x40000
#define KSW_XSTART 0x80000
struct _kswq_t;
typedef struct _kswq_t kswq_t;
// Alignment result returned by ksw_align()/ksw_align2().
// Per the ksw_align() documentation, fields that were not computed are set to -1.
typedef struct {
int score; // best score
int te, qe; // target end and query end
int score2, te2; // second best score and ending position on the target
int tb, qb; // target start and query start
} kswr_t;
#ifdef __cplusplus
extern "C" {
#endif
/**
* Aligning two sequences
*
* @param qlen length of the query sequence (typically <tlen)
* @param query query sequence with 0 <= query[i] < m
* @param tlen length of the target sequence
* @param target target sequence
* @param m number of residue types
* @param mat m*m scoring matrix in one-dimension array
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
* @param gape gap extension penalty
* @param xtra extra information (see below)
* @param qry query profile (see below)
*
* @return alignment information in a struct; unset values are set to -1
*
* When xtra==0, ksw_align() uses a signed two-byte integer to store a
* score and only finds the best score and the end positions. The 2nd best
* score or the start positions are not attempted. The default behavior can
* be tuned by setting KSW_X* flags:
*
* KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
* kswr_t::score will be set to 255
*
* KSW_XSUBO: track the 2nd best score and the ending position on the
* target if the 2nd best is higher than (xtra&0xffff)
*
* KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
*
* KSW_XSTART: find the start positions
*
* When *qry==NULL, ksw_align() will compute and allocate the query profile
* and when the function returns, *qry will point to the profile, which can
* be deallocated simply by free(). If one query is aligned against multiple
* target sequences, *qry should be set to NULL during the first call and
* freed after the last call. Note that qry can equal 0. In this case, the
* query profile will be deallocated in ksw_align().
*/
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry);
/**
* Banded global alignment
*
* @param qlen query length
* @param query query sequence with 0 <= query[i] < m
* @param tlen target length
* @param target target sequence with 0 <= target[i] < m
* @param m number of residue types
* @param mat m*m scoring matrix in one-dimension array
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
* @param gape gap extension penalty
* @param w band width
* @param n_cigar (out) number of CIGAR elements
* @param cigar (out) BAM-encoded CIGAR; caller needs to deallocate with free()
*
* @return score of the alignment
*/
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
/**
* Extend alignment
*
* The routine aligns $query and $target, assuming their upstream sequences,
* which are not provided, have been aligned with score $h0. In return,
* region [0,*qle) on the query and [0,*tle) on the target sequences are
* aligned together. If *gscore>=0, *gscore keeps the best score such that
* the entire query sequence is aligned; *gtle keeps the position on the
* target where *gscore is achieved. Returning *gscore and *gtle helps the
* caller to decide whether an end-to-end hit or a partial hit is preferred.
*
* The first 9 parameters are identical to those in ksw_global()
*
* @param h0 alignment score of upstream sequences
* @param _qle (out) length of the query in the alignment
* @param _tle (out) length of the target in the alignment
* @param _gtle (out) length of the target if query is fully aligned
* @param _gscore (out) score of the best end-to-end alignment; negative if not found
*
* @return best semi-local alignment score
*/
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
#ifdef __cplusplus
}
#endif
#endif

147
kthread.c 100644
View File

@ -0,0 +1,147 @@
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <limits.h>
/************
* kt_for() *
************/
struct kt_for_t;
// Per-thread state of a kt_for() run.
typedef struct {
struct kt_for_t *t; // back pointer to the shared job description
long i; // next task index of this worker; advanced atomically in steps of n_threads
} ktf_worker_t;
// Shared description of one kt_for() run.
typedef struct kt_for_t {
int n_threads; // number of workers, also the stride between a worker's task indices
long n; // total number of tasks; valid task indices are [0, n)
ktf_worker_t *w; // array of n_threads worker slots
void (*func)(void*,long,int); // callback, invoked as func(data, task index, worker index)
void *data; // opaque pointer handed to func unchanged
} kt_for_t;
// Take one task from the least-advanced worker.
// Scans all worker counters for the smallest value, atomically adds n_threads to
// that worker's counter and returns the counter's previous value; returns -1
// when that value is not below t->n (nothing left to steal).
// NOTE(review): the scan reads the counters without synchronization, and min_i
// stays -1 if n_threads <= 0 — confirm callers never reach here in that case.
static inline long steal_work(kt_for_t *t)
{
int i, min_i = -1;
long k, min = LONG_MAX;
for (i = 0; i < t->n_threads; ++i)
if (min > t->w[i].i) min = t->w[i].i, min_i = i;
k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
return k >= t->n? -1 : k;
}
// Thread body for kt_for(); data points to this thread's ktf_worker_t slot.
// Phase 1: process the worker's own tasks (its counter, advanced by n_threads each time).
// Phase 2: once the own counter reaches n, keep taking tasks from other workers via steal_work().
// The third argument passed to func is this worker's position in the w[] array.
static void *ktf_worker(void *data)
{
ktf_worker_t *w = (ktf_worker_t*)data;
long i;
for (;;) {
i = __sync_fetch_and_add(&w->i, w->t->n_threads);
if (i >= w->t->n) break;
w->t->func(w->t->data, i, w - w->t->w);
}
while ((i = steal_work(w->t)) >= 0)
w->t->func(w->t->data, i, w - w->t->w);
pthread_exit(0);
}
/**
 * Call func(data, i, tid) once for every i in [0, n).
 *
 * With n_threads > 1 the calls are distributed over n_threads worker threads
 * (tid is the worker's index). With n_threads <= 1, or when no worker thread
 * can be set up, the calls are made sequentially on the calling thread with
 * tid == 0, so the work is never silently skipped.
 *
 * @param n_threads requested number of worker threads
 * @param func      callback invoked as func(data, i, tid)
 * @param data      opaque pointer passed to func unchanged
 * @param n         number of tasks
 */
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
{
	kt_for_t t;
	pthread_t *tid = 0;
	int i, n_started = 0;
	long j;

	t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
	t.w = 0;
	if (n_threads > 1) { // heap, not alloca(): the size is caller-controlled
		t.w = (ktf_worker_t*)calloc(n_threads, sizeof(ktf_worker_t));
		tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
	}
	if (t.w && tid) {
		for (i = 0; i < n_threads; ++i)
			t.w[i].t = &t, t.w[i].i = i;
		// A worker that fails to start leaves its slot untouched; the running
		// workers pick those tasks up through steal_work().
		for (i = 0; i < n_threads; ++i)
			if (pthread_create(&tid[n_started], 0, ktf_worker, &t.w[i]) == 0) ++n_started;
		for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0); // join only threads that exist
	}
	free(tid); free(t.w);
	if (n_started == 0) { // single thread requested, out of memory, or no thread could be created
		for (j = 0; j < n; ++j) func(data, j, 0);
	}
}
/*****************
* kt_pipeline() *
*****************/
struct ktp_t;
// Per-thread state of a kt_pipeline() run.
typedef struct {
struct ktp_t *pl; // back pointer to the shared pipeline state
int64_t index; // ordering ticket: at a given step, a worker waits for workers holding smaller tickets
int step; // step this worker will run next; equals n_steps once the worker is finished
void *data; // value returned by the previous step, fed to the next one
} ktp_worker_t;
// Shared state of one kt_pipeline() run.
typedef struct ktp_t {
void *shared; // opaque pointer passed as the first argument of func
void *(*func)(void*, int, void*); // callback, invoked as func(shared, step, input)
int64_t index; // next ticket to hand out
int n_workers, n_steps;
ktp_worker_t *workers; // array of n_workers entries
pthread_mutex_t mutex; // protects the step/index fields of all workers
pthread_cond_t cv; // broadcast whenever a worker changes step
} ktp_t;
// Thread body for kt_pipeline(); data points to this thread's ktp_worker_t.
// Each iteration runs one pipeline step for this worker:
//   1. wait until no other worker with a smaller ticket (index) is at the same or an earlier step;
//   2. call p->func for the current step, outside the lock;
//   3. advance the step and wake up the waiting workers.
static void *ktp_worker(void *data)
{
ktp_worker_t *w = (ktp_worker_t*)data;
ktp_t *p = w->pl;
while (w->step < p->n_steps) {
// test whether we can kick off the job with this worker
pthread_mutex_lock(&p->mutex);
for (;;) {
int i;
// test whether another worker is doing the same step
for (i = 0; i < p->n_workers; ++i) {
if (w == &p->workers[i]) continue; // ignore itself
if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
break;
}
if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
pthread_cond_wait(&p->cv, &p->mutex);
}
pthread_mutex_unlock(&p->mutex);
// working on w->step
w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
// update step and let other workers know
pthread_mutex_lock(&p->mutex);
// After the last step, or whenever func returned non-NULL, move on to the next step
// (wrapping to 0 after the last one). A NULL result before the last step sets
// step = n_steps, which terminates this worker's loop.
w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
// starting over at step 0: take a fresh ticket
if (w->step == 0) w->index = p->index++;
pthread_cond_broadcast(&p->cv);
pthread_mutex_unlock(&p->mutex);
}
pthread_exit(0);
}
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
{
ktp_t aux;
pthread_t *tid;
int i;
if (n_threads < 1) n_threads = 1;
aux.n_workers = n_threads;
aux.n_steps = n_steps;
aux.func = func;
aux.shared = shared_data;
aux.index = 0;
pthread_mutex_init(&aux.mutex, 0);
pthread_cond_init(&aux.cv, 0);
aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
for (i = 0; i < n_threads; ++i) {
ktp_worker_t *w = &aux.workers[i];
w->step = 0; w->pl = &aux; w->data = 0;
w->index = aux.index++;
}
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
pthread_mutex_destroy(&aux.mutex);
pthread_cond_destroy(&aux.cv);
}

94
kvec.h 100644
View File

@ -0,0 +1,94 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "kvec.h"
int main() {
kvec_t(int) array;
kv_init(array);
kv_push(int, array, 10); // append
kv_a(int, array, 20) = 5; // dynamic
kv_A(array, 20) = 4; // static
kv_destroy(array);
return 0;
}
*/
/*
2008-09-22 (0.1.0):
* The initial version.
*/
#ifndef AC_KVEC_H
#define AC_KVEC_H
#include <stdlib.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/* Round x up to the next power of two, in place (x must be a modifiable lvalue; covers 32 bits). */
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

/* A growable array of `type`: n = number of elements in use, m = allocated capacity, a = storage. */
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
/* Unchecked element access. */
#define kv_A(v, i) ((v).a[(i)])
/* Remove and yield the last element; the vector must not be empty. */
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)
/* Set the capacity to s elements. */
#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))

/* Copy the contents of v0 into v1, growing v1 if needed.
 * Uses memcpy(), so the including file must also include <string.h>.
 * The macro must end right after `while (0)`: a trailing line continuation
 * here would pull the next #define into this macro's body. */
#define kv_copy(type, v1, v0) do { \
		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
		(v1).n = (v0).n; \
		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
	} while (0)

/* Append x, doubling the capacity when full (initial capacity 2). */
#define kv_push(type, v, x) do { \
		if ((v).n == (v).m) { \
			(v).m = (v).m? (v).m<<1 : 2; \
			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
		} \
		(v).a[(v).n++] = (x); \
	} while (0)

/* Append one uninitialized element and yield a pointer to it. */
#define kv_pushp(type, v) ((((v).n == (v).m)? \
		((v).m = ((v).m? (v).m<<1 : 2), \
		 (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
		: 0), &(v).a[(v).n++])

/* Access element i, growing capacity (rounded up to a power of two) and size so that i is valid. */
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
		((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
		 (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
		: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
		: 0), (v).a[(i)])
#endif

140
main.c 100644
View File

@ -0,0 +1,140 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdio.h>
#include <string.h>
#include "kstring.h"
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.19-r1273"
#endif
int bwa_fa2pac(int argc, char *argv[]);
int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
int bwa_aln(int argc, char *argv[]);
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
int main_shm(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
int main_maxk(int argc, char *argv[]);
int bwa_bwt2kmer(int argc, char* argv[]); // create kmer-index from bwt
int bwa_bwt2bytesa(int argc, char* argv[]); // create byte-based Suffix-Array
int bwa_bwt2hyb(int argc, char* argv[]); // create hybrid-index
/**
 * Print the top-level help text (program name, version, command list) to stderr.
 *
 * @return always 1, so callers can write `return usage();`
 */
static int usage()
{
	fprintf(stderr, "\n");
	fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n");
	fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
	fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
	fprintf(stderr, "Usage: bwa <command> [options]\n\n");
	fprintf(stderr, "Command: index index sequences in the FASTA format\n");
	fprintf(stderr, " mem BWA-MEM algorithm\n");
	fprintf(stderr, " fastmap identify super-maximal exact matches\n");
	fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
	fprintf(stderr, " aln gapped/ungapped alignment\n");
	fprintf(stderr, " samse generate alignment (single ended)\n");
	fprintf(stderr, " sampe generate alignment (paired ended)\n");
	fprintf(stderr, " bwasw BWA-SW for long queries (DEPRECATED)\n");
	fprintf(stderr, "\n");
	fprintf(stderr, " shm manage indices in shared memory\n");
	fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
	fprintf(stderr, " pac2bwt generate BWT from PAC\n");
	fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
	fprintf(stderr, " bwtupdate update .bwt to the new format\n");
	fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
	fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
	// spelling fixed: "accelarate" -> "accelerate"
	fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelerate the first 14 bases in seeding process.\n");
	fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n");
	fprintf(stderr, "\n");
	fprintf(stderr,
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
" There are three alignment algorithms in BWA: `mem', `bwasw', and\n"
" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n"
" first. Please `man ./bwa.1' for the manual.\n\n");
	return 1;
}
int main(int argc, char *argv[])
{
extern char *bwa_pg;
int i, ret;
double t_real;
kstring_t pg = {0,0,0};
t_real = realtime();
ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
bwa_pg = pg.s;
if (argc < 2) return usage();
if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1);
else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1);
else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1);
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2kmer") == 0) ret = bwa_bwt2kmer(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2hyb") == 0) ret = bwa_bwt2hyb(argc - 1, argv + 1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;
}
err_fflush(stdout);
err_fclose(stdout);
if (ret == 0) {
fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION);
fprintf(stderr, "[%s] CMD:", __func__);
for (i = 0; i < argc; ++i)
fprintf(stderr, " %s", argv[i]);
fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime());
}
free(bwa_pg);
return ret;
}

57
malloc_wrap.c 100644
View File

@ -0,0 +1,57 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#ifdef USE_MALLOC_WRAPPERS
/* Don't wrap ourselves */
# undef USE_MALLOC_WRAPPERS
#endif
#include "malloc_wrap.h"
/* calloc() that never returns NULL: on failure it reports the requesting
 * function, the byte count and the call site on stderr, then exits with
 * EXIT_FAILURE. */
void *wrap_calloc(size_t nmemb, size_t size,
				  const char *file, unsigned int line, const char *func) {
	void *mem = calloc(nmemb, size);
	if (mem != NULL)
		return mem;
	fprintf(stderr,
			"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
			func, nmemb * size, file, line, strerror(errno));
	exit(EXIT_FAILURE);
}
/* malloc() that never returns NULL: on failure it reports the requesting
 * function, the byte count and the call site on stderr, then exits with
 * EXIT_FAILURE. */
void *wrap_malloc(size_t size,
				  const char *file, unsigned int line, const char *func) {
	void *mem = malloc(size);
	if (mem != NULL)
		return mem;
	fprintf(stderr,
			"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
			func, size, file, line, strerror(errno));
	exit(EXIT_FAILURE);
}
/* realloc() that aborts the program on allocation failure.
 *
 * On failure it reports the requesting function, the byte count and the call
 * site on stderr, then exits with EXIT_FAILURE. A NULL result for a
 * zero-byte request is not a failure (realloc may free the block and return
 * NULL in that case) and is returned to the caller as is. */
void *wrap_realloc(void *ptr, size_t size,
				   const char *file, unsigned int line, const char *func) {
	void *p = realloc(ptr, size);
	if (NULL == p && size != 0) {
		fprintf(stderr,
				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
				func, size, file, line, strerror(errno));
		exit(EXIT_FAILURE);
	}
	return p;
}
/* strdup() that never returns NULL: on failure it reports the requesting
 * function, the number of bytes requested (string length plus the
 * terminating NUL) and the call site on stderr, then exits with
 * EXIT_FAILURE. */
char *wrap_strdup(const char *s,
				  const char *file, unsigned int line, const char *func) {
	char *p = strdup(s);
	if (NULL == p) {
		fprintf(stderr,
				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
				func, strlen(s) + 1, file, line, strerror(errno));
		exit(EXIT_FAILURE);
	}
	return p;
}

47
malloc_wrap.h 100644
View File

@ -0,0 +1,47 @@
#ifndef MALLOC_WRAP_H
#define MALLOC_WRAP_H
#include <stdlib.h> /* Avoid breaking the usual definitions */
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
void *wrap_calloc(size_t nmemb, size_t size,
const char *file, unsigned int line, const char *func);
void *wrap_malloc(size_t size,
const char *file, unsigned int line, const char *func);
void *wrap_realloc(void *ptr, size_t size,
const char *file, unsigned int line, const char *func);
char *wrap_strdup(const char *s,
const char *file, unsigned int line, const char *func);
#ifdef __cplusplus
}
#endif
/* When USE_MALLOC_WRAPPERS is defined, calloc/malloc/realloc/strdup are
 * redirected to the wrap_* functions declared above, passing the call site's
 * file name, line number and function name along. Any existing macro of the
 * same name is removed first so the redefinition is clean. */
#ifdef USE_MALLOC_WRAPPERS
# ifdef calloc
# undef calloc
# endif
# define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__)
# ifdef malloc
# undef malloc
# endif
# define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__)
# ifdef realloc
# undef realloc
# endif
# define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__)
# ifdef strdup
# undef strdup
# endif
# define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__)
#endif /* USE_MALLOC_WRAPPERS */
#endif /* MALLOC_WRAP_H */

67
maxk.c 100644
View File

@ -0,0 +1,67 @@
#include <zlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <unistd.h>
#include "bwa.h"
#include "bwamem.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
/**
 * `bwa maxk [-s] <index.prefix> <seq.fa>`
 *
 * For every base of every input sequence, records the length (capped at 255)
 * of the longest interval reported by the SMEM iterator that covers the base,
 * then prints a 256-bin histogram of these lengths as "length<TAB>count".
 * With -s the iterator is reconfigured through smem_config(itr, 2, INT_MAX, 0).
 * A sequence file name of "-" reads from stdin.
 *
 * @return 0 on success, 1 on a usage error or when the sequence file cannot be opened
 */
int main_maxk(int argc, char *argv[])
{
	int i, c, self = 0, max_len = 0;
	uint8_t *cnt = 0; // per-base maximum covering length for the current sequence
	uint64_t hist[256];
	bwt_t *bwt;
	kseq_t *ks;
	smem_i *itr;
	gzFile fp;

	while ((c = getopt(argc, argv, "s")) >= 0) {
		if (c == 's') self = 1;
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "Usage: bwa maxk [-s] <index.prefix> <seq.fa>\n");
		return 1;
	}
	fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb");
	if (fp == 0) { // gzopen()/gzdopen() return NULL on failure
		fprintf(stderr, "[%s] fail to open file '%s'\n", __func__, argv[optind+1]);
		return 1;
	}
	ks = kseq_init(fp);
	bwt = bwt_restore_bwt(argv[optind]);
	itr = smem_itr_init(bwt);
	if (self) smem_config(itr, 2, INT_MAX, 0);
	memset(hist, 0, sizeof(hist));
	while (kseq_read(ks) >= 0) {
		const bwtintv_v *a;
		const int l_seq = (int)ks->seq.l;
		if (l_seq > max_len) { // grow the per-base buffer to the next power of two
			max_len = l_seq;
			kroundup32(max_len);
			cnt = realloc(cnt, max_len);
		}
		if (l_seq > 0) memset(cnt, 0, l_seq); // cnt may still be NULL for an empty record
		for (i = 0; i < l_seq; ++i)
			ks->seq.s[i] = nst_nt4_table[(int)ks->seq.s[i]];
		smem_set_query(itr, l_seq, (uint8_t*)ks->seq.s);
		while ((a = smem_next(itr)) != 0) {
			for (i = 0; i < (int)a->n; ++i) {
				bwtintv_t *p = &a->a[i];
				// info packs the query interval: start in the high 32 bits, end in the low 32 bits
				int j, l, start = p->info>>32, end = (uint32_t)p->info;
				l = end - start < 255? end - start : 255;
				for (j = start; j < end; ++j)
					cnt[j] = cnt[j] > l? cnt[j] : l;
			}
		}
		for (i = 0; i < l_seq; ++i) ++hist[cnt[i]];
	}
	for (i = 0; i < 256; ++i)
		printf("%d\t%lld\n", i, (long long)hist[i]);
	free(cnt);
	smem_itr_destroy(itr);
	bwt_destroy(bwt);
	kseq_destroy(ks);
	gzclose(fp);
	return 0;
}

33
neon_sse.h 100644
View File

@ -0,0 +1,33 @@
/* Minimal SSE2-on-NEON shim: defines __m128i as a NEON uint8x16_t and
 * implements a small set of _mm_* intrinsics with their NEON counterparts.
 * NOTE(review): presumably this is exactly the subset the SSE2 alignment code
 * needs when built on ARM — confirm against the includer. */
#ifndef NEON_SSE_H
#define NEON_SSE_H
#include <arm_neon.h>
typedef uint8x16_t __m128i;
/* Load/store and broadcast. */
static inline __m128i _mm_load_si128(const __m128i *ptr) { return vld1q_u8((const uint8_t *) ptr); }
static inline __m128i _mm_set1_epi32(int n) { return vreinterpretq_u8_s32(vdupq_n_s32(n)); }
static inline void _mm_store_si128(__m128i *ptr, __m128i a) { vst1q_u8((uint8_t *) ptr, a); }
/* Operations on sixteen unsigned 8-bit lanes (vqadd/vqsub are the saturating forms). */
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vqaddq_u8(a, b); }
static inline __m128i _mm_max_epu8(__m128i a, __m128i b) { return vmaxq_u8(a, b); }
static inline __m128i _mm_set1_epi8(int8_t n) { return vreinterpretq_u8_s8(vdupq_n_s8(n)); }
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vqsubq_u8(a, b); }
/* Temporary reinterpret helpers between the uint8x16_t storage type and 16-bit lane views; #undef'd below. */
#define M128I(a) vreinterpretq_u8_s16((a))
#define UM128I(a) vreinterpretq_u8_u16((a))
#define S16(a) vreinterpretq_s16_u8((a))
#define U16(a) vreinterpretq_u16_u8((a))
/* Operations on eight 16-bit lanes. */
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) { return M128I(vqaddq_s16(S16(a), S16(b))); }
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return UM128I(vcgtq_s16(S16(a), S16(b))); }
static inline __m128i _mm_max_epi16(__m128i a, __m128i b) { return M128I(vmaxq_s16(S16(a), S16(b))); }
static inline __m128i _mm_set1_epi16(int16_t n) { return vreinterpretq_u8_s16(vdupq_n_s16(n)); }
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) { return UM128I(vqsubq_u16(U16(a), U16(b))); }
#undef M128I
#undef UM128I
#undef S16
#undef U16
#endif

291
pemerge.c 100644
View File

@ -0,0 +1,291 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <zlib.h>
#include <pthread.h>
#include <errno.h>
#include "ksw.h"
#include "kseq.h"
#include "kstring.h"
#include "bwa.h"
#include "utils.h"
KSEQ_DECLARE(gzFile)
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
// A pair is rejected when a competing score divided by the best score reaches this ratio (see bwa_pemerge()).
#define MAX_SCORE_RATIO 0.9f
// Largest outcome code; bwa_pemerge() returns 0 or a value in [-MAX_ERR, -1].
#define MAX_ERR 8
// Description of each bwa_pemerge() outcome, indexed by the negated return value
// (the same indexing worker() uses for its cnt[] tallies).
static const char *err_msg[MAX_ERR+1] = {
"successful merges",
"low-scoring pairs",
"pairs where the best SW alignment is not an overlap (long left end)",
"pairs where the best SW alignment is not an overlap (long right end)",
"pairs with large 2nd best SW score",
"pairs with gapped overlap",
"pairs where the end-to-end alignment is inconsistent with SW",
"pairs potentially with tandem overlaps",
"pairs with high sum of errors"
};
// Options for paired-end merging; defaults are set in pem_opt_init().
typedef struct {
int a, b, q, r, w; // a, b: passed to bwa_fill_scmat() to build mat; q, r: passed to ksw_align() in its gap-open and gap-extension positions; w: not used in the code visible here
int q_def, q_thres; // q_def: quality assumed when a read has no quality string; q_thres: a merge is rejected when (sum_q >> 1) exceeds this
int T; // minimum alignment score for a pair to be considered
int chunk_size; // NOTE(review): presumably the input batch size — not used in the code visible here, confirm
int n_threads; // number of worker threads; also the stride between pairs handled by one worker
int flag; // bit 1: print merged; 2: print unmerged
int8_t mat[25]; // 5x5 scoring matrix filled by bwa_fill_scmat()
} pem_opt_t;
/* Allocate a pem_opt_t, fill in the default settings and build the scoring
 * matrix from the a/b scores. The caller owns the returned object. */
pem_opt_t *pem_opt_init()
{
	pem_opt_t *o = calloc(1, sizeof(pem_opt_t));

	/* alignment scoring */
	o->a = 5;
	o->b = 4;
	o->q = 2;
	o->r = 17;
	o->w = 20;
	o->T = o->a * 10;

	/* base-quality handling */
	o->q_def = 20;
	o->q_thres = 70;

	/* I/O and threading */
	o->chunk_size = 10000000;
	o->n_threads = 1;
	o->flag = 3;

	bwa_fill_scmat(o->a, o->b, o->mat);
	return o;
}
/**
 * Try to merge the two reads of a pair into one sequence.
 *
 * x[0] is used as is and x[1] is reverse-complemented; both are converted to
 * 0..4 base codes, and quality characters are reduced by 33 (opt->q_def is used
 * when a read has no quality string). The reverse-complemented x[1] is aligned
 * to x[0] with ksw_align() and the pair is merged only if it passes every
 * check below.
 *
 * On success x[0] receives the merged sequence and quality (as text, newly
 * allocated) and x[1] is freed and zeroed.
 *
 * @return 0 on success, otherwise the negated index into err_msg[]:
 *         -1 score below opt->T; -2 / -3 alignment is not an overlap (left / right end);
 *         -4 second-best score too close to the best; -5 overlap contains gaps;
 *         -6 best ungapped overlap is weak or disagrees with the alignment;
 *         -7 possible tandem overlap; -8 too many high-quality mismatches.
 */
int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2])
{
uint8_t *s[2], *q[2], *seq, *qual;
int i, xtra, l, l_seq, sum_q, ret = 0;
kswr_t r;
s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq);
s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq);
// read 1: forward strand; bytes outside 0..127 become 4, values <= 4 are kept as codes
for (i = 0; i < x[0].l_seq; ++i) {
int c = x[0].seq[i];
s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c];
q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def;
}
// read 2: walked from its last base and complemented (3 - c), i.e. reverse-complemented
for (i = 0; i < x[1].l_seq; ++i) {
int c = x[1].seq[x[1].l_seq - 1 - i];
c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c];
s[1][i] = c < 4? 3 - c : 4;
q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def;
}
// read 2 is the query, read 1 the target
xtra = KSW_XSTART | KSW_XSUBO;
r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0);
++r.qe; ++r.te; // change to the half-close-half-open coordinates
if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment
if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end
if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end
if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large
if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps
{ // test tandem match; O(n^2)
// For every overlap length l, score the ungapped match of the last l bases of read 1
// against the first l bases of read 2; keep the best (max_m, max_l) and the runner-up (max_m2, max_l2).
int max_m, max_m2, min_l, max_l, max_l2;
max_m = max_m2 = 0; max_l = max_l2 = 0;
min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq;
for (l = 1; l < min_l; ++l) {
int m = 0, o = x[0].l_seq - l;
uint8_t *s0o = &s[0][o], *s1 = s[1];
for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck!
m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i]
if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l;
else if (m > max_m2) max_m2 = m, max_l2 = l;
}
// the best ungapped overlap must reach the threshold and have the length implied by the alignment
if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; }
// a shorter runner-up is credited (max_l - max_l2) * a before being compared with the best
if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) {
ret = -7; goto pem_ret;
}
if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; }
}
l = x[0].l_seq - (r.tb - r.qb); // length to merge
l_seq = x[0].l_seq + x[1].l_seq - l;
seq = malloc(l_seq + 1);
qual = malloc(l_seq + 1);
// start from all of read 1 followed by the part of read 2 beyond the overlap
memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l);
memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l);
// reconcile the overlapping bases; k is the position in read 1 and in the merged sequence
for (i = 0, sum_q = 0; i < l; ++i) {
int k = x[0].l_seq - l + i;
if (s[0][k] == 4) { // ambiguous
seq[k] = s[1][i];
qual[k] = q[1][i];
} else if (s[1][i] == 4) { // do nothing
} else if (s[0][k] == s[1][i]) {
// agreement: keep the higher of the two qualities
qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i];
} else { // s[0][k] != s[1][i] and neither is N
// mismatch: penalize by twice the lower quality (1 if below 3), take the base with the
// higher quality (read 2 on ties) and set the quality to the difference
int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i];
sum_q += qq >= 3? qq<<1 : 1;
seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i];
qual[k] = abs((int)q[0][k] - (int)q[1][i]);
}
}
if (sum_q>>1 > opt->q_thres) { // too many mismatches
free(seq); free(qual);
ret = -8; goto pem_ret;
}
// convert base codes back to letters and qualities back to characters (+33)
for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33;
seq[l_seq] = qual[l_seq] = 0;
// the merged read replaces x[0]; x[1] is released and zeroed
free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment);
memset(&x[1], 0, sizeof(bseq1_t));
free(x[0].seq); free(x[0].qual);
x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual;
pem_ret:
free(s[0]); free(s[1]); free(q[0]); free(q[1]);
return ret;
}
/*
 * Write one read to stdout as FASTQ (when it has qualities) or FASTA.
 * rn == 1 or 2 appends "/1" or "/2" to the read name; any other value tags
 * the name with " merged".
 */
static inline void print_bseq(const bseq1_t *s, int rn)
{
    const int has_qual = s->qual != 0;

    err_putchar(has_qual ? '@' : '>');
    err_fputs(s->name, stdout);
    switch (rn) {
    case 1:
    case 2:
        err_putchar('/');
        err_putchar('0' + rn);
        err_putchar('\n');
        break;
    default:
        err_puts(" merged"); /* err_puts() terminates the header line */
        break;
    }
    err_puts(s->seq);
    if (!has_qual) return;
    err_puts("+");
    err_puts(s->qual);
}
// Per-thread job description handed to worker().
typedef struct {
int n, start; // n: number of reads in seqs (an even count); start: index of the first read pair this thread handles
bseq1_t *seqs; // reads of the current chunk; pair i occupies seqs[2*i] and seqs[2*i+1]
int64_t cnt[MAX_ERR+1]; // outcome histogram, indexed by the negated return code of bwa_pemerge()
const pem_opt_t *opt; // shared options (read-only in worker())
} worker_t;
/*
 * Thread entry point: try to merge every n_threads-th read pair, beginning at
 * pair index w->start, and tally each outcome in w->cnt (slot = -return code
 * of bwa_pemerge()). Always returns 0.
 */
void *worker(void *data)
{
    worker_t *job = (worker_t*)data;
    const int n_pairs = job->n >> 1;
    const int stride = job->opt->n_threads;
    int pair = job->start;

    while (pair < n_pairs) {
        int status = bwa_pemerge(job->opt, &job->seqs[pair << 1]);
        job->cnt[-status] += 1;
        pair += stride;
    }
    return 0;
}
/*
 * Merge one chunk of reads and print the results.
 *
 *   opt   merge options; opt->flag bit 0 enables printing of merged reads,
 *         bit 1 enables printing of pairs that could not be merged
 *   n_    number of reads in seqs; a trailing unpaired read is not processed
 *   seqs  the reads; the strings they own are released before returning
 *         (the array itself stays owned by the caller)
 *   cnt   outcome histogram; the per-thread counts are added to it
 */
static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1])
{
    int i, j, n = n_>>1<<1; /* round down to an even number of reads */
    worker_t *w;

    w = calloc(opt->n_threads, sizeof(worker_t));
    for (i = 0; i < opt->n_threads; ++i) {
        worker_t *p = &w[i];
        p->start = i; p->n = n;
        p->opt = opt;
        p->seqs = seqs;
    }
    if (opt->n_threads == 1) {
        worker(w);
    } else {
        pthread_t *tid;
        tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
        for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]);
        for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
        free(tid);
    }
    for (i = 0; i < opt->n_threads; ++i) {
        worker_t *p = &w[i];
        for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j];
    }
    free(w);
    for (i = 0; i < n>>1; ++i) {
        /* a successful merge zeroes the second read of the pair */
        if (seqs[i<<1|1].l_seq != 0) {
            if (opt->flag&2) {
                print_bseq(&seqs[i<<1|0], 1);
                print_bseq(&seqs[i<<1|1], 2);
            }
        } else if (opt->flag&1)
            print_bseq(&seqs[i<<1|0], 0);
    }
    /* Release every read, including an unpaired trailing one: iterating only
     * up to the even-rounded count would leak its buffers. */
    for (i = 0; i < n_; ++i) {
        bseq1_t *s = &seqs[i];
        free(s->name); free(s->seq); free(s->qual); free(s->comment);
    }
}
/*
 * Entry point of the "pemerge" sub-command.
 *
 * Parses the options, opens one (interleaved) or two FASTQ inputs ("-" means
 * stdin), processes the reads chunk by chunk with process_seqs() and finally
 * prints the outcome histogram to stderr.
 *
 * Returns 0 on success and 1 on a usage error; exits if an input cannot be opened.
 */
int main_pemerge(int argc, char *argv[])
{
    int c, flag = 0, i, n, min_ovlp = 10;
    int64_t cnt[MAX_ERR+1];
    bseq1_t *bseq;
    gzFile fp, fp2 = 0;
    kseq_t *ks, *ks2 = 0;
    pem_opt_t *opt;

    opt = pem_opt_init();
    while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) {
        if (c == 'm') flag |= 1;
        else if (c == 'u') flag |= 2;
        else if (c == 'Q') opt->q_thres = atoi(optarg);
        else if (c == 't') opt->n_threads = atoi(optarg);
        else if (c == 'T') min_ovlp = atoi(optarg);
        else {
            free(opt); /* do not leak the options on an unknown switch */
            return 1;
        }
    }
    if (flag == 0) flag = 3; /* default: print both merged and unmerged reads */
    opt->flag = flag;
    opt->T = opt->a * min_ovlp;

    if (optind == argc) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
        fprintf(stderr, "Options: -m output merged reads only\n");
        fprintf(stderr, " -u output unmerged reads only\n");
        fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
        fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp);
        fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres);
        fprintf(stderr, "\n");
        free(opt);
        return 1;
    }

    fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
    if (NULL == fp) {
        fprintf(stderr, "Couldn't open %s : %s\n",
                strcmp(argv[optind], "-") ? argv[optind] : "stdin",
                errno ? strerror(errno) : "Out of memory");
        exit(EXIT_FAILURE);
    }
    ks = kseq_init(fp);
    if (optind + 1 < argc) {
        fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r");
        /* check the handle that was just opened; testing fp here would let a
         * failed second open slip through and crash later in kseq */
        if (NULL == fp2) {
            fprintf(stderr, "Couldn't open %s : %s\n",
                    strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin",
                    errno ? strerror(errno) : "Out of memory");
            exit(EXIT_FAILURE);
        }
        ks2 = kseq_init(fp2);
    }

    memset(cnt, 0, sizeof(cnt));
    while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
        process_seqs(opt, n, bseq, cnt);
        free(bseq);
    }

    fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
    for (i = 1; i <= MAX_ERR; ++i)
        fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]);

    kseq_destroy(ks);
    err_gzclose(fp);
    if (ks2) {
        kseq_destroy(ks2);
        err_gzclose(fp2);
    }
    free(opt);
    err_fflush(stdout);
    return 0;
}

27
qualfa2fq.pl 100755
View File

@ -0,0 +1,27 @@
#!/usr/bin/env perl
# Combine a FASTA file and a matching numeric quality file into FASTQ on stdout.
# Both inputs may be gzip-compressed (detected by a ".gz" suffix).
use strict;
use warnings;
die("Usage: qualfa2fq.pl <in.fasta> <in.qual>\n") if (@ARGV != 2);
my ($fhs, $fhq, $q);
# Two-argument open: a trailing "|" makes Perl read from the gzip pipe.
open($fhs, ($ARGV[0] =~ /\.gz$/)? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die;
open($fhq, ($ARGV[1] =~ /\.gz$/)? "gzip -dc $ARGV[1] |" : $ARGV[1]) || die;
# Consume everything up to and including the first ">" in both files.
$/ = ">"; <$fhs>; <$fhq>; $/ = "\n";
# Each iteration handles one record: first the header line, then the body up to the next ">".
while (<$fhs>) {
$q = <$fhq>;
print "\@$_";
$/ = ">";
$_ = <$fhs>; $q = <$fhq>;
# With $/ set to ">", chomp strips the trailing ">" of the next record.
chomp; chomp($q);
# Replace every whitespace-delimited number by the character with code number+33.
$q =~ s/\s*(\d+)\s*/chr($1+33)/eg;
print $_, "+\n";
# Emit the quality string in lines of at most 60 characters.
for (my $i = 0; $i < length($q); $i += 60) {
print substr($q, $i, 60), "\n";
}
$/ = "\n";
}
close($fhs); close($fhq);

191
rle.c 100644
View File

@ -0,0 +1,191 @@
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include "rle.h"
// Lookup table used when scanning a block backwards. It is indexed by bits 3..5 of the
// lead byte of a multi-byte run; the high nibble of an entry holds the run-length bits
// stored in that lead byte. (The low nibble matches the number of continuation bytes of
// the 2/4/8-byte forms produced by rle_enc1(); it is not read in this file.)
const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
// Insert a run of $rl copies of symbol $a after the first $x symbols of the run-length encoded $block.
//   block   leaf block: a 2-byte (uint16_t) payload size followed by the encoded runs
//   cnt     output: per-symbol counts of the $x symbols in front of the insertion point
//   ec      per-symbol counts of the whole block before the insertion
//   beg/bc  cache carried between calls: byte offset of a run within the payload and the
//           per-symbol counts in front of that run; both are updated on return
// Returns the updated payload size in bytes (the value written back to the 2-byte header),
// not the size increase.
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
{
uint16_t *nptr = (uint16_t*)block;
int diff;
block += 2; // skip the first 2 counting bytes
if (*nptr == 0) {
// empty block: the new run becomes the only run
memset(cnt, 0, 48);
diff = rle_enc1(block, a, rl);
} else {
uint8_t *p, *end = block + *nptr, *q;
int64_t pre, z, l = 0, tot, beg_l;
int c = -1, n_bytes = 0, n_bytes2, t = 0;
uint8_t tmp[24];
beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
// the cached position is only usable when it does not lie beyond $x
if (x < beg_l) {
beg_l = 0, *beg = 0;
memset(bc, 0, 48);
}
if (x == beg_l) {
// insertion point coincides with the cached position; no run is decoded, so c stays -1 and l stays 0
p = q = block + (*beg); z = beg_l;
memcpy(cnt, bc, 48);
} else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
z = beg_l; p = block + (*beg);
memcpy(cnt, bc, 48);
while (z < x) {
rle_dec1(p, c, l);
z += l; cnt[c] += l;
}
// step back over continuation bytes (top bits 10) to the lead byte of the last decoded run
for (q = p - 1; *q>>6 == 2; --q);
} else { // backward
memcpy(cnt, ec, 48);
z = tot; p = end;
while (z >= x) {
--p;
if (*p>>6 != 2) {
// lead byte: combine its length bits with those collected from the continuation bytes
l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3;
z -= l; cnt[*p&7] -= l;
l = 0; t = 0;
} else {
l |= (*p&0x3fL) << t;
t += 6;
}
}
// re-decode the run that contains position $x so that (c,l) and $p are set as in the forward case
q = p;
rle_dec1(p, c, l);
z += l; cnt[c] += l;
}
// refresh the cache: $q is the start of the run (c,l); bc holds the counts in front of it
*beg = q - block;
memcpy(bc, cnt, 48);
// NOTE(review): on the x == beg_l path c is still -1 here, so this touches bc[-1] (with l == 0)
bc[c] -= l;
n_bytes = p - q;
if (x == z && a != c && p < end) { // then try the next run
int tc;
int64_t tl;
q = p;
rle_dec1(q, tc, tl);
if (a == tc)
c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
}
if (z != x) cnt[c] -= z - x;
// $pre: number of symbols of the current run that stay in front of the insertion point
pre = x - (z - l); p -= n_bytes;
if (a == c) { // insert to the same run
n_bytes2 = rle_enc1(tmp, c, l + rl);
} else if (x == z) { // at the end; append to the existing run
p += n_bytes; n_bytes = 0;
n_bytes2 = rle_enc1(tmp, a, rl);
} else { // break the current run
n_bytes2 = rle_enc1(tmp, c, pre);
n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
}
if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
memcpy(p, tmp, n_bytes2);
diff = n_bytes2 - n_bytes;
}
return (*nptr += diff);
}
/*
 * Uncached front end of rle_insert_cached(): performs the insertion with a
 * throw-away, zero-initialised cache and returns whatever the cached variant
 * returns.
 */
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
{
    int64_t scratch_counts[6] = { 0, 0, 0, 0, 0, 0 };
    int scratch_offset = 0;

    return rle_insert_cached(block, x, a, rl, cnt, ec, &scratch_offset, scratch_counts);
}
/*
 * Split an encoded block roughly in half. The cut is placed at the start of
 * the run covering the middle byte of the payload; that run and everything
 * after it are copied to new_block, and both 2-byte size headers are updated.
 */
void rle_split(uint8_t *block, uint8_t *new_block)
{
    uint16_t *old_len = (uint16_t*)block;
    uint16_t *new_len = (uint16_t*)new_block;
    uint8_t *payload = block + 2;
    const int n_bytes = *old_len;
    uint8_t *stop = payload + n_bytes;
    uint8_t *cut = payload + (n_bytes >> 1);

    /* never cut inside a multi-byte run: back up over continuation bytes */
    for (; (*cut >> 6) == 2; --cut) {}

    memcpy(new_block + 2, cut, stop - cut);
    *new_len = stop - cut;
    *old_len = cut - payload;
}
/*
 * Add the length of every run stored in block to cnt[symbol].
 * cnt is accumulated into, not cleared.
 */
void rle_count(const uint8_t *block, int64_t cnt[6])
{
    const uint8_t *cur = block + 2;
    const uint8_t *const stop = cur + *(uint16_t*)block;

    for (; cur < stop; ) {
        int sym;
        int64_t len;
        rle_dec1(cur, sym, len); /* advances cur past the run */
        cnt[sym] += len;
    }
}
/*
 * Print the runs of a block to stdout followed by a newline.
 * With expand != 0 every symbol is written out individually; otherwise each
 * run is printed as the symbol followed by its length.
 */
void rle_print(const uint8_t *block, int expand)
{
    const uint16_t *size = (const uint16_t*)block;
    const uint8_t *cur = block + 2;
    const uint8_t *const stop = block + 2 + *size;

    while (cur < stop) {
        int sym;
        int64_t len, k;
        rle_dec1(cur, sym, len);
        if (!expand) {
            printf("%c%ld", "$ACGTN"[sym], (long)len);
            continue;
        }
        for (k = 0; k < len; ++k)
            putchar("$ACGTN"[sym]);
    }
    putchar('\n');
}
/*
 * Per-symbol rank queries inside one encoded block.
 *
 *   block  leaf block (2-byte payload size followed by the encoded runs)
 *   x, y   query positions; y is raised to x when it is smaller
 *   cx     the counts of the first x symbols are ADDED to cx[0..5]
 *   cy     if non-NULL, the counts of the first y symbols are ADDED to cy[0..5]
 *   ec     per-symbol counts of the whole block
 *
 * Nothing is written when the block is empty. Depending on where x and y lie,
 * the block is scanned forwards from its start or backwards from its end.
 */
void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6])
{
    int a;
    int64_t tot, cnt[6];
    const uint8_t *p;

    y = y >= x? y : x;
    tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
    if (tot == 0) return;
    if (x <= (tot - y) + (tot>>3)) { /* forward scan */
        int c = 0;
        int64_t l, z = 0;
        memset(cnt, 0, 48);
        p = block + 2;
        while (z < x) {
            rle_dec1(p, c, l);
            z += l; cnt[c] += l;
        }
        for (a = 0; a != 6; ++a) cx[a] += cnt[a];
        cx[c] -= z - x; /* drop the part of the last run that lies beyond x */
        if (cy) {
            while (z < y) {
                rle_dec1(p, c, l);
                z += l; cnt[c] += l;
            }
            for (a = 0; a != 6; ++a) cy[a] += cnt[a];
            cy[c] -= z - y;
        }
    } else { /* backward scan */
/* Walk backwards run by run until fewer than _x symbols precede p. The macro
 * uses the locals p, z, l, t and cnt declared below. Its last line must NOT end
 * with a backslash, otherwise the following declaration becomes part of it. */
#define move_backward(_x) \
        while (z >= (_x)) { \
            --p; \
            if (*p>>6 != 2) { \
                l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \
                z -= l; cnt[*p&7] -= l; \
                l = 0; t = 0; \
            } else { \
                l |= (*p&0x3fL) << t; \
                t += 6; \
            } \
        }

        int t = 0;
        int64_t l = 0, z = tot;
        memcpy(cnt, ec, 48);
        p = block + 2 + *(const uint16_t*)block;
        if (cy) {
            move_backward(y)
            for (a = 0; a != 6; ++a) cy[a] += cnt[a];
            cy[*p&7] += y - z; /* add back the part of the run at p that precedes y */
        }
        move_backward(x)
        for (a = 0; a != 6; ++a) cx[a] += cnt[a];
        cx[*p&7] += x - z;
#undef move_backward
    }
}

77
rle.h 100644
View File

@ -0,0 +1,77 @@
#ifndef RLE6_H_
#define RLE6_H_
#include <stdint.h>
// Branch-prediction hint; expands to the plain expression on compilers without __builtin_expect.
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#else
#define LIKELY(x) (x)
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Insert $rl copies of symbol $a after $x symbols; ($beg, $bc) is a position cache kept between calls.
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]);
// Same as rle_insert_cached() but without a cache.
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);
// Move the second half of $block to $new_block.
void rle_split(uint8_t *block, uint8_t *new_block);
// Add the per-symbol counts of $block to $cnt.
void rle_count(const uint8_t *block, int64_t cnt[6]);
// Add the per-symbol counts of the first $x (and, if $cy is non-NULL, first $y) symbols to $cx (and $cy).
void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]);
#define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)
// Print the runs of $block to stdout; $expand != 0 prints every symbol individually.
void rle_print(const uint8_t *block, int expand);
#ifdef __cplusplus
}
#endif
/******************
*** 43+3 codec ***
******************/
// A run is stored in 1, 2, 4 or 8 bytes; the low 3 bits of the first byte hold the symbol.
extern const uint8_t rle_auxtab[8];
// rope.c splits a leaf once its payload size plus RLE_MIN_SPACE exceeds the block length
#define RLE_MIN_SPACE 18
// the first two bytes of a block, viewed as the uint16_t payload size
#define rle_nptr(block) ((uint16_t*)(block))
// decode one run (c,l) and move the pointer p
//   p  pointer to the lead byte of a run; advanced past the run (evaluated several times, so it must be a plain lvalue)
//   c  receives the symbol (low 3 bits of the lead byte)
//   l  receives the run length
// Three layouts are distinguished by the top bits of the lead byte:
//   0xxxxccc            one byte, length in bits 3..6
//   110xxccc + 1 byte   two bytes, 2 length bits in the lead byte and 6 in the continuation byte
//   111nxccc + 3|7 bytes  bit 4 selects 4 or 8 bytes in total; 1 length bit in the lead byte and 6 per continuation byte
// The macro declares a local named n, so the arguments must not be expressions involving a variable called n.
#define rle_dec1(p, c, l) do { \
(c) = *(p) & 7; \
if (LIKELY((*(p)&0x80) == 0)) { \
(l) = *(p)++ >> 3; \
} else if (LIKELY(*(p)>>5 == 6)) { \
(l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
(p) += 2; \
} else { \
int n = ((*(p)&0x10) >> 2) + 4; \
(l) = *(p)++ >> 3 & 1; \
while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \
} \
} while (0)
/*
 * Encode the run (c, l) at p and return the number of bytes written (1, 2, 4 or 8).
 *
 *   l < 2^4   one byte:    length in bits 3..6, symbol in bits 0..2
 *   l < 2^8   two bytes:   lead byte 0xC0 plus 2 length bits, then one continuation byte
 *   l < 2^19  four bytes:  lead byte 0xE0 plus 1 length bit, then three continuation bytes
 *   else      eight bytes: lead byte 0xF0 plus 1 length bit, then seven continuation bytes
 *
 * Every continuation byte is 0x80 plus 6 length bits, most significant first.
 * The caller must provide room for up to 8 bytes.
 */
static inline int rle_enc1(uint8_t *p, int c, int64_t l)
{
    int total, k;

    if (l < 1LL<<4) {
        p[0] = l << 3 | c;
        return 1;
    }
    if (l < 1LL<<8) {
        p[0] = 0xC0 | l >> 6 << 3 | c;
        p[1] = 0x80 | (l & 0x3f);
        return 2;
    }
    if (l < 1LL<<19) {
        total = 4;
        p[0] = 0xE0 | l >> 18 << 3 | c;
    } else {
        total = 8;
        p[0] = 0xF0 | l >> 42 << 3 | c;
    }
    /* continuation byte k carries the 6 bits starting at bit 6*(total-1-k) */
    for (k = 1; k < total; ++k)
        p[k] = 0x80 | (l >> (6 * (total - 1 - k)) & 0x3f);
    return total;
}
#endif

318
rope.c 100644
View File

@ -0,0 +1,318 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <zlib.h>
#include "rle.h"
#include "rope.h"
/*******************
*** Memory Pool ***
*******************/
#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
/*
 * Memory pool for fast and compact allocation of fixed-size elements.
 * Elements are carved out of zero-filled chunks and are never freed
 * individually; everything is released at once by mp_destroy().
 */
typedef struct {
    int size, i, n_elems; // element size in bytes; next free slot in the current chunk; slots per chunk
    int64_t top, max;     // index of the current chunk (-1: none yet); capacity of mem[]
    uint8_t **mem;        // the chunks
} mempool_t;

/*
 * Create a pool of elements of `size` bytes.
 * Returns NULL if size is not positive or if memory is exhausted.
 */
static mempool_t *mp_init(int size)
{
    mempool_t *mp;
    if (size <= 0) return 0; // avoid the division by zero below
    mp = calloc(1, sizeof(mempool_t));
    if (mp == 0) return 0;
    mp->size = size;
    mp->n_elems = MP_CHUNK_SIZE / size;
    if (mp->n_elems < 1) mp->n_elems = 1; // element larger than a chunk: one element per chunk
    mp->i = mp->n_elems; // forces the first mp_alloc() to open a chunk
    mp->top = -1;
    return mp;
}

/* Release all chunks and the pool itself; a NULL pool is ignored. */
static void mp_destroy(mempool_t *mp)
{
    int64_t i;
    if (mp == 0) return;
    for (i = 0; i <= mp->top; ++i) free(mp->mem[i]);
    free(mp->mem); free(mp);
}

/*
 * Hand out the next zero-filled element.
 * Returns NULL (leaving the pool intact) if mp is NULL or memory is exhausted.
 */
static inline void *mp_alloc(mempool_t *mp)
{
    if (mp == 0) return 0;
    if (mp->i == mp->n_elems) { // current chunk is used up (or there is none yet)
        uint8_t *chunk;
        if (mp->top + 1 == mp->max) { // grow the chunk table
            int64_t new_max = mp->max? mp->max<<1 : 1;
            uint8_t **tmp = realloc(mp->mem, sizeof(void*) * new_max);
            if (tmp == 0) return 0; // mp->mem is still valid
            mp->mem = tmp;
            mp->max = new_max;
        }
        chunk = calloc(mp->n_elems, mp->size);
        if (chunk == 0) return 0;
        mp->mem[++mp->top] = chunk;
        mp->i = 0;
    }
    return mp->mem[mp->top] + (mp->i++) * mp->size;
}
/***************
*** B+ rope ***
***************/
rope_t *rope_init(int max_nodes, int block_len)
{
rope_t *rope;
rope = calloc(1, sizeof(rope_t));
if (block_len < 32) block_len = 32;
rope->max_nodes = (max_nodes+ 1)>>1<<1;
rope->block_len = (block_len + 7) >> 3 << 3;
rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
rope->leaf = mp_init(rope->block_len);
rope->root = mp_alloc(rope->node);
rope->root->n = 1;
rope->root->is_bottom = 1;
rope->root->p = mp_alloc(rope->leaf);
return rope;
}
/* Release both memory pools of the rope (nodes and leaves) and the rope itself. */
void rope_destroy(rope_t *rope)
{
    mempool_t *leaves = rope->leaf;
    mempool_t *nodes = rope->node;

    mp_destroy(nodes);
    mp_destroy(leaves);
    free(rope);
}
// Split the child that $v points to into two halves and insert the new sibling $w right after $v.
//   u  first node of the bucket holding $v, or NULL when the root itself has to be split
//   v  node whose child is split; its counts are reduced by what moves to $w
// Returns $v (the freshly created root when u was NULL).
static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v)
{ // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
int j, i = v - u;
rpnode_t *w; // $w is the sibling of $v
if (u == 0) { // only happens at the root; add a new root
u = v = mp_alloc(rope->node);
v->n = 1; v->p = rope->root; // the new root has the old root as the only child
memcpy(v->c, rope->c, 48);
for (j = 0; j < 6; ++j) v->l += v->c[j];
rope->root = v;
}
if (i != u->n - 1) // then make room for a new node
memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
++u->n; w = v + 1;
memset(w, 0, sizeof(rpnode_t));
// the new sibling gets a fresh leaf block or a fresh node bucket, matching the level of $u
w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node);
if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p;
rle_split(p, q);
rle_count(q, w->c);
} else { // $v->p is a node, not a string
rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins
// move the last max_nodes/2 children of $p into the new bucket $q
p->n -= rope->max_nodes>>1;
memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1));
q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy()
q->is_bottom = p->is_bottom;
for (i = 0; i < q->n; ++i)
for (j = 0; j < 6; ++j)
w->c[j] += q[i].c[j];
}
for (j = 0; j < 6; ++j) // compute $w->l and update $v->c
w->l += w->c[j], v->c[j] -= w->c[j];
v->l -= w->l; // update $v->l
return v;
}
// Insert a run of $rl copies of symbol $a after the first $x symbols of the rope.
//   cache  optional position cache for consecutive insertions into the same leaf; may be NULL
// Returns the number of occurrences of $a among the first $x symbols, i.e. rank(a, x).
int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache)
{ // insert $a after $x symbols in $rope and the returns rank(a, x)
rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
// $y: number of symbols in front of the current subtree; $z: number of $a among them
int64_t y = 0, z = 0, cnt[6];
int n_runs;
do { // top-down update. Searching and node splitting are done together in one pass.
if (p->n == rope->max_nodes) { // node is full; split
v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v
y += v->l, z += v->c[a], ++v, p = v->p;
}
u = p;
if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
p += p->n - 1; y += v->l; z += v->c[a];
for (; y >= x; --p) y -= p->l, z -= p->c[a];
++p;
} else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly
assert(p - u < u->n);
if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split
v = p; p = p->p; // descend
} while (!u->is_bottom);
rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
// here $p is the leaf block and $v the bottom-level node pointing to it
if (cache) {
// a cache that belongs to another leaf is reset before use
if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
cache->p = (uint8_t*)p;
} else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
z += cnt[a];
v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work
// $n_runs is the leaf payload size returned by rle_insert*(); split the leaf when it is nearly full
if (n_runs + RLE_MIN_SPACE > rope->block_len) {
split_node(rope, u, v);
if (cache) memset(cache, 0, sizeof(rpcache_t));
}
return z;
}
// Descend from the root to the leaf that covers position $x.
//   cx    output: cleared, then set to the per-symbol counts of all symbols in front of that leaf
//   rest  output: offset of $x inside the leaf
// Returns the bottom-level node that points to the leaf.
static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest)
{
rpnode_t *u, *v = 0, *p = rope->root;
int64_t y = 0; // number of symbols in front of the current subtree
int a;
memset(cx, 0, 48);
do {
u = p;
if (v && x - y > v->l>>1) {
// the target is in the second half of $v: start after its last child and walk backwards
p += p->n - 1; y += v->l;
for (a = 0; a != 6; ++a) cx[a] += v->c[a];
for (; y >= x; --p) {
y -= p->l;
for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
}
++p;
} else {
// walk forwards, skipping children that end before $x
for (; y + p->l < x; ++p) {
y += p->l;
for (a = 0; a != 6; ++a) cx[a] += p->c[a];
}
}
v = p; p = p->p;
} while (!u->is_bottom);
*rest = x - y;
return v;
}
/*
 * Per-symbol ranks at one or two positions of the rope.
 *   cx  receives the counts of the first x symbols
 *   cy  receives the counts of the first y symbols; skipped when cy is NULL or y < x
 * When both positions fall into the same leaf, the leaf is scanned only once.
 */
void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
{
    int64_t off;
    rpnode_t *leaf = rope_count_to_leaf(rope, x, cx, &off);
    const int want_y = !(y < x || cy == 0);

    if (want_y && off + (y - x) <= leaf->l) {
        /* both queries end in this leaf: share the prefix counts */
        memcpy(cy, cx, 48);
        rle_rank2a((const uint8_t*)leaf->p, off, off + (y - x), cx, cy, leaf->c);
        return;
    }
    rle_rank1a((const uint8_t*)leaf->p, off, cx, leaf->c);
    if (!want_y) return;
    leaf = rope_count_to_leaf(rope, y, cy, &off);
    rle_rank1a((const uint8_t*)leaf->p, off, cy, leaf->c);
}
/*********************
*** Rope iterator ***
*********************/
/*
 * Reset the iterator and position it on the leftmost leaf: the path from the
 * root down to the leftmost bottom-level bucket is recorded in i->pa[0..d].
 */
void rope_itr_first(const rope_t *rope, rpitr_t *i)
{
    const rpnode_t *node = rope->root;
    int depth = 0;

    memset(i, 0, sizeof(rpitr_t));
    i->rope = rope;
    i->pa[0] = node;
    while (!node->is_bottom) { /* follow the first child at every level */
        node = node->p;
        i->pa[++depth] = node;
    }
    i->d = depth;
}
// Return the leaf block the iterator currently points to and advance the iterator to the next leaf.
// Returns 0 once all leaves have been visited (the depth i->d has dropped below zero).
const uint8_t *rope_itr_next_block(rpitr_t *i)
{
const uint8_t *ret;
assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
if (i->d < 0) return 0;
ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
// move to the next sibling; when a bucket is exhausted, reset its index and go up one level
while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking
if (i->d >= 0)
while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf
++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
return ret;
}
/***********
*** I/O ***
***********/
/*
 * Print the subtree rooted at bucket p to stdout in a parenthesised,
 * comma-separated form. Leaves are expanded symbol by symbol; internal
 * buckets recurse into their children.
 */
void rope_print_node(const rpnode_t *p)
{
    int k;

    putchar('(');
    for (k = 0; k < p->n; ++k) {
        if (k) putchar(',');
        if (!p->is_bottom) {
            rope_print_node(p[k].p);
        } else {
            uint8_t *leaf = (uint8_t*)p[k].p;
            const uint8_t *cur = leaf + 2, *stop = leaf + 2 + *rle_nptr(leaf);
            while (cur < stop) {
                int sym = 0;
                int64_t rep, len;
                rle_dec1(cur, sym, len);
                for (rep = 0; rep < len; ++rep) putchar("$ACGTN"[sym]);
            }
        }
    }
    putchar(')');
}
/*
 * Serialise the subtree rooted at bucket p.
 * Layout: 1 byte is_bottom, 2 bytes child count, then per child either
 * (bottom level) the six 8-byte counts followed by the leaf block including
 * its 2-byte size header, or (internal level) the recursively dumped child.
 * Write errors are not detected.
 */
void rope_dump_node(const rpnode_t *p, FILE *fp)
{
    const uint8_t bottom_flag = p->is_bottom;
    const int16_t n_children = p->n;
    int16_t k;

    fwrite(&bottom_flag, 1, 1, fp);
    fwrite(&n_children, 2, 1, fp);
    for (k = 0; k < n_children; ++k) {
        const rpnode_t *child = &p[k];
        if (!bottom_flag) {
            rope_dump_node(child->p, fp);
            continue;
        }
        fwrite(child->c, 8, 6, fp);
        fwrite(child->p, 1, *rle_nptr(child->p) + 2, fp);
    }
}
/*
 * Serialise a rope: the two 4-byte parameters max_nodes and block_len
 * followed by the tree as written by rope_dump_node().
 */
void rope_dump(const rope_t *r, FILE *fp)
{
    const int32_t *max_nodes = &r->max_nodes;
    const int32_t *block_len = &r->block_len;

    fwrite(max_nodes, 4, 1, fp);
    fwrite(block_len, 4, 1, fp);
    rope_dump_node(r->root, fp);
}
// Read back one bucket (and, recursively, its subtree) written by rope_dump_node().
//   r   rope under construction; its node and leaf pools supply the memory
//   c   output: per-symbol counts summed over all children of the restored bucket
// Returns the restored bucket.
// NOTE(review): the fread() results and the values read (child count, leaf payload size) are
// not validated here; a truncated or corrupt file is not detected.
rpnode_t *rope_restore_node(const rope_t *r, FILE *fp, int64_t c[6])
{
uint8_t is_bottom, a;
int16_t i, n;
rpnode_t *p;
fread(&is_bottom, 1, 1, fp);
fread(&n, 2, 1, fp);
p = mp_alloc(r->node);
p->is_bottom = is_bottom, p->n = n;
if (is_bottom) {
for (i = 0; i < n; ++i) {
uint16_t *q;
p[i].p = mp_alloc(r->leaf);
q = rle_nptr(p[i].p);
// per leaf: six 8-byte counts, the 2-byte payload size, then the payload itself
fread(p[i].c, 8, 6, fp);
fread(q, 2, 1, fp);
fread(q + 1, 1, *q, fp);
}
} else {
// counts of internal children are not stored; the recursive call recomputes them into p[i].c
for (i = 0; i < n; ++i)
p[i].p = rope_restore_node(r, fp, p[i].c);
}
memset(c, 0, 48);
// rebuild each child's total length and the bucket-wide counts
for (i = 0; i < n; ++i) {
p[i].l = 0;
for (a = 0; a < 6; ++a)
c[a] += p[i].c[a], p[i].l += p[i].c[a];
}
return p;
}
rope_t *rope_restore(FILE *fp)
{
rope_t *r;
r = calloc(1, sizeof(rope_t));
fread(&r->max_nodes, 4, 1, fp);
fread(&r->block_len, 4, 1, fp);
r->node = mp_init(sizeof(rpnode_t) * r->max_nodes);
r->leaf = mp_init(r->block_len);
r->root = rope_restore_node(r, fp, r->c);
return r;
}

58
rope.h 100644
View File

@ -0,0 +1,58 @@
#ifndef ROPE_H_
#define ROPE_H_
#include <stdint.h>
#include <stdio.h>
#define ROPE_MAX_DEPTH 80 // capacity of the path arrays in rpitr_t
#define ROPE_DEF_MAX_NODES 64
#define ROPE_DEF_BLOCK_LEN 512
// One entry of a node bucket. A bucket is an array of these; bucket-wide fields live in its first entry.
typedef struct rpnode_s {
struct rpnode_s *p; // child; at the bottom level, $p points to a run-length encoded string whose first 2 bytes give the size of the encoded payload in bytes
uint64_t l:54, n:9, is_bottom:1; // $l: number of symbols below this entry; $n and $is_bottom are only set for the first node in a bucket
int64_t c[6]; // marginal counts
} rpnode_t;
// The rope itself: a B+ tree whose leaves are run-length encoded blocks.
typedef struct {
int32_t max_nodes, block_len; // both MUST BE even numbers
int64_t c[6]; // marginal counts
rpnode_t *root;
void *node, *leaf; // memory pool
} rope_t;
// Iterator over the leaf blocks, in order.
typedef struct {
const rope_t *rope; // the rope
const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes
int ia[ROPE_MAX_DEPTH]; // index in the parent nodes
int d; // the current depth in the B+-tree
} rpitr_t;
// Position cache passed to rope_insert_run() to speed up consecutive insertions into one leaf.
typedef struct {
int beg; // byte offset of a run inside the leaf payload
int64_t bc[6]; // marginal counts in front of that run
uint8_t *p; // the leaf the cache refers to
} rpcache_t;
#ifdef __cplusplus
extern "C" {
#endif
// Create an empty rope; $max_nodes and $block_len are rounded up internally.
rope_t *rope_init(int max_nodes, int block_len);
// Release a rope and all memory it owns.
void rope_destroy(rope_t *rope);
// Insert $rl copies of symbol $a after $x symbols; returns rank(a, x). $cache may be NULL.
int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache);
// Compute the marginal counts of the first $x (and optionally the first $y) symbols.
void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy);
#define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0)
// Position the iterator on the leftmost leaf.
void rope_itr_first(const rope_t *rope, rpitr_t *i);
// Return the current leaf block and advance; returns 0 after the last leaf.
const uint8_t *rope_itr_next_block(rpitr_t *i);
// Print a subtree to stdout in expanded form.
void rope_print_node(const rpnode_t *p);
// Write a rope to / read a rope from a stream.
void rope_dump(const rope_t *r, FILE *fp);
rope_t *rope_restore(FILE *fp);
#ifdef __cplusplus
}
#endif
#endif

119
scalar_sse.h 100644
View File

@ -0,0 +1,119 @@
#ifndef SCALAR_SSE_H
#define SCALAR_SSE_H
#include <assert.h>
#include <stdint.h>
#include <string.h>

/*
 * Portable scalar stand-ins for a handful of SSE2 intrinsics, for targets
 * without SSE2. A 128-bit register is modelled as 16 unsigned bytes or
 * 8 signed 16-bit lanes, both in native byte order.
 */
typedef union m128i {
    uint8_t u8[16];
    int16_t i16[8];
} __m128i;

/* Broadcast a 32-bit value to all four 32-bit lanes. */
static inline __m128i _mm_set1_epi32(int32_t n) {
    __m128i r;
    int i;
    /* copy the value lane by lane; a byte-wise memset would only be correct for n == 0 */
    for (i = 0; i < 4; i++) memcpy(&r.u8[4 * i], &n, sizeof n);
    return r;
}

/* Load / store a full register. */
static inline __m128i _mm_load_si128(const __m128i *ptr) { __m128i r; memcpy(&r, ptr, sizeof r); return r; }
static inline void _mm_store_si128(__m128i *ptr, __m128i a) { memcpy(ptr, &a, sizeof a); }

/* Return non-zero iff all 128 bits are zero. */
static inline int m128i_allzero(__m128i a) {
    static const char zero[] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
    return memcmp(&a, zero, sizeof a) == 0;
}

/* Shift the register left by n bytes (towards higher byte indices), filling with zeros. */
static inline __m128i _mm_slli_si128(__m128i a, int n) {
    int i;
    if (n <= 0) return a;
    if (n >= 16) { /* everything is shifted out; also keeps the memmove size non-negative */
        memset(&a, 0, sizeof a);
        return a;
    }
    memmove(&a.u8[n], &a.u8[0], 16 - n);
    for (i = 0; i < n; i++) a.u8[i] = 0;
    return a;
}

/* Unsigned 8-bit addition, saturating at 255. */
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 16; i++) {
        uint16_t aa = a.u8[i];
        aa += b.u8[i];
        a.u8[i] = (aa < 256)? aa : 255;
    }
    return a;
}

/* Lane-wise unsigned 8-bit maximum. */
static inline __m128i _mm_max_epu8(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 16; i++)
        if (a.u8[i] < b.u8[i]) a.u8[i] = b.u8[i];
    return a;
}

/* Largest of the 16 unsigned bytes. */
static inline uint8_t m128i_max_u8(__m128i a) {
    uint8_t max = 0;
    int i;
    for (i = 0; i < 16; i++)
        if (max < a.u8[i]) max = a.u8[i];
    return max;
}

/* Broadcast one byte to all 16 byte lanes. */
static inline __m128i _mm_set1_epi8(int8_t n) { __m128i r; memset(&r, n, sizeof r); return r; }

/* Unsigned 8-bit subtraction, saturating at 0. */
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 16; i++) {
        int16_t aa = a.u8[i];
        aa -= b.u8[i];
        a.u8[i] = (aa >= 0)? aa : 0;
    }
    return a;
}

/* Signed 16-bit addition, saturating at both -32768 and 32767. */
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++) {
        int32_t aa = a.i16[i];
        aa += b.i16[i];
        if (aa > 32767) aa = 32767;
        else if (aa < -32768) aa = -32768; /* the lower bound must be clamped as well */
        a.i16[i] = (int16_t)aa;
    }
    return a;
}

/* Signed 16-bit compare: a lane becomes all ones where a > b and zero elsewhere. */
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++)
        a.i16[i] = (a.i16[i] > b.i16[i])? -1 : 0;
    return a;
}

/* Lane-wise signed 16-bit maximum. */
static inline __m128i _mm_max_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++)
        if (a.i16[i] < b.i16[i]) a.i16[i] = b.i16[i];
    return a;
}

/* Broadcast a 16-bit value to all 8 lanes. */
static inline __m128i _mm_set1_epi16(int16_t n) {
    __m128i r;
    r.i16[0] = r.i16[1] = r.i16[2] = r.i16[3] =
    r.i16[4] = r.i16[5] = r.i16[6] = r.i16[7] = n;
    return r;
}

/* Largest of the 8 signed 16-bit lanes. */
static inline int16_t m128i_max_s16(__m128i a) {
    int16_t max = -32768;
    int i;
    for (i = 0; i < 8; i++)
        if (max < a.i16[i]) max = a.i16[i];
    return max;
}

/* Unsigned 16-bit subtraction, saturating at 0. The lanes are reinterpreted
 * as unsigned so that values of 0x8000 and above are handled correctly. */
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++) {
        uint16_t ua, ub, d;
        memcpy(&ua, &a.i16[i], sizeof ua);
        memcpy(&ub, &b.i16[i], sizeof ub);
        d = (ua > ub)? (uint16_t)(ua - ub) : 0;
        memcpy(&a.i16[i], &d, sizeof d);
    }
    return a;
}
#endif

306
utils.c 100644
View File

@ -0,0 +1,306 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#define FSYNC_ON_FLUSH
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <errno.h>
#ifdef FSYNC_ON_FLUSH
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <sys/resource.h>
#include <sys/time.h>
#include "utils.h"
#include "ksort.h"
// Ordering for pair64_t: by x first, then by y.
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
// Instantiate the ksort.h routines for pair64_t (suffix 128) and for uint64_t (suffix 64).
KSORT_INIT(128, pair64_t, pair64_lt)
KSORT_INIT(64, uint64_t, ks_lt_generic)
#include "kseq.h"
// Instantiate the kseq.h parser for gzFile streams, reading through err_gzread().
KSEQ_INIT2(, gzFile, err_gzread)
/********************
* System utilities *
********************/
/*
 * fopen() wrapper that never returns NULL.
 * The file name "-" selects stdin when the mode contains 'r' and stdout
 * otherwise. On failure the program is terminated via err_fatal(), with
 * func used as the message prefix.
 */
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
    FILE *stream;

    if (strcmp(fn, "-") == 0) {
        if (strstr(mode, "r")) return stdin;
        return stdout;
    }
    stream = fopen(fn, mode);
    if (stream == 0)
        err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
    return stream;
}
/*
 * freopen() wrapper: re-associates fp with the file fn and returns fp.
 * Terminates the program via err_fatal() if the file cannot be opened.
 */
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
{
    FILE *reopened = freopen(fn, mode, fp);

    if (reopened == 0)
        err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
    return fp;
}
/*
 * gzopen() wrapper that never returns NULL.
 * The file name "-" wraps stdin (mode contains 'r') or stdout with gzdopen().
 * Any failure terminates the program via err_fatal().
 */
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
{
    gzFile gz;

    if (strcmp(fn, "-") != 0) {
        gz = gzopen(fn, mode);
        if (gz == 0)
            err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory");
        return gz;
    }
    gz = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode);
    /* According to zlib.h, running out of memory is the only reason gzdopen can fail */
    if (!gz) err_fatal(func, "Out of memory");
    return gz;
}
/*
 * Print "[header] " followed by the printf-style message and a newline to
 * stderr, then exit with EXIT_FAILURE. Does not return.
 */
void err_fatal(const char *header, const char *fmt, ...)
{
    va_list ap;

    fprintf(stderr, "[%s] ", header);
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fprintf(stderr, "\n");
    exit(EXIT_FAILURE);
}
/*
 * Like err_fatal(), but the message ends in " Abort!" and the process is
 * terminated with abort() instead of exit(). Does not return.
 */
void err_fatal_core(const char *header, const char *fmt, ...)
{
    va_list ap;

    fprintf(stderr, "[%s] ", header);
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fprintf(stderr, " Abort!\n");
    abort();
}
/* Print "[func] msg" to stderr and exit with EXIT_FAILURE. Does not return. */
void _err_fatal_simple(const char *func, const char *msg)
{
    const char *where = func;
    const char *what = msg;

    fprintf(stderr, "[%s] %s\n", where, what);
    exit(EXIT_FAILURE);
}
/* Print "[func] msg Abort!" to stderr and terminate with abort(). Does not return. */
void _err_fatal_simple_core(const char *func, const char *msg)
{
    const char *where = func;
    const char *what = msg;

    fprintf(stderr, "[%s] %s Abort!\n", where, what);
    abort();
}
/*
 * fwrite() wrapper: terminates the program unless all nmemb items were
 * written; otherwise returns the number of items written.
 */
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
    const size_t n_written = fwrite(ptr, size, nmemb, stream);

    if (n_written != nmemb) {
        _err_fatal_simple("fwrite", strerror(errno));
    }
    return n_written;
}
/*
 * fread() wrapper for reads that must succeed completely: a short read is
 * fatal, reported as the stream error or as an unexpected end of file.
 */
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
{
    const size_t n_read = fread(ptr, size, nmemb, stream);

    if (n_read != nmemb) {
        const char *why = ferror(stream) ? strerror(errno) : "Unexpected end of file";
        _err_fatal_simple("fread", why);
    }
    return n_read;
}
/*
 * gzread() wrapper: a negative result is fatal. The message is the system
 * error text when zlib reports Z_ERRNO and zlib's own message otherwise.
 * Returns the number of bytes read.
 */
int err_gzread(gzFile file, void *ptr, unsigned int len)
{
    const int n_read = gzread(file, ptr, len);

    if (n_read >= 0) return n_read;
    {
        int zerr = 0;
        const char *zmsg = gzerror(file, &zerr);
        _err_fatal_simple("gzread", Z_ERRNO == zerr ? strerror(errno) : zmsg);
    }
    return n_read;
}
/* fseek() wrapper: any failure is fatal; returns the (zero) result of fseek(). */
int err_fseek(FILE *stream, long offset, int whence)
{
    const int rc = fseek(stream, offset, whence);

    if (rc != 0)
        _err_fatal_simple("fseek", strerror(errno));
    return rc;
}
/* ftell() wrapper: a result of -1 is fatal; otherwise returns the stream position. */
long err_ftell(FILE *stream)
{
    const long pos = ftell(stream);

    if (pos == -1)
        _err_fatal_simple("ftell", strerror(errno));
    return pos;
}
/*
 * printf() wrapper writing to stdout: an output error is fatal.
 * errno is captured right after the write so that va_end() cannot disturb it.
 * Returns the number of characters written.
 */
int err_printf(const char *format, ...)
{
    va_list ap;
    int n_written, saved_errno;

    va_start(ap, format);
    n_written = vfprintf(stdout, format, ap);
    saved_errno = errno;
    va_end(ap);
    if (n_written < 0)
        _err_fatal_simple("vfprintf(stdout)", strerror(saved_errno));
    return n_written;
}
/*
 * fprintf() wrapper: an output error is fatal.
 * errno is captured right after the write so that va_end() cannot disturb it.
 * Returns the number of characters written.
 */
int err_fprintf(FILE *stream, const char *format, ...)
{
    va_list ap;
    int n_written, saved_errno;

    va_start(ap, format);
    n_written = vfprintf(stream, format, ap);
    saved_errno = errno;
    va_end(ap);
    if (n_written < 0)
        _err_fatal_simple("vfprintf", strerror(saved_errno));
    return n_written;
}
/* putc() wrapper: EOF is fatal; otherwise returns the character written. */
int err_fputc(int c, FILE *stream)
{
    const int written = putc(c, stream);

    if (written == EOF)
        _err_fatal_simple("fputc", strerror(errno));
    return written;
}
/* fputs() wrapper: EOF is fatal; otherwise returns the result of fputs(). */
int err_fputs(const char *s, FILE *stream)
{
    const int rc = fputs(s, stream);

    if (rc == EOF)
        _err_fatal_simple("fputs", strerror(errno));
    return rc;
}
/* puts() wrapper (string plus newline to stdout): EOF is fatal. */
int err_puts(const char *s)
{
    const int rc = puts(s);

    if (rc == EOF)
        _err_fatal_simple("puts", strerror(errno));
    return rc;
}
/*
 * fflush() wrapper: a flush failure is fatal. When FSYNC_ON_FLUSH is defined
 * and the stream refers to a regular file, the data is additionally forced to
 * storage with fsync(), whose failure is fatal as well.
 */
int err_fflush(FILE *stream)
{
    const int rc = fflush(stream);

    if (rc != 0) _err_fatal_simple("fflush", strerror(errno));
#ifdef FSYNC_ON_FLUSH
    /* fflush() only hands the data to the kernel. On remote filesystems
       (e.g. NFS, lustre) an error may still occur while the kernel copies
       the buffered data to the file server, so fsync() is called to catch
       it -- but only for regular files. */
    {
        struct stat info;
        if (fstat(fileno(stream), &info) != 0)
            _err_fatal_simple("fstat", strerror(errno));
        if (S_ISREG(info.st_mode) && fsync(fileno(stream)) != 0)
            _err_fatal_simple("fsync", strerror(errno));
    }
#endif
    return rc;
}
/* fclose() wrapper: a failure to close (and thereby flush) the stream is fatal. */
int err_fclose(FILE *stream)
{
    const int rc = fclose(stream);

    if (rc != 0) {
        _err_fatal_simple("fclose", strerror(errno));
    }
    return rc;
}
/*
 * gzclose() wrapper: any result other than Z_OK is fatal. The message is the
 * system error text for Z_ERRNO and zlib's description of the code otherwise.
 */
int err_gzclose(gzFile file)
{
    const int zrc = gzclose(file);

    if (zrc != Z_OK) {
        const char *why = Z_ERRNO == zrc ? strerror(errno) : zError(zrc);
        _err_fatal_simple("gzclose", why);
    }
    return zrc;
}
/*********
* Timer *
*********/
/* CPU time consumed by this process so far (user plus system), in seconds. */
double cputime(void)
{
    struct rusage usage;
    double whole_seconds, micro_seconds;

    getrusage(RUSAGE_SELF, &usage);
    whole_seconds = usage.ru_utime.tv_sec + usage.ru_stime.tv_sec;
    micro_seconds = usage.ru_utime.tv_usec + usage.ru_stime.tv_usec;
    return whole_seconds + 1e-6 * micro_seconds;
}
/*
 * Wall-clock time in seconds, with microsecond resolution, as reported by
 * gettimeofday().
 */
double realtime(void)
{
    struct timeval tp;

    /* The timezone argument is obsolete; POSIX leaves the behaviour
       unspecified when it is not a null pointer, so pass NULL. */
    gettimeofday(&tp, NULL);
    return tp.tv_sec + tp.tv_usec * 1e-6;
}
/*
 * Peak resident set size of this process as reported by getrusage().
 * On Linux ru_maxrss is multiplied by 1024; elsewhere it is returned unchanged.
 */
long peakrss(void)
{
    struct rusage usage;
    long peak;

    getrusage(RUSAGE_SELF, &usage);
    peak = usage.ru_maxrss;
#ifdef __linux__
    peak = peak * 1024;
#endif
    return peak;
}

111
utils.h 100644
View File

@ -0,0 +1,111 @@
/* The MIT License
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef LH3_UTILS_H
#define LH3_UTILS_H
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
// ATTRIBUTE(list) attaches compiler attributes to a declaration. In this
// header it carries both the printf-format checks and the noreturn markers.
#ifdef __GNUC__
// __GNUC__ is defined: forward the list to __attribute__ so the compiler can
// validate printf format strings/arguments and honour noreturn.
#define ATTRIBUTE(list) __attribute__ (list)
#else
// Otherwise the annotation expands to nothing.
#define ATTRIBUTE(list)
#endif
/* Convenience wrappers: each one forwards to the underlying routine with the
 * name of the calling function (__func__) supplied as the first argument. */
#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg)
#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg)
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
/* xassert(cond, msg): call _err_fatal_simple_core() with `msg` when `cond`
 * compares equal to 0. Wrapped in do { } while (0) so the macro behaves as a
 * single statement and cannot capture an `else` that follows it. */
#define xassert(cond, msg) do { if ((cond) == 0) _err_fatal_simple_core(__func__, (msg)); } while (0)
/* Two unsigned 64-bit values stored side by side. */
typedef struct {
    uint64_t x;
    uint64_t y;
} pair64_t;

/* Array descriptors: two size_t fields (n, m) followed by the element
 * pointer a. NOTE(review): presumably n is the number of elements in use and
 * m the allocated capacity, as in klib's kvec -- confirm against the users. */
typedef struct {
    size_t n, m;
    uint64_t *a;
} uint64_v;

typedef struct {
    size_t n, m;
    pair64_t *a;
} pair64_v;
/* Public interface of utils.c, given C linkage when included from C++. */
#ifdef __cplusplus
extern "C" {
#endif
/* Fatal-error reporters; all four are annotated noreturn. err_fatal and
 * err_fatal_core take a printf-like `fmt` plus variadic arguments.
 * NOTE(review): how the "_core" variants differ is not visible from this
 * header -- check utils.c. */
void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn));
void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn));
/* File-opening helpers. `func` is the caller's name; the xopen / xreopen /
 * xzopen macros fill it in with __func__. */
FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp);
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode);
/* err_* counterparts of stdio / zlib calls.
 * NOTE(review): err_fclose and err_gzclose report a failed close through
 * _err_fatal_simple(); the remaining wrappers presumably follow the same
 * convention -- confirm in utils.c. */
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
int err_gzread(gzFile file, void *ptr, unsigned int len);
int err_fseek(FILE *stream, long offset, int whence);
/* Seek back to the start of the stream through err_fseek(). */
#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET)
long err_ftell(FILE *stream);
/* The format attributes let the compiler check the format string against
 * the variadic arguments (argument positions 2/3 and 1/2 respectively). */
int err_fprintf(FILE *stream, const char *format, ...)
ATTRIBUTE((format(printf, 2, 3)));
int err_printf(const char *format, ...)
ATTRIBUTE((format(printf, 1, 2)));
int err_fputc(int c, FILE *stream);
/* Write one character to stdout through err_fputc(). */
#define err_putchar(C) err_fputc((C), stdout)
int err_fputs(const char *s, FILE *stream);
int err_puts(const char *s);
int err_fflush(FILE *stream);
int err_fclose(FILE *stream);
int err_gzclose(gzFile file);
/* Timing / memory probes:
 *   cputime()  - user + system CPU seconds of the process (getrusage).
 *   realtime() - wall-clock seconds from gettimeofday().
 *   peakrss()  - ru_maxrss from getrusage, multiplied by 1024 on Linux and
 *                returned unscaled elsewhere. */
double cputime(void);
double realtime(void);
long peakrss(void);
/* Sort routines over arrays of n elements.
 * NOTE(review): presumably instantiated in utils.c via klib's ksort.h;
 * the ordering used for pair64_t is not visible here -- confirm. */
void ks_introsort_64 (size_t n, uint64_t *a);
void ks_introsort_128(size_t n, pair64_t *a);
#ifdef __cplusplus
}
#endif
/*
 * Scrambles a 64-bit key with a fixed sequence of shift / add / xor rounds
 * and returns the mixed value. All arithmetic is unsigned and wraps modulo
 * 2^64. The same input always produces the same output.
 */
static inline uint64_t hash_64(uint64_t key)
{
    uint64_t h = key;

    h = h + ~(h << 32);
    h = h ^ (h >> 22);
    h = h + ~(h << 13);
    h = h ^ (h >> 8);
    h = h * 9;              /* identical to h += h << 3 */
    h = h ^ (h >> 15);
    h = h + ~(h << 27);
    h = h ^ (h >> 31);
    return h;
}
#endif

27
xa2multi.pl 100755
View File

@ -0,0 +1,27 @@
#!/usr/bin/env perl
# Filter for SAM text: every input line is echoed unchanged, and each line
# that carries an XA:Z: tag is followed by one extra record per entry listed
# in that tag. Input comes from the files named on the command line, or from
# standard input when none are given.
use strict;
use warnings;
while (<>) {
# Capture the value of the XA:Z: tag (everything up to the next whitespace).
if (/\tXA:Z:(\S+)/) {
my $l = $1;
# Echo the original record first.
print;
# Tab-separated columns of the original record ($_).
my @t = split("\t");
# Each XA entry is "name,signed-position,field,number;":
#   $1 = reference name, $2 = position with a leading +/- sign,
#   $3 = third field (written to the CIGAR column below), $4 = number
#   emitted as NM:i. The captures stay live for the whole loop body.
while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) {
# Mate reference: column 7 of the original, with '=' resolved to column 3.
my $mchr = ($t[6] eq '=') ? $t[2] : $t[6];
# Collapse back to '=' when the mate is on this entry's reference.
my $mchr_ = ($mchr eq $1) ? '=' : $mchr;
# FIXME: TLEN/ISIZE is not calculated!
my $seq = $t[9];
my $phred = $t[10];
# if alternative alignment has other orientation than primary,
# then print the reverse (complement) of sequence and phred string
if ((($t[1]&0x10)>0) xor ($2<0)) {
$seq = reverse $seq;
$seq =~ tr/ACGTacgt/TGCAtgca/;
$phred = reverse $phred;
}
# Emitted columns: name, flag, $1, |$2|, 0, $3, mate reference, original
# column 8, 0, sequence, qualities, NM:i:$4.
# Flag = 0x100 | (original flag restricted to mask 0x6e9) | 0x10 when the
# entry's position is negative.
print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, $mchr_, $t[7], 0, $seq, $phred, "NM:i:$4"), "\n");
}
# Lines without an XA:Z: tag pass through untouched.
} else { print; }
}