150 lines
5.1 KiB
Lua
150 lines
5.1 KiB
Lua
|
|
-- bioinformatics routines
|
||
|
|
|
||
|
|
-- Description: read a fasta/fastq file
|
||
|
|
local function readseq(fp)
|
||
|
|
local finished, last = false, nil;
|
||
|
|
return function()
|
||
|
|
local match;
|
||
|
|
if finished then return nil end
|
||
|
|
if (last == nil) then -- the first record or a record following a fastq
|
||
|
|
for l in fp:lines() do
|
||
|
|
if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@"
|
||
|
|
last = l;
|
||
|
|
break;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
if last == nil then
|
||
|
|
finished = true;
|
||
|
|
return nil;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
local tmp = last:find("%s");
|
||
|
|
name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name
|
||
|
|
local seqs = {};
|
||
|
|
local c; -- the first character of the last line
|
||
|
|
last = nil;
|
||
|
|
for l in fp:lines() do -- read sequence
|
||
|
|
c = l:byte(1);
|
||
|
|
if c == 62 or c == 64 or c == 43 then
|
||
|
|
last = l;
|
||
|
|
break;
|
||
|
|
end
|
||
|
|
table.insert(seqs, l);
|
||
|
|
end
|
||
|
|
if last == nil then finished = true end -- end of file
|
||
|
|
if c ~= 43 then return name, table.concat(seqs) end -- a fasta record
|
||
|
|
local seq, len = table.concat(seqs), 0; -- prepare to parse quality
|
||
|
|
seqs = {};
|
||
|
|
for l in fp:lines() do -- read quality
|
||
|
|
table.insert(seqs, l);
|
||
|
|
len = len + #l;
|
||
|
|
if len >= #seq then
|
||
|
|
last = nil;
|
||
|
|
return name, seq, table.concat(seqs);
|
||
|
|
end
|
||
|
|
end
|
||
|
|
finished = true;
|
||
|
|
return name, seq;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
-- extract subsequence from a fasta file indexe by samtools faidx
|
||
|
|
local function faidxsub(fn)
|
||
|
|
local fpidx = io.open(fn .. ".fai");
|
||
|
|
if fpidx == nil then
|
||
|
|
io.stderr:write("[faidxsub] fail to open the FASTA index file.\n");
|
||
|
|
return nil
|
||
|
|
end
|
||
|
|
local idx = {};
|
||
|
|
for l in fpidx:lines() do
|
||
|
|
local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)");
|
||
|
|
if name then
|
||
|
|
idx[name] = {tonumber(len), offset, line_blen, line_len};
|
||
|
|
end
|
||
|
|
end
|
||
|
|
fpidx:close();
|
||
|
|
local fp = io.open(fn);
|
||
|
|
return function(name, beg_, end_) -- 0-based coordinate
|
||
|
|
if name == nil then fp:close(); return nil; end
|
||
|
|
if idx[name] then
|
||
|
|
local a = idx[name];
|
||
|
|
beg_ = beg_ or 0;
|
||
|
|
end_ = end_ or a[1];
|
||
|
|
end_ = (end_ <= a[1] and end_) or a[1];
|
||
|
|
local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]);
|
||
|
|
local qb, qe = beg_ - fb * a[3], end_ - fe * a[3];
|
||
|
|
fp:seek("set", a[2] + fb * a[4] + qb);
|
||
|
|
local s = fp:read((fe - fb) * a[4] + (qe - qb)):gsub("%s", "");
|
||
|
|
return s;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
--Description: Index a list of intervals and test if a given interval overlaps with the list
|
||
|
|
--Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))'
|
||
|
|
--[[
|
||
|
|
By default, we keep for each tiling 8192 window the interval overlaping the
|
||
|
|
window while having the smallest start position. This method may not work
|
||
|
|
well when most intervals are small but few intervals span a long distance.
|
||
|
|
]]--
|
||
|
|
local function intvovlp(intv, bits)
|
||
|
|
bits = bits or 13 -- the default bin size is 8192 = 1<<13
|
||
|
|
table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start
|
||
|
|
-- merge intervals; the step speeds up testing, but can be skipped
|
||
|
|
local b, e, k = -1, -1, 1
|
||
|
|
for i = 1, #intv do
|
||
|
|
if e < intv[i][1] then
|
||
|
|
if e >= 0 then intv[k], k = {b, e}, k + 1 end
|
||
|
|
b, e = intv[i][1], intv[i][2]
|
||
|
|
else e = intv[i][2] end
|
||
|
|
end
|
||
|
|
if e >= 0 then intv[k] = {b, e} end
|
||
|
|
while #a > k do table.remove(a) end -- truncate the interval list
|
||
|
|
-- build the index for the list of intervals
|
||
|
|
local idx, size, max = {}, math.pow(2, bits), 0
|
||
|
|
for i = 1, #a do
|
||
|
|
b = math.modf(intv[i][1] / size)
|
||
|
|
e = math.modf(intv[i][2] / size)
|
||
|
|
if b == e then idx[b] = idx[b] or i
|
||
|
|
else for j = b, e do idx[j] = idx[j] or i end end
|
||
|
|
max = (max > e and max) or e
|
||
|
|
end
|
||
|
|
-- return a function (closure)
|
||
|
|
return function(_beg, _end)
|
||
|
|
local x = math.modf(_beg / size)
|
||
|
|
if x > max then return false end
|
||
|
|
local off = idx[x]; -- the start bin
|
||
|
|
if off == nil then -- the following is not the best in efficiency
|
||
|
|
for i = x - 1, 0, -1 do -- find the minimum bin with a value
|
||
|
|
if idx[i] ~= nil then off = idx[i]; break; end
|
||
|
|
end
|
||
|
|
if off == nil then return false end
|
||
|
|
end
|
||
|
|
for i = off, #intv do -- start from off and search for overlaps
|
||
|
|
if intv[i][1] >= _end then return false
|
||
|
|
elseif intv[i][2] > _beg then return true end
|
||
|
|
end
|
||
|
|
return false
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
bio = {
|
||
|
|
readseq = readseq,
|
||
|
|
faidxsub = faidxsub,
|
||
|
|
intvovlp = intvovlp
|
||
|
|
}
|
||
|
|
|
||
|
|
bio.nt16 = {
|
||
|
|
[0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
|
||
|
|
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
|
||
|
|
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
|
||
|
|
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
|
||
|
|
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
|
||
|
|
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
|
||
|
|
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
|
||
|
|
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
|
||
|
|
}
|
||
|
|
bio.ntcnt = { [0]=4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }
|
||
|
|
bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }
|
||
|
|
bio.ntrev = 'XACMGRSVTWYHKDBN'
|