chunkIntervals.lua creates 1Mb interval chunks out of any .intervals file. Useful for methods development pipeline datasets.
remapAmplicons.lua takes a sam file with reads aligned to amplicon references, a reference genome , and an amplicon reference mapping table, and rewrites the sam file with mappings to the reference sequence. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5230 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
03482bf7c4
commit
e7d38247bb
|
|
@ -0,0 +1,21 @@
|
||||||
|
local infile = arg[1] or io.stdin
|
||||||
|
local outfile = (arg[2] and io.open(arg[2], "w")) or io.stdout
|
||||||
|
|
||||||
|
|
||||||
|
local function cutcommas(x)
|
||||||
|
local n = ""
|
||||||
|
for v in x:gmatch("(%d+)") do
|
||||||
|
n = n .. v
|
||||||
|
end
|
||||||
|
if n == "" then return tonumber(x) else return tonumber(n) end
|
||||||
|
end
|
||||||
|
|
||||||
|
for l in io.lines(infile) do
|
||||||
|
local chr, startPos, endPos = l:match("(.*):(%d+)%-([%d,]+)")
|
||||||
|
startPos = cutcommas(startPos)
|
||||||
|
endPos = cutcommas(endPos)
|
||||||
|
for i=startPos,endPos,1000000 do
|
||||||
|
if endPos > i+999999 then outfile:write(chr..":"..i.."-"..i+999999 .."\n")
|
||||||
|
else outfile:write(chr..":"..i.."-"..endPos.."\n") end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
@ -0,0 +1,84 @@
|
||||||
|
local ampFile = arg[1]
|
||||||
|
local samFile = arg[2]
|
||||||
|
local headerFile = io.open(arg[2]:match("(.*).sam")..".header.sam", "w")
|
||||||
|
local bodyFile = io.open(arg[2]:match("(.*).sam")..".body.sam", "w")
|
||||||
|
|
||||||
|
chrlength = {}
|
||||||
|
chrlength["1"] =249250621
|
||||||
|
chrlength["2"] =243199373
|
||||||
|
chrlength["3"] =198022430
|
||||||
|
chrlength["4"] =191154276
|
||||||
|
chrlength["5"] =180915260
|
||||||
|
chrlength["6"] =171115067
|
||||||
|
chrlength["7"] =159138663
|
||||||
|
chrlength["8"] =146364022
|
||||||
|
chrlength["9"] =141213431
|
||||||
|
chrlength["10"]=135534747
|
||||||
|
chrlength["11"]=135006516
|
||||||
|
chrlength["12"]=133851895
|
||||||
|
chrlength["13"]=115169878
|
||||||
|
chrlength["14"]=107349540
|
||||||
|
chrlength["15"]=102531392
|
||||||
|
chrlength["16"]=90354753
|
||||||
|
chrlength["17"]=81195210
|
||||||
|
chrlength["18"]=78077248
|
||||||
|
chrlength["19"]=59128983
|
||||||
|
chrlength["20"]=63025520
|
||||||
|
chrlength["21"]=48129895
|
||||||
|
chrlength["22"]=51304566
|
||||||
|
chrlength["X"] =155270560
|
||||||
|
chrlength["Y"] =59373566
|
||||||
|
chrlength["MT"]=16569
|
||||||
|
|
||||||
|
local header = {}
|
||||||
|
local reads = {}
|
||||||
|
|
||||||
|
-- reads the amplicon file (global var) and returns an amplicon hash table indexed by amplicon name
|
||||||
|
local function readAmpliconFile()
|
||||||
|
local amplicons = {}
|
||||||
|
for l in io.lines(ampFile) do
|
||||||
|
local chr, startPos, endPos, ampName = l:match("([%w]+)%s+(%d+)%s+(%d+)%s+([%w%p_]+)")
|
||||||
|
amplicons[ampName] = {chr=chr, startPos=tonumber(startPos), endPos=tonumber(endPos)}
|
||||||
|
end
|
||||||
|
return amplicons
|
||||||
|
end
|
||||||
|
|
||||||
|
-- updates the global header table with the entries
|
||||||
|
local function processSamHeaderLine(l, amplicons)
|
||||||
|
if l:sub(1,3) == "@HD" then header.hd = l
|
||||||
|
elseif l:sub(1,3) == "@CO" then header.co = l
|
||||||
|
else
|
||||||
|
if not header.sq then header.sq = {} end
|
||||||
|
local ampName = l:match("@SQ%s+SN:ps%d+_([%w%p_]+).*")
|
||||||
|
local chr = amplicons[ampName].chr
|
||||||
|
header.sq[chr] = chrlength[chr]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
local function printHeader()
|
||||||
|
if header.hd then headerFile:write(header.hd.."\n") else headerFile:write("@HD VN:PB_v0.1") end
|
||||||
|
for chr, len in pairs(header.sq) do
|
||||||
|
headerFile:write("@SQ\tSN:"..chr.."\tLN:"..len.."\n")
|
||||||
|
end
|
||||||
|
if header.co then headerFile:write(header.co.."\n") end
|
||||||
|
end
|
||||||
|
|
||||||
|
local function printReads()
|
||||||
|
for _, v in ipairs(reads) do bodyFile:write(v.."\n") end
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
local amplicons = readAmpliconFile() -- amplicons indexed by name
|
||||||
|
|
||||||
|
--for i,v in pairs(amplicons) do print("'"..i.."'") end
|
||||||
|
|
||||||
|
for l in io.lines(samFile) do
|
||||||
|
if l:sub(1,1) == "@" then processSamHeaderLine(l, amplicons)
|
||||||
|
else
|
||||||
|
local before, amp, mapStart, after = l:match("(%d+%s+%d+%s+)ps%d+_([%w%p_]+)%s+(%d+)(.*)")
|
||||||
|
table.insert(reads, before..amplicons[amp].chr.."\t"..amplicons[amp].startPos + mapStart..after)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
printHeader()
|
||||||
|
printReads()
|
||||||
Loading…
Reference in New Issue