From e7d38247bbf068a3cd3c5679ac7aa0453291e1ad Mon Sep 17 00:00:00 2001 From: carneiro Date: Fri, 11 Feb 2011 18:21:31 +0000 Subject: [PATCH] chunkIntervals.lua creates 1Mb interval chunks out of any .intervals file. Useful for methods development pipeline datasets. remapAmplicons.lua takes a sam file with reads aligned to amplicon references, a reference genome , and an amplicon reference mapping table, and rewrites the sam file with mappings to the reference sequence. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5230 348d0f76-0448-11de-a6fe-93d51630548a --- lua/chunkIntervals.lua | 21 +++++++++++ lua/remapAmplicons.lua | 84 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 lua/chunkIntervals.lua create mode 100644 lua/remapAmplicons.lua diff --git a/lua/chunkIntervals.lua b/lua/chunkIntervals.lua new file mode 100644 index 000000000..c8ed48aa2 --- /dev/null +++ b/lua/chunkIntervals.lua @@ -0,0 +1,21 @@ +local infile = arg[1] or io.stdin +local outfile = (arg[2] and io.open(arg[2], "w")) or io.stdout + + +local function cutcommas(x) + local n = "" + for v in x:gmatch("(%d+)") do + n = n .. v + end + if n == "" then return tonumber(x) else return tonumber(n) end +end + +for l in io.lines(infile) do + local chr, startPos, endPos = l:match("(.*):(%d+)%-([%d,]+)") + startPos = cutcommas(startPos) + endPos = cutcommas(endPos) + for i=startPos,endPos,1000000 do + if endPos > i+999999 then outfile:write(chr..":"..i.."-"..i+999999 .."\n") + else outfile:write(chr..":"..i.."-"..endPos.."\n") end + end +end \ No newline at end of file diff --git a/lua/remapAmplicons.lua b/lua/remapAmplicons.lua new file mode 100644 index 000000000..f70748129 --- /dev/null +++ b/lua/remapAmplicons.lua @@ -0,0 +1,84 @@ +local ampFile = arg[1] +local samFile = arg[2] +local headerFile = io.open(arg[2]:match("(.*).sam")..".header.sam", "w") +local bodyFile = io.open(arg[2]:match("(.*).sam")..".body.sam", "w") + +chrlength = {} +chrlength["1"] =249250621 +chrlength["2"] =243199373 +chrlength["3"] =198022430 +chrlength["4"] =191154276 +chrlength["5"] =180915260 +chrlength["6"] =171115067 +chrlength["7"] =159138663 +chrlength["8"] =146364022 +chrlength["9"] =141213431 +chrlength["10"]=135534747 +chrlength["11"]=135006516 +chrlength["12"]=133851895 +chrlength["13"]=115169878 +chrlength["14"]=107349540 +chrlength["15"]=102531392 +chrlength["16"]=90354753 +chrlength["17"]=81195210 +chrlength["18"]=78077248 +chrlength["19"]=59128983 +chrlength["20"]=63025520 +chrlength["21"]=48129895 +chrlength["22"]=51304566 +chrlength["X"] =155270560 +chrlength["Y"] =59373566 +chrlength["MT"]=16569 + +local header = {} +local reads = {} + +-- reads the amplicon file (global var) and returns an amplicon hash table indexed by amplicon name +local function readAmpliconFile() + local amplicons = {} + for l in io.lines(ampFile) do + local chr, startPos, endPos, ampName = l:match("([%w]+)%s+(%d+)%s+(%d+)%s+([%w%p_]+)") + amplicons[ampName] = {chr=chr, startPos=tonumber(startPos), endPos=tonumber(endPos)} + end + return amplicons +end + +-- updates the global header table with the entries +local function processSamHeaderLine(l, amplicons) + if l:sub(1,3) == "@HD" then header.hd = l + elseif l:sub(1,3) == "@CO" then header.co = l + else + if not header.sq then header.sq = {} end + local ampName = l:match("@SQ%s+SN:ps%d+_([%w%p_]+).*") + local chr = amplicons[ampName].chr + header.sq[chr] = chrlength[chr] + end +end + +local function printHeader() + if header.hd then headerFile:write(header.hd.."\n") else headerFile:write("@HD VN:PB_v0.1") end + for chr, len in pairs(header.sq) do + headerFile:write("@SQ\tSN:"..chr.."\tLN:"..len.."\n") + end + if header.co then headerFile:write(header.co.."\n") end +end + +local function printReads() + for _, v in ipairs(reads) do bodyFile:write(v.."\n") end +end + + +local amplicons = readAmpliconFile() -- amplicons indexed by name + +--for i,v in pairs(amplicons) do print("'"..i.."'") end + +for l in io.lines(samFile) do + if l:sub(1,1) == "@" then processSamHeaderLine(l, amplicons) + else + local before, amp, mapStart, after = l:match("(%d+%s+%d+%s+)ps%d+_([%w%p_]+)%s+(%d+)(.*)") + table.insert(reads, before..amplicons[amp].chr.."\t"..amplicons[amp].startPos + mapStart..after) + end +end + +printHeader() +printReads() \ No newline at end of file