From a2e2a6a9c366fea554ece3a12be8cb5fe96030fa Mon Sep 17 00:00:00 2001 From: carneiro Date: Fri, 28 Jan 2011 21:46:41 +0000 Subject: [PATCH] A script to convert an intervals file into a sorted unoverlapping intervals file. Super fast implementation using multiple hashes, should do the job instantly no matter how big the intervals are. ps: this utility is similar to -im on GATK, but for queue scripts we need the intervals file fixed before we can run Unified Genotyper because of scatter-gather. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5117 348d0f76-0448-11de-a6fe-93d51630548a --- lua/MergeIntervals.lua | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 lua/MergeIntervals.lua diff --git a/lua/MergeIntervals.lua b/lua/MergeIntervals.lua new file mode 100644 index 000000000..c6a9f7f34 --- /dev/null +++ b/lua/MergeIntervals.lua @@ -0,0 +1,61 @@ +-- Merges all intervals into a sorted list of unoverlapping intervals +-- +-- ps: this utility is similar to -im on GATK, but for queue scripts we need the intervals +-- file fixed before we can run Unified Genotyper because of scatter-gather. +-- +-- Usage: +-- lua MergeIntervals.lua raw.intervals > merged.intervals +-- + +assert(table.getn(arg) > 0, "\n\nMissing input file\n\nUsage:\n\tlua MergeIntervals.lua raw.intervals > merged.intervals\n\n") + +-- read file and create chromosome tables with raw intervals marked as "x" +local intervals = {} +local intervalKeys = {} +for l in io.lines(arg[1]) do + local chr, a, b = l:match("([%l%d]+):(%d+)-(%d+)") + a,b = tonumber(a), tonumber(b) + if not intervals[chr] then + intervals[chr] = {} + intervals[chr].min = a + intervals[chr].max = b + intervalKeys[chr] = {} + end + for i=a,b do intervals[chr][i] = "x" end + intervals[chr].min = math.min(intervals[chr].min, a) + intervals[chr].max = math.max(intervals[chr].max, b) + table.insert(intervalKeys[chr], a) +end + +local intervalSize = {} +local intervalsIndex = {} +for c, chrTable in pairs(intervals) do + intervalSize[c] = {} + intervalsIndex[c] = {} + local intervalStart = -1 + local lastPositionRead = -1 + table.sort(intervalKeys[c]) + for _,startKey in ipairs(intervalKeys[c]) do + local i = startKey + while chrTable[i] and i > lastPositionRead do + if (intervalStart < 0 or i ~= (intervalStart + intervalSize[c][intervalStart] + 1)) then + intervalStart = i + intervalSize[c][intervalStart] = 0 + table.insert(intervalsIndex[c], intervalStart) + else + intervalSize[c][intervalStart] = intervalSize[c][intervalStart] + 1 + end + lastPositionRead = i + i = i + 1 + end + end +end + +for c,v in pairs(intervalsIndex) do + table.sort(v) + for _,intervalStart in ipairs(v) do + if intervalSize[c][intervalStart] > 0 then + print(c..":"..intervalStart.."-"..intervalStart + intervalSize[c][intervalStart]) + end + end +end