simple tool that takes two dbSNP files and subsets the seconds to only include rsID SNPs present in the first. Used to make b129 against b37 by subsetting b131/b37 vs. b129/b36
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3352 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d3c33d4b3f
commit
2a803e9044
|
|
@ -0,0 +1,49 @@
|
|||
import sys
|
||||
from optparse import OptionParser
|
||||
|
||||
def getRsIDSet(dbSNPIds, idIndex):
|
||||
s = set()
|
||||
for line in open(dbSNPIds):
|
||||
#print line
|
||||
rsID = line.split()[idIndex]
|
||||
s.add(rsID)
|
||||
|
||||
return frozenset(s)
|
||||
|
||||
|
||||
def main():
|
||||
global OPTIONS
|
||||
usage = "usage: %prog [options] dbSNP.in.rsids dbSNP.to.match dbSNP.out"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-v", "--verbose", dest="verbose",
|
||||
action='store_true', default=False,
|
||||
help="")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 3:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
dbSNPIds = args[0]
|
||||
dbSNPMatch = args[1]
|
||||
dbSNPOut = args[2]
|
||||
idIndex = 4
|
||||
|
||||
rsSet = getRsIDSet(dbSNPIds, idIndex)
|
||||
print 'rsID set has %d elements' % len(rsSet)
|
||||
|
||||
#
|
||||
count = 0
|
||||
matched = 0
|
||||
out = open(dbSNPOut, 'w')
|
||||
for line in open(dbSNPMatch):
|
||||
count += 1
|
||||
rsID = line.split()[idIndex]
|
||||
if rsID in rsSet:
|
||||
#sys.stdout.write(line)
|
||||
matched += 1
|
||||
out.write(line)
|
||||
print 'Processed %d lines, matching %d elements, excluding %d' % ( count, matched, count - matched )
|
||||
out.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Reference in New Issue