Useful library for parsing VCF files, plus a general VCF->table converter
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1964 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5d5dc989e7
commit
44ea55d338
|
|
@ -0,0 +1,38 @@
|
|||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from vcfReader import *
|
||||
|
||||
if __name__ == "__main__":
|
||||
usage = "usage: %prog files.list [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-f", "--f", dest="fields",
|
||||
type='string', default="",
|
||||
help="Comma separated list of fields to exact")
|
||||
parser.add_option("-e", "--filter", dest="filter",
|
||||
action='store_true', default=False,
|
||||
help="If true, only includes records that aren't filtered in the output")
|
||||
parser.add_option("-s", "--s", dest="skip",
|
||||
type='int', default=0,
|
||||
help="Only print out every 1 / skip records")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 0:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
counter = OPTIONS.skip
|
||||
|
||||
fields = OPTIONS.fields.split(',')
|
||||
for vcf,count in lines2VCF(sys.stdin):
|
||||
#print vcf, count
|
||||
if count == 1 and vcf.hasHeader():
|
||||
print '\t'.join(fields)
|
||||
|
||||
if counter > 0:
|
||||
counter -= 1
|
||||
else:
|
||||
counter = OPTIONS.skip
|
||||
if OPTIONS.filter and vcf.failsFilters():
|
||||
pass
|
||||
else:
|
||||
print '\t'.join([ str(vcf.getField(field, '0')) for field in fields])
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
class VCFRecord:
|
||||
"""Simple support for accessing a VCF record"""
|
||||
def __init__(self, basicBindings, header=None):
|
||||
self.header = header
|
||||
self.bindings = basicBindings
|
||||
self.info = parseInfo(basicBindings["INFO"])
|
||||
|
||||
def hasHeader(self): return self.header <> None
|
||||
def getHeader(self): return self.header
|
||||
|
||||
def get(self, key): return self.bindings[key]
|
||||
|
||||
def getChrom(self): return self.get("CHROM")
|
||||
def getPos(self): return self.get("POS")
|
||||
|
||||
def getID(self): return self.get("ID")
|
||||
def isNovel(self): return self.getID() == "."
|
||||
def isKnown(self): return not self.isNovel()
|
||||
|
||||
def getRef(self): return self.get("REF")
|
||||
def getAlt(self): return self.get("ALT")
|
||||
def getQual(self): return self.get("QUAL")
|
||||
|
||||
def getFilter(self): return self.get("FILTER")
|
||||
def failsFilters(self): return not self.passesFilters()
|
||||
def passesFilters(self):
|
||||
#print self.getFilter(), ">>>", self
|
||||
return self.getFilter() == "." or self.getFilter() == "0"
|
||||
|
||||
def getField(self, field, default = None):
|
||||
if field in self.bindings:
|
||||
return self.get(field)
|
||||
elif field in self.getInfoDict():
|
||||
return self.getInfoKey(field)
|
||||
else:
|
||||
return default
|
||||
|
||||
def getInfo(self): return self.get("INFO")
|
||||
def getInfoDict(self): return self.info
|
||||
|
||||
def getInfoKey(self, name, default = None):
|
||||
info = self.getInfoDict()
|
||||
if name in info:
|
||||
return info[name]
|
||||
else:
|
||||
return default
|
||||
|
||||
def infoHasKeys(self, keys):
|
||||
return all(map(lambda key: key in self.getInfo(), keys))
|
||||
|
||||
def __str__(self):
|
||||
return str(self.bindings) + " INFO: " + str(self.info)
|
||||
|
||||
def parseInfo(s):
|
||||
def handleBoolean(key_val):
|
||||
if len(key_val) == 1:
|
||||
return [key_val[0], True]
|
||||
else:
|
||||
return key_val
|
||||
|
||||
key_val = map( lambda x: handleBoolean(x.split("=")), s.split(";"))
|
||||
return dict(key_val)
|
||||
|
||||
def string2VCF(line, header=None):
|
||||
if line[0] != "#":
|
||||
s = line.split()
|
||||
keys = "CHROM POS ID REF ALT QUAL FILTER INFO".split()
|
||||
bindings = dict(zip(keys, s[0:8]))
|
||||
return VCFRecord(bindings, header)
|
||||
else:
|
||||
return None
|
||||
|
||||
def lines2VCF(lines):
|
||||
header = None
|
||||
counter = 0
|
||||
for line in lines:
|
||||
if line[0] != "#":
|
||||
counter += 1
|
||||
vcf = string2VCF(line, header=header)
|
||||
if vcf <> None:
|
||||
yield vcf, counter
|
||||
else:
|
||||
header = line[1:].split()
|
||||
raise StopIteration()
|
||||
Loading…
Reference in New Issue