2009-11-10 06:48:51 +08:00
|
|
|
import itertools
|
|
|
|
|
|
|
|
|
|
# Names of the eight fixed VCF columns, in file order.
VCF_KEYS = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
|
|
|
|
|
|
2009-11-07 07:00:46 +08:00
|
|
|
# Lookup table of two-base REF+ALT strings that count as transitions.
# Each pair is registered in both orders (e.g. "AG" and "GA").
TRANSITIONS = {}
for p in ("AG", "CT"):
    TRANSITIONS.update({p: True})
    TRANSITIONS.update({p[::-1]: True})
|
|
|
|
|
|
|
|
|
|
def convertToType(d):
    """Return a copy of dict `d` with each value coerced to a simpler type.

    For every value the converters int, float and str are tried in that
    order; the first one that succeeds wins. A key whose value cannot be
    converted by any of them is left out of the result.

    d -- dict of key to raw value (typically strings from a VCF line).
    Returns a new dict; `d` itself is not modified.
    """
    out = dict()
    converters = (int, float, str)
    for key, value in d.items():
        for convert in converters:
            try:
                out[key] = convert(value)
            except (ValueError, TypeError, OverflowError):
                # Only conversion failures mean "try the next converter";
                # anything else (e.g. KeyboardInterrupt) must propagate.
                continue
            break
    return out
|
|
|
|
|
|
2009-11-03 21:14:04 +08:00
|
|
|
class VCFRecord:
    """Simple support for accessing a VCF record.

    State kept on each record:

    * ``header``   -- whatever header object the caller supplied (or None);
    * ``bindings`` -- the fixed columns keyed by column name, with values
      passed through ``convertToType`` (so numeric-looking strings become
      int/float);
    * ``info``     -- the INFO column parsed by ``parseInfo`` and passed
      through ``convertToType``;
    * ``rest``     -- any trailing columns, re-emitted verbatim by format().
    """

    def __init__(self, basicBindings, header=None, rest=None):
        """Build a record from a dict of raw column values.

        basicBindings -- dict of column name to raw value; must contain
                         the key "INFO".
        header        -- optional header object, stored as-is.
        rest          -- optional list of trailing column strings; when
                         omitted each record gets its own empty list.
        """
        self.header = header
        self.info = convertToType(parseInfo(basicBindings["INFO"]))
        self.bindings = convertToType(basicBindings)
        # A None default (rather than ``[]``) keeps records from sharing
        # one list object across calls.
        self.rest = [] if rest is None else rest

    def hasHeader(self):
        """Return True when a header was supplied."""
        return self.header is not None

    def getHeader(self):
        """Return the header object given at construction (may be None)."""
        return self.header

    def get(self, key):
        """Return the fixed-column value for `key`; KeyError if absent."""
        return self.bindings[key]

    def getChrom(self):
        """Return the CHROM column."""
        return self.get("CHROM")

    def getPos(self):
        """Return the POS column."""
        return self.get("POS")

    def getLoc(self):
        """Return the location as the string 'CHROM:POS'."""
        return str(self.getChrom()) + ':' + str(self.getPos())

    def getID(self):
        """Return the ID column."""
        return self.get("ID")

    def isNovel(self):
        """Return True when the ID column is "."."""
        return self.getID() == "."

    def isKnown(self):
        """Return True when the record is not novel (ID is not ".")."""
        return not self.isNovel()

    def getRef(self):
        """Return the REF column."""
        return self.get("REF")

    def getAlt(self):
        """Return the ALT column."""
        return self.get("ALT")

    def getQual(self):
        """Return the QUAL column."""
        return self.get("QUAL")

    def getVariation(self):
        """Return REF and ALT concatenated (e.g. "AG")."""
        return self.getRef() + self.getAlt()

    def isTransition(self):
        """Return True when REF+ALT is a key of TRANSITIONS."""
        return self.getVariation() in TRANSITIONS

    def isTransversion(self):
        """Return True for every variation that is not a transition.

        Note that this includes any REF+ALT string absent from
        TRANSITIONS, not only single-base substitutions.
        """
        return not self.isTransition()

    def getFilter(self):
        """Return the FILTER column."""
        return self.get("FILTER")

    def failsFilters(self):
        """Return True when the record does not pass its filters."""
        return not self.passesFilters()

    def passesFilters(self):
        """Return True when the FILTER column is "." or "0".

        The comparison is done on the string form because the stored
        value has been through convertToType, which turns "0" into the
        integer 0.
        """
        return str(self.getFilter()) in (".", "0")

    def hasField(self, field):
        """Return True when `field` is a fixed column or an INFO key."""
        return field in self.bindings or field in self.info

    def setField(self, field, value):
        """Set a fixed column or an INFO key to a non-None value.

        A name already present among the fixed columns is overwritten
        there. Any other name is stored in the INFO dict, after which the
        "INFO" column string is regenerated from that dict.
        """
        assert value is not None
        if field in self.bindings:
            self.bindings[field] = value
        else:
            self.info[field] = value
            # Keep the serialized INFO column in sync with the dict.
            self.bindings["INFO"] = self.getInfo()

    def getField(self, field, default=None):
        """Look `field` up in the fixed columns, then in INFO.

        Returns `default` when the name is found in neither.
        """
        if field in self.bindings:
            return self.get(field)
        if field in self.getInfoDict():
            return self.getInfoKey(field)
        return default

    def getInfo(self):
        """Serialize the INFO dict as a ';'-separated string.

        Entries whose value is a bool are written as the bare key; all
        others are written as key=value.
        """
        def info2str(key, value):
            if isinstance(value, bool):
                return str(key)
            return str(key) + '=' + str(value)

        return ';'.join([info2str(k, v) for k, v in self.info.items()])

    def getInfoDict(self):
        """Return the INFO dict itself (not a copy)."""
        return self.info

    def getInfoKey(self, name, default=None):
        """Return the INFO value for `name`, or `default` if missing."""
        return self.getInfoDict().get(name, default)

    def infoHasKeys(self, keys):
        """Return True when every name in `keys` is an INFO key."""
        # Test against the dict: checking the serialized string would
        # match substrings (e.g. "D" inside "DP=10").
        info = self.getInfoDict()
        return all(key in info for key in keys)

    def __str__(self):
        """Return the fixed columns as space-separated key=value pairs."""
        return ' '.join(['%s=%s' % (x, y) for x, y in self.bindings.items()])

    def format(self):
        """Return the record as one tab-separated line (no newline).

        The fixed columns are emitted in VCF_KEYS order, followed by the
        trailing columns stored in ``rest``.
        """
        fixed = [str(self.getField(key)) for key in VCF_KEYS]
        return '\t'.join(fixed + list(self.rest))
|
2009-11-03 21:14:04 +08:00
|
|
|
|
|
|
|
|
def parseInfo(s):
    """Parse a ';'-separated INFO string into a dict.

    Each entry is either ``key=value`` (value kept as a string) or a bare
    ``key`` flag, which is stored with the value 1. Only the first '=' in
    an entry separates key from value, so values may themselves contain
    '='. Empty entries (from an empty string or a stray ';') are skipped.
    When a key occurs more than once the last occurrence wins.
    """
    info = {}
    for entry in s.split(";"):
        if not entry:
            continue
        key, sep, value = entry.partition("=")
        # No '=' at all means a flag entry.
        info[key] = value if sep else 1
    return info
|
|
|
|
|
|
|
|
|
|
def string2VCF(line, header=None):
    """Turn one line of VCF text into a VCFRecord.

    line   -- a single line of text; split on any whitespace.
    header -- passed through to the VCFRecord constructor.

    Returns None for lines starting with '#' and for empty or
    whitespace-only lines. Otherwise the first eight fields are bound to
    VCF_KEYS and any further fields are handed to the record as `rest`.
    """
    # startswith() is safe on an empty string, unlike indexing line[0].
    if line.startswith("#"):
        return None
    fields = line.split()
    if not fields:
        return None
    bindings = dict(zip(VCF_KEYS, fields[0:8]))
    return VCFRecord(bindings, header, rest=fields[8:])
|
|
|
|
|
|
2009-11-10 06:48:51 +08:00
|
|
|
def readVCFHeader(lines):
    """Split the leading '#' lines off an iterable of VCF lines.

    lines -- any iterable of text lines (file object, list, generator).

    Returns a 3-tuple ``(header, columnNames, remaining)``:

    * header      -- list of the leading '#' lines, stripped;
    * columnNames -- the last header line, or None if there was none;
    * remaining   -- iterator over the first non-header line and
                     everything after it (empty if there is no such line).
    """
    # Work on a single iterator so the lines consumed here are not seen
    # again when `lines` is a re-iterable container such as a list.
    lines = iter(lines)
    header = []
    for line in lines:
        if line.startswith("#"):
            header.append(line.strip())
            continue
        columnNames = header[-1] if header else None
        # Put the line we just consumed back in front of the rest.
        return header, columnNames, itertools.chain([line], lines)

    # Input ran out before any data line appeared.
    columnNames = header[-1] if header else None
    return header, columnNames, iter(())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def lines2VCF(lines, extendedOutput = False):
    """Generate VCF records from an iterable of VCF text lines.

    lines          -- iterable of text lines, header lines first.
    extendedOutput -- when true, yield ``(header, record, counter)``
                      tuples instead of bare records.

    `header` is the header list returned by readVCFHeader and `counter`
    is the 1-based count of non-'#' lines read so far after the header.
    Lines starting with '#' are skipped, as are lines for which
    string2VCF returns None.
    """
    parsed = readVCFHeader(lines)
    if parsed is None:
        # Defensive: nothing usable came back, so there is nothing to yield.
        return
    header, columnNames, lines = parsed

    counter = 0
    for line in lines:
        if line.startswith("#"):
            continue
        counter += 1
        vcf = string2VCF(line, header=columnNames)
        if vcf is None:
            continue
        if extendedOutput:
            yield header, vcf, counter
        else:
            yield vcf
    # The generator ends by falling off the end; raising StopIteration
    # here would surface as RuntimeError under PEP 479.
|
2009-11-10 06:48:51 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def formatVCF(header, records):
    """Return an iterator over the output lines of a VCF file.

    header  -- iterable of header lines, emitted first and unchanged.
    records -- iterable of record objects; each one contributes the
               string returned by its format() method.

    Nothing is written to stdout, and `records` may be empty or a
    one-shot iterator.
    """
    formatted = (record.format() for record in records)
    return itertools.chain(header, formatted)
|
|
|
|
|
|