2009-07-09 06:04:26 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
|
|
import sys, itertools
|
|
|
|
|
|
2009-12-17 04:38:50 +08:00
|
|
|
def record_generator(filename, sep="\t", skip_n_lines=0, skip_until_regex_line=""):
    """Yield one dictionary per record of a delimited text file.

    The header line supplies the keys.  Every line after the header is split
    on ``sep`` and paired positionally with the header fields.  Pairing stops
    at the shorter of the two sequences, so surplus fields on a line are
    dropped and a short line produces a record with fewer keys.

    Args:
        filename: Path of the file to read.
        sep: Field separator handed to ``str.split``.
        skip_n_lines: Number of leading lines to discard before the header is
            looked for.  Values that are not greater than zero skip nothing.
        skip_until_regex_line: Optional regular expression.  When non-empty,
            lines are discarded until one matches (``re.search``) and the
            matching line is used as the header.  If no line matches, a
            warning is printed and no records are produced.

    Yields:
        A dict mapping each header field to the corresponding value of one
        line that follows the header.
    """
    # The ``with`` block closes the file when the generator is exhausted,
    # closed, or garbage collected.
    with open(filename) as fin:
        # Use the iterator protocol for every read so that line skipping,
        # header extraction and record iteration never mix readline() with
        # iteration (which is unreliable on Python 2 file objects).
        lines = iter(fin)

        if skip_n_lines > 0:
            for _ in range(skip_n_lines):  # Skip a number of lines
                next(lines, None)

        header_line = None
        if skip_until_regex_line != "":
            import re
            regex_line = re.compile(skip_until_regex_line)
            for line in lines:
                if regex_line.search(line):
                    header_line = line
                    break
            if header_line is None:
                print("Warning: Regex " + skip_until_regex_line
                      + " not found in FlatFileTable:record_generator")

        if header_line is None:
            # No regex requested (or none matched): the next line is the
            # header.  At end of file this is "" and no records follow.
            header_line = next(lines, "")

        # Strip only the line terminator.  A bare rstrip() would also eat
        # trailing separators such as tabs and silently drop trailing empty
        # fields from the header or the record.
        header = header_line.rstrip("\r\n").split(sep)

        for line in lines:
            fields = line.rstrip("\r\n").split(sep)
            yield dict(zip(header, fields))
|
|
|
|
|
|
|
|
|
|
def record_matches_values(record, match_field_values):
    """Tell whether a record satisfies every field constraint.

    Args:
        record: Mapping indexed by field name (for example a dict produced
            from a header line and a data line).
        match_field_values: Iterable of ``(field, allowed_values)`` pairs.

    Returns:
        True when, for every pair, ``record[field]`` is contained in
        ``allowed_values`` (vacuously True for no pairs); False as soon as
        one pair fails, without looking at the remaining pairs.

    Raises:
        Whatever ``record[field]`` raises for a field that is checked but
        not present (``KeyError`` for a dict).
    """
    return all(
        record[field_name] in allowed
        for field_name, allowed in match_field_values
    )
|