1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
#!/usr/bin/python2.5
"""
This application converts the various text files stored in the source-data
directory into a pickled python object to be used by the random data
generator scripts
Copyright (C) 2007 Chris Moffitt
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
import csv
import string
import cPickle as pickle
import random
import os
# Directory holding the raw text data files that feed the generator.
data_dir = "barnum/source-data"

# One-entry-per-line files that need no special parsing; each file is
# pickled as its own list, in this exact order.  Consumers unpickling
# source-data.pkl rely on this ordering.
simple_files_to_process = ['street-names.txt', 'street-types.txt', 'latin-words.txt',
                           'email-domains.txt', 'job-titles.txt', 'company-names.txt',
                           'company-types.txt']
def _load_name_list(filename):
    # Read a name file from data_dir: one entry per line, keep only the
    # first space-separated token, capitalized via string.capwords.
    names = []
    name_file = open(os.path.join(data_dir, filename), "rb")
    try:
        for line in name_file:
            names.append(string.capwords(line.rstrip('\n').split(' ')[0]))
    finally:
        name_file.close()
    return names


def _load_simple_list(filename):
    # Read a simple file from data_dir: one entry per line, newline stripped.
    entries = []
    simple_file = open(os.path.join(data_dir, filename), "rb")
    try:
        for line in simple_file:
            entries.append(line.rstrip('\n'))
    finally:
        simple_file.close()
    return entries


def load_files():
    """Convert the source-data text files into the pickled source-data.pkl.

    Objects are dumped sequentially into one pickle stream in this order:
      1. zip codes       {zip: [City, state_abbrev]}
      2. area codes      {state_abbrev: [area_code, ...]}
      3. last names      [str, ...]
      4. male first names
      5. female first names
      6. one list per file in simple_files_to_process
    Readers must unpickle in the same order.  All input files, and the
    output file, are closed even if a parse error occurs (the original
    leaked the zip-codes handle and left files open on error).
    """
    output = open('source-data.pkl', 'wb')
    try:
        # Zip codes: CSV with zip in column 0, city in column 3,
        # state abbreviation in column 4.
        zip_file = open(os.path.join(data_dir, "zip-codes.txt"), "rb")
        try:
            all_zips = {}
            for row in csv.reader(zip_file):
                all_zips[row[0]] = [string.capwords(row[3]), row[4]]
        finally:
            zip_file.close()
        pickle.dump(all_zips, output)

        # Area codes: lines like "NY: 212, 315" -> {'NY': ['212', '315']}.
        # Spaces are removed before slicing off the 2-letter state + ':'
        # (clean_line[3:]); the key is taken from the raw line before ':'.
        area_code_file = open(os.path.join(data_dir, "area-codes.txt"), "rb")
        try:
            state_area_codes = {}
            for line in area_code_file:
                clean_line = line.replace(' ', '').rstrip('\n')
                state_area_codes[line.split(':')[0]] = clean_line[3:].split(',')
        finally:
            area_code_file.close()
        pickle.dump(state_area_codes, output)

        # Name lists: identical parsing for all three files, so one helper.
        pickle.dump(_load_name_list("last-name.txt"), output)
        pickle.dump(_load_name_list("male-first-name.txt"), output)
        pickle.dump(_load_name_list("female-first-name.txt"), output)

        # Simple one-entry-per-line files, one pickled list per file.
        for f in simple_files_to_process:
            pickle.dump(_load_simple_list(f), output)
    finally:
        output.close()
if __name__ == "__main__":
    # Rebuilding is destructive (overwrites source-data.pkl), so require
    # an explicit, case-insensitive 'yes' before proceeding.
    prompt = "Type 'yes' to reload the data from source files and create a new source file: "
    answer = string.lower(raw_input(prompt))
    if answer == 'yes':
        load_files()
|