#!/usr/bin/python2.5
"""
This script converts the various text files stored in the source-data
directory into a series of pickled Python objects, written to
source-data.pkl, for use by the random data generator scripts.

Copyright (C) 2007 Chris Moffitt
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""


import csv
import string
import cPickle as pickle
import random
import os

data_dir = "barnum/source-data"
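# NOTE: data_dir is a relative path, so the script presumably expects to be
# run from the directory that contains barnum/; the output file
# source-data.pkl is likewise written to the current working directory.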
simple_files_to_process = ['street-names.txt', 'street-types.txt', 'latin-words.txt',
                            'email-domains.txt', 'job-titles.txt', 'company-names.txt',
                            'company-types.txt']

def load_files():
    # Process zip codes.  The columns used from zip-codes.txt appear to be:
    # column 0 = zip code, column 3 = city name, column 4 = state abbreviation.
    all_zips = {}
    zip_file = open(os.path.join(data_dir, "zip-codes.txt"), "rb")
    reader = csv.reader(zip_file)
    for row in reader:
        data = [string.capwords(row[3]), row[4]]
        all_zips[row[0]] = data
    zip_file.close()

    output = open('source-data.pkl', 'wb')
    pickle.dump(all_zips, output)

    # Process area codes
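    # Each line appears to look like "MN: 218, 320, 507" (a two-letter state
    # abbreviation, a colon, then comma-separated area codes); this format is
    # inferred from the parsing below, not from a format specification.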
    area_code_file = open(os.path.join(data_dir, "area-codes.txt"), "rb")
    state_area_codes = {}
    for line in area_code_file:
        clean_line = line.replace(' ', '').rstrip('\n')
        state, codes = clean_line.split(':', 1)
        state_area_codes[state] = codes.split(',')
    pickle.dump(state_area_codes, output)
    area_code_file.close()

    # Process last names, male first names, and female first names.  Each file
    # has one name per line; only the first whitespace-separated word is kept,
    # capitalized.  Dump order matters: the generator scripts must unpickle
    # these lists in the same order they are written here.
    for filename in ("last-name.txt", "male-first-name.txt",
                     "female-first-name.txt"):
        names = []
        name_file = open(os.path.join(data_dir, filename), "rb")
        for line in name_file:
            clean_line = line.rstrip('\n')
            names.append(string.capwords(clean_line.split(' ')[0]))
        pickle.dump(names, output)
        name_file.close()

    # Process the simple one-entry-per-line files, dumping one list per file
    # in the order given by simple_files_to_process.
    for f in simple_files_to_process:
        temp = []
        sample_file = open(os.path.join(data_dir, f), "rb")
        for line in sample_file:
            temp.append(line.rstrip('\n'))
        pickle.dump(temp, output)
        sample_file.close()
    output.close()

if __name__ == "__main__":
    prompt = "Type 'yes' to reload the data from the source files and create a new source-data.pkl: "
    response = raw_input(prompt).lower()
    if response == 'yes':
        load_files()
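
# Example run (from the directory that contains barnum/; this location is an
# assumption based on the relative data_dir above):
#
#   $ python barnum/convert_data.py
#   Type 'yes' to reload the data from the source files and create a new source-data.pkl: yes
#
# On a 'yes' response, load_files() rebuilds source-data.pkl in the current
# working directory.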