-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
194 lines (161 loc) · 6.66 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import json
import redis
import pandas as pd
# Lookup tables for the precomposed Hangul syllable block (U+AC00 '가' .. U+D7A3 '힣').
# A syllable's offset from '가' is (choseong * 21 + jungseong) * 28 + jongseong.
# They are built once here instead of on every call.
_HANGUL_FIRST = '가'
_HANGUL_LAST = '힣'
_JUNGSEONG_COUNT = 21
_JONGSEONG_COUNT = 28
_CHOSEONG_CHARS = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
_JUNGSEONG_CHARS = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"
# Index 0 is the empty string: the syllable has no final consonant.
_JONGSEONG_CHARS = (
    '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ',
    'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ',
    'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
)
# Compound finals / vowels are expanded into the two jamo they are built from.
_DOUBLE_JONGSEONG_MAP = {
    'ㄳ': 'ㄱㅅ', 'ㄵ': 'ㄴㅈ', 'ㄶ': 'ㄴㅎ', 'ㄺ': 'ㄹㄱ',
    'ㄻ': 'ㄹㅁ',
    'ㄼ': 'ㄹㅂ', 'ㄽ': 'ㄹㅅ', 'ㄾ': 'ㄹㅌ', 'ㄿ': 'ㄹㅍ',
    'ㅀ': 'ㄹㅎ', 'ㅄ': 'ㅂㅅ',
}
_DOUBLE_JUNGSEONG_MAP = {
    'ㅘ': 'ㅗㅏ', 'ㅙ': 'ㅗㅐ', 'ㅚ': 'ㅗㅣ', 'ㅝ': 'ㅜㅓ',
    'ㅞ': 'ㅜㅔ', 'ㅟ': 'ㅜㅣ', 'ㅢ': 'ㅡㅣ',
}


def decompose_korean_char(char):
    """
    Decompose one precomposed Hangul syllable into (choseong, jungseong, jongseong).

    Compound vowels and compound final consonants are expanded into their two
    constituent jamo (for example 'ㅘ' -> 'ㅗㅏ', 'ㄺ' -> 'ㄹㄱ'), so the
    jungseong and jongseong parts may each be one or two characters long.

    Args:
        char: A string, normally a single character.

    Returns:
        A 3-tuple of strings. For a Hangul syllable this is
        (choseong, jungseong, jongseong), where jongseong is '' when the
        syllable has no final consonant. Any other input is passed through
        unchanged as (char, '', '').
    """
    # A multi-character string can sort inside the Hangul range and would then
    # crash in ord(); treat anything that is not exactly one syllable as
    # pass-through input.
    if len(char) != 1 or not (_HANGUL_FIRST <= char <= _HANGUL_LAST):
        return char, '', ''

    syllable_index = ord(char) - ord(_HANGUL_FIRST)
    lead_and_vowel, jongseong_index = divmod(syllable_index, _JONGSEONG_COUNT)
    choseong_index, jungseong_index = divmod(lead_and_vowel, _JUNGSEONG_COUNT)

    jungseong = _JUNGSEONG_CHARS[jungseong_index]
    jongseong = _JONGSEONG_CHARS[jongseong_index]
    return (
        _CHOSEONG_CHARS[choseong_index],
        _DOUBLE_JUNGSEONG_MAP.get(jungseong, jungseong),
        _DOUBLE_JONGSEONG_MAP.get(jongseong, jongseong),
    )
def generate_autocomplete_data(input_str):
    """
    Build (jamo_prefix, typed_text) pairs for every keystroke-level prefix of input_str.

    Each character is decomposed with decompose_korean_char and its jamo are
    appended one at a time to a running prefix. After every appended jamo a
    pair is emitted whose first element is the jamo prefix so far and whose
    second element is the part of input_str up to and including the current
    character.

    Compound vowels and compound finals are decomposed into two jamo, and a
    prefix is emitted after EACH of them. For '과' this yields 'ㄱ', 'ㄱㅗ' and
    'ㄱㅗㅏ'. Appending the two jamo as one chunk would skip 'ㄱㅗ', so a lookup
    by exact prefix could not match '과' while the user has only typed '고'.

    Args:
        input_str: The text to index.

    Returns:
        A list of (jamo_prefix, typed_text) tuples in typing order. Empty
        input yields an empty list.
    """
    result = []
    jamo_prefix = ""
    typed_text = ""
    for char in input_str:
        typed_text += char
        # Each part is '' (skipped), one jamo, or two jamo for compound
        # vowels / finals; non-Hangul characters come back as (char, '', '').
        for part in decompose_korean_char(char):
            for jamo in part:
                jamo_prefix += jamo
                result.append((jamo_prefix, typed_text))
    return result
def apply_highlighting(org_data, highlighted_data):
    """
    Wrap the first occurrence of highlighted_data in org_data with <strong> tags.

    Args:
        org_data: The full display text.
        highlighted_data: The substring to emphasise.

    Returns:
        org_data with its first occurrence of highlighted_data wrapped in
        <strong>...</strong>. org_data is returned unchanged when
        highlighted_data is empty or does not occur in org_data (for example
        when the match came from a candidate with its spaces removed).
    """
    # An empty needle "matches" at index 0 and would only add an empty tag pair.
    if not highlighted_data:
        return org_data

    start = org_data.find(highlighted_data)
    if start == -1:
        return org_data

    end = start + len(highlighted_data)
    return f"{org_data[:start]}<strong>{highlighted_data}</strong>{org_data[end:]}"
def _add_autocomplete_entries(index, seen, originals, category, progress_message=None):
    """
    Index every keystroke prefix of each original string under the given category.

    For each original string the candidates are the string itself, each of its
    whitespace-separated words, and the string with spaces removed. Every jamo
    prefix produced by generate_autocomplete_data for a candidate becomes a key
    in index, mapping to a list of display entries.

    Args:
        index: Dict of prefix -> list of entry dicts; updated in place.
        seen: Set of (prefix, org_display, highlighted_display, category)
            tuples already stored; updated in place. It replaces a linear
            "entry not in list" scan, which gets slow for short prefixes
            shared by many entries.
        originals: Values to index. Non-string values are skipped.
        category: Label stored in each entry's 'category' field.
        progress_message: When given, printed with "i / total" every 100 items.
    """
    total = len(originals)
    for i, org_data in enumerate(originals):
        if progress_message is not None and i % 100 == 0:
            print(progress_message, i, "/", total)
        # Empty CSV cells are read by pandas as float NaN, which has no .split().
        if not isinstance(org_data, str):
            continue
        candidates = [org_data, *org_data.split(), org_data.replace(" ", "")]
        for candidate in candidates:
            for gd, gdc in generate_autocomplete_data(candidate):
                highlighted = apply_highlighting(org_data, gdc)
                entry_key = (gd, org_data, highlighted, category)
                if entry_key in seen:
                    continue
                seen.add(entry_key)
                index.setdefault(gd, []).append({
                    'org_display': org_data,
                    'highlighted_display': highlighted,
                    'category': category,
                })


file_path = 'restaurants.csv'
menu_file_path = 'menus.csv'
df = pd.read_csv(file_path)
menu_df = pd.read_csv(menu_file_path)
restaurant_names = df['name'].tolist()
restaurant_categories = df['category'].tolist()
# dict.fromkeys de-duplicates like set() but keeps first-seen order, so the
# order of entries written to Redis is the same on every run.
restaurant_custom_categories = list(dict.fromkeys(df['custom_category'].tolist()))
menu_names = menu_df['menu_name'].tolist()

data = {}
_seen_entries = set()

print("Generating autocomplete data...")
print("Generating autocomplete data for restaurant names...", len(restaurant_names))
_add_autocomplete_entries(data, _seen_entries, restaurant_names, '가게 이름')

print("Generating autocomplete data for restaurant categories...", len(restaurant_categories))
_add_autocomplete_entries(data, _seen_entries, restaurant_categories, '(소)카테고리')

print("Generating autocomplete data for restaurant custom categories...", len(restaurant_custom_categories))
_add_autocomplete_entries(data, _seen_entries, restaurant_custom_categories, '(대)카테고리')

print("Generating autocomplete data for menu names...", len(menu_names))
_add_autocomplete_entries(
    data, _seen_entries, menu_names, '메뉴 이름',
    progress_message="Generating autocomplete data for menu names...",
)

r = redis.Redis(host='skku-redis', port=6379, db=0)
version = '20240403232030'
ttl_seconds = 3600 * 24 * 2  # two days, shared by the data keys and the version key

print("Saving autocomplete data to Redis...")
for key, value in data.items():
    r.setex('restaurant:v1:' + version + ':' + key, ttl_seconds, json.dumps(value))

# Publish the version key only after every data key is written, so a reader
# that follows it does not land on a half-populated key set.
# NOTE(review): assumes readers resolve keys via 'restaurant:v1:version' - confirm.
r.setex('restaurant:v1:version', ttl_seconds, version)

print("Autocomplete data generation completed.")