-
Notifications
You must be signed in to change notification settings - Fork 196
/
Copy pathscript_variant.py
238 lines (201 loc) · 8.27 KB
/
script_variant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# coding:utf-8
# Copyright 2019 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import jautils
from unidecode import unidecode
import os.path
import re
import logging
def read_dictionary(file_name):
"""
Reads dictionary file.
Args:
file_name: file name.
format: kanji + '\t' + yomigana
Return:
{kanj: yomigana, ...}
"""
dictionary = {}
try:
if os.path.exists(file_name):
with open(file_name, 'r') as f:
for line in f:
if not line.strip() or line.strip()[0] == "#":
continue
kanji, hiragana = line.rstrip('\n').split('\t')
kanji = kanji.decode('utf-8')
hiragana = hiragana.decode('utf-8')
if kanji in dictionary:
dictionary[kanji].add(hiragana)
else:
dictionary[kanji] = set([hiragana])
except IOError:
return None
return dictionary
JAPANESE_NAME_LOCATION_DICTIONARY = read_dictionary('japanese_name_location_dict.txt')
CHINESE_FAMILY_NAME_DICTIONARY = read_dictionary('chinese_family_name_dict.txt')
def has_kanji(word):
"""
Returns whether word contains kanji or not.
"""
# [\u3400-\u9fff]: kanji
# TODO: update this range (some of kanji characters out of this range.)
return re.match(ur'([\u3400-\u9fff])', word)
def romanize_single_japanese_word(word):
"""
This method romanizes a single Japanese word using a dictionary.
If the word isn't found in the dictionary, this method returns the word as is.
This method can return multiple romanizations
(because there are multiple ways to read the same kanji name in Japanese).
This method doesn't support romanizing full names using first/last
names in the dictionary.
Returns:
[romanized_jp_word, ...]
"""
if not word:
return ['']
if word in JAPANESE_NAME_LOCATION_DICTIONARY:
yomigana_list = JAPANESE_NAME_LOCATION_DICTIONARY[word]
return [jautils.hiragana_to_romaji(yomigana)
for yomigana in yomigana_list]
return [word]
def romanize_japanese_word(word, for_index=True):
"""
This method romanizes a Japanese text chunk using a dictionary.
If the word isn't found in the dictionary, this method returns the word as is.
This method can return multiple romanizations
(because there are multiple ways to read the same kanji name in Japanese).
This method can romanize full names without a white space (e.g., "山田太郎")
if the first/last names are in the dictionary.
Args:
for_index: Set this to True for indexing purpose.
Set this to False when you want to romanize query text.
Returns:
[romanized_jp_word, ...]
"""
if not word:
return ['']
words = set()
for index in xrange(1, len(word)):
# Splits the word to support romanizing fullname without space.
# If the query is a full name without white space(e.g., "山田太郎"),
# and the first/last name is in the dictionary,
# but the full name is not in the dictionary,
# it can still return romanization "yamadataro".
first_part = word[:index]
last_part = word[index:]
romanized_first_parts = romanize_single_japanese_word(
first_part)
romanized_last_parts = romanize_single_japanese_word(
last_part)
for romanized_first_part in romanized_first_parts:
for romanized_last_part in romanized_last_parts:
if (romanized_first_part != first_part and
romanized_last_part != last_part):
words.add(romanized_first_part + romanized_last_part)
# For indexing purpose, if the input is "山田太郎", we need
# to add "yamada" and "taro" in addition to "yamadataro"
# because it must match queries e.g., "山田" or "太郎".
#
# But, when we apply this method for search queries, we
# must not do this. If the search query is [山田太郎] and
# we return ["yamadataro", "yamada", "taro"], the query
# will be "yamadataro OR yamada OR taro". Then it will
# also match records with name "yamada hanako" etc., which
# is bad.
#
# TODO(ichikawa) Consider applying this for search queries,
# but construct a query 'yamadataro OR "yamada taro"'
# instead. Then it will also match a record with name
# "yamada taro".
if for_index:
words.add(romanized_first_part)
words.add(romanized_last_part)
words.update(romanize_single_japanese_word(word))
return list(words)
def romanize_word_by_unidecode(word):
"""
This method romanizes all languages by unidecode.
If word is hiragana or katakana, it is romanized by jautils.
kanji is romanized in Chinese way.
Args:
word: should be script varianted
Returns:
an array of romanzied_word by unidecode [romanized_word]
"""
if not word:
return ['']
if jautils.should_normalize(word):
hiragana_word = jautils.normalize(word)
return [jautils.hiragana_to_romaji(hiragana_word)]
romanized_word = unidecode(word)
return [romanized_word.strip()]
def split_chinese_name(word):
"""
This method tries to split a chinese name into two parts: family_name and given_name
Args:
word: a chinese name string
Returns:
family_name and given_name if it is a valid chinese name
else None, None
"""
if not word:
return None, None
word = word.replace(" ", "")
if not re.search(ur'^[\u3400-\u9fff]+$', word):
return None, None
for i in range(1,3):
if word[:i] in CHINESE_FAMILY_NAME_DICTIONARY:
return word[:i], word[i:]
return None, None
def romanize_chinese_name(word):
"""
This method romanizes a chinese person name including family_name and given_name
Returns:
Romanized Chinese name
"""
family_name, given_name = split_chinese_name(word)
if not family_name:
return []
romanized_surname_list = list(CHINESE_FAMILY_NAME_DICTIONARY[family_name])
return [romanized_surname_list[0] + unidecode(given_name).strip()]
def romanize_search_query(word):
"""
This method romanizes all languages for search query.
If word is hiragana or katakana, it is romanized by jautils.
Args:
word: should be script varianted
Returns:
[romanized_word, ... ]
(if word can be romanized by unidecode and jp_dictionary,
returns multiple romanizations.)
"""
if not word:
return []
romanized_words = []
if has_kanji(word):
romanized_words = romanize_japanese_word(word, for_index=False)
if jautils.should_normalize(word):
hiragana_word = jautils.normalize(word)
romanized_words.append(jautils.hiragana_to_romaji(hiragana_word))
# if the name is a Chinese name and the chinese_romanize can produce
# a different result, append the result to the romanzied_words with
# unidecode results together
unidecode_romanize_word = unidecode(word).strip()
chinese_romanize_list = romanize_chinese_name(word)
chinese_romanize_word = chinese_romanize_list[0] if chinese_romanize_list else ''
if chinese_romanize_word and chinese_romanize_word != unidecode_romanize_word:
romanized_words.append(chinese_romanize_word)
romanized_words.append(unidecode_romanize_word)
return romanized_words