-
Notifications
You must be signed in to change notification settings - Fork 196
/
Copy pathindexing.py
254 lines (215 loc) · 9.82 KB
/
indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Support for approximate string prefix queries.
A hit is defined when the words entered in the query are all prefixes of one
of the words in the given and family names on the record. For example, a
record with the fields:
given_name: ABC 123
family_name: DEF 456
will be retrieved by:
"ABC 456"
"45 ED"
"123 ABC"
"ABC 123 DEF"
but will not be retrieved by:
"ABC 1234"
"ABC 123 DEF 456 789"
"""
from text_query import TextQuery
from google.appengine.ext import db
import unicodedata
import logging
import model
import re
import jautils
def update_index_properties(entity):
"""Finds and updates all prefix-related properties on the given entity."""
# Using set to make sure I'm not adding the same string more than once.
names_prefixes = set()
for property in entity._fields_to_index_properties:
for value in TextQuery(getattr(entity, property)).query_words:
if property in entity._fields_to_index_by_prefix_properties:
for n in xrange(1,len(value)+1):
pref = value[:n]
if pref not in names_prefixes:
names_prefixes.add(pref)
else:
if value not in names_prefixes:
names_prefixes.add(value)
# Add alternate names to the index tokens. We choose not to index prefixes
# of alternate names so that we can keep the index size small.
# TODI(ryok): This strategy works well for Japanese, but how about other
# languages?
names_prefixes |= get_alternate_name_tokens(entity)
# Put a cap on the number of tokens, just as a precaution.
MAX_TOKENS = 100
entity.names_prefixes = list(names_prefixes)[:MAX_TOKENS]
if len(names_prefixes) > MAX_TOKENS:
logging.debug('MAX_TOKENS exceeded for %s' %
' '.join(list(names_prefixes)))
def get_alternate_name_tokens(person):
"""Returns alternate name tokens and their variations."""
tokens = set(TextQuery(person.alternate_names).query_words)
# This is no-op for non-Japanese.
tokens |= set(jautils.get_additional_tokens(tokens))
return tokens
class CmpResults():
def __init__(self, query):
self.query = query
self.query_words_set = set(query.words)
def __call__(self, p1, p2):
if ((p1.primary_full_name and
p1.primary_full_name == p2.primary_full_name) or
((p1.given_name or p1.family_name) and
p1.given_name == p2.given_name and
p1.family_name == p2.family_name)):
return 0
self.set_ranking_attr(p1)
self.set_ranking_attr(p2)
r1 = self.rank(p1)
r2 = self.rank(p2)
if r1 == r2:
# if rank is the same sort by name so same names will be together
return cmp(p1._normalized_full_name.normalized,
p2._normalized_full_name.normalized)
else:
return cmp(r2, r1)
def set_ranking_attr(self, person):
"""Consider save these into to db"""
if not hasattr(person, '_normalized_given_name'):
person._normalized_given_name = TextQuery(person.given_name)
person._normalized_family_name = TextQuery(person.family_name)
person._normalized_full_name = TextQuery(person.full_name)
person._name_words = set(person._normalized_full_name.words)
person._alt_name_words = set(
TextQuery(person.alternate_names).words)
# TODO(ryok): re-consider the ranking putting more weight on full_name (a
# required field) instead of given name and family name pair (optional).
def rank(self, person):
# The normalized query words, in the order as entered.
ordered_words = self.query.normalized.split()
if (ordered_words ==
person._normalized_given_name.words +
person._normalized_family_name.words):
# Matches a Latin name exactly (given name followed by surname).
return 10
if (re.match(ur'^[\u3400-\u9fff]$', person.family_name) and
ordered_words in [
[person.family_name + person.given_name],
[person.family_name, person.given_name]
]):
# Matches a CJK name exactly (surname followed by given name).
return 10
if (re.match(ur'^[\u3400-\u9fff]+$', person.family_name) and
ordered_words in [
[person.family_name + person.given_name],
[person.family_name, person.given_name]
]):
# Matches a CJK name exactly (surname followed by given name).
# A multi-character surname is uncommon, so it is ranked a bit lower.
return 9.5
if (ordered_words ==
person._normalized_family_name.words +
person._normalized_given_name.words):
# Matches a Latin name with given and family name switched.
return 9
if (re.match(ur'^[\u3400-\u9fff]$', person.given_name) and
ordered_words in [
[person.given_name + person.family_name],
[person.given_name, person.family_name]
]):
# Matches a CJK name with surname and given name switched.
return 9
if (re.match(ur'^[\u3400-\u9fff]+$', person.given_name) and
ordered_words in [
[person.given_name + person.family_name],
[person.given_name, person.family_name]
]):
# Matches a CJK name with surname and given name switched.
# A multi-character surname is uncommon, so it's ranked a bit lower.
return 8.5
if person._name_words == self.query_words_set:
# Matches all the words in the given and family name, out of order.
return 8
if self.query.normalized in [
person._normalized_given_name.normalized,
person._normalized_family_name.normalized,
]:
# Matches the given name exactly or the family name exactly.
return 7
if person._name_words.issuperset(self.query_words_set):
# All words in the query appear somewhere in the name.
return 6
# Count the number of words in the query that appear in the name and
# also in the alternate names.
matched_words = person._name_words.union(
person._alt_name_words).intersection(self.query_words_set)
return min(5, 1 + len(matched_words))
def rank_and_order(results, query, max_results):
results.sort(CmpResults(query))
return results[:max_results]
def sort_query_words(query_words):
"""Sort query_words so that the query filters created from query_words are
more effective and consistent when truncated due to NeedIndexError, and
return the sorted list."""
# (1) Sort them lexicographically so that we return consistent search
# results for query 'AA BB CC DD' and 'DD AA BB CC' even when filters
# are truncated.
sorted_query_words = sorted(query_words)
# (2) Sort them according to popularity so that less popular query words,
# which are usually more effective filters, come first.
sorted_query_words = jautils.sorted_by_popularity(sorted_query_words)
# (3) Sort them according to the lengths so that longer query words,
# which are usually more effective filters, come first.
return sorted(sorted_query_words, key=len, reverse=True)
def search(repo, query_obj, max_results):
# As there are limits on the number of filters that we can apply and the
# number of entries we can fetch at once, the order of query words could
# potentially matter. In particular, this is the case for most Japanese
# names, many of which consist of 4 to 6 Chinese characters, each
# coresponding to an additional filter.
query_words = sort_query_words(query_obj.query_words)
logging.debug('query_words: %r' % query_words)
# First try the query with all the filters, and then keep backing off
# if we get NeedIndexError.
fetch_limit = 400
fetched = []
filters_to_try = len(query_words)
while filters_to_try:
query = model.Person.all_in_repo(repo)
for word in query_words[:filters_to_try]:
query.filter('names_prefixes =', word)
try:
fetched = query.fetch(fetch_limit)
logging.debug('query succeeded with %d filters' % filters_to_try)
break
except db.NeedIndexError:
filters_to_try -= 1
continue
logging.debug('indexing.search fetched: %d' % len(fetched))
# Now perform any filtering that App Engine was unable to do for us.
matched = []
for result in fetched:
for word in query_words:
if word not in result.names_prefixes:
break
else:
matched.append(result)
logging.debug('indexing.search matched: %d' % len(matched))
if len(fetched) == fetch_limit and len(matched) < max_results:
logging.debug('Warning: Fetch reached a limit of %d, but only %d '
'exact-matched the query (max_results = %d).' %
(fetch_limit, len(matched), max_results))
# Now rank and order the results.
return rank_and_order(matched, query_obj, max_results)