Skip to content
This repository was archived by the owner on Nov 21, 2017. It is now read-only.

Commit 4e7c988

Browse files
committed
Issue #1: Add support for whoosh Schemas
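Note that the parser now requires unicode query strings to function: make_parser wraps parse() with an assertion that rejects non-unicode input, and this applies whether or not a Schema is supplied.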
1 parent d9a40b5 commit 4e7c988

File tree

3 files changed

+131
-41
lines changed

3 files changed

+131
-41
lines changed

l2cs.py

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@
99

1010
import sys
1111

12+
import whoosh.fields
1213
import whoosh.qparser.default
1314
import whoosh.qparser.plugins
1415
import whoosh.qparser.syntax
1516
import whoosh.qparser.taggers
1617
import whoosh.query
1718

1819

19-
__version__ = "1.0.8"
20+
__version__ = "2.0.0"
2021

2122

2223
HANDLERS = {}
@@ -234,8 +235,32 @@ def do_minus(self, parser, group):
234235

235236
def make_parser(default_field='text', plugins=DEFAULT_PLUGINS, schema=None,
236237
int_fields=None, yesno_fields=None, aliases=None):
238+
'''Helper function to create a QueryParser.
239+
240+
Parameters:
241+
default_field: the default field to search against for non-field
242+
queries
243+
plugins: a list of plugins to use when parsing
244+
schema: If provided, a schema to check fieldnames against. If not
245+
provided, any query of the form "foo:bar" will yield searches
246+
against the "foo" field; if provided and "foo" is not a field,
247+
then the search will look for "foo bar" in the default_field.
248+
NOTE: If provided, search queries MUST use unicode
249+
int_fields: A list of fields that expect integer values from
250+
CloudSearch
251+
yesno_fields: A list of fields to convert "yes" and "no" queries to
252+
boolean 1 / 0 searches
253+
aliases: A dictionary of aliases to use for the AliasPlugin
254+
255+
'''
237256
parser = whoosh.qparser.default.QueryParser(default_field, schema,
238257
plugins=plugins)
258+
parser_parse = parser.parse
259+
def parse(text, *args, **kwargs):
260+
assert isinstance(text, unicode), 'Cannot parse non-unicode objects (%r)' % text
261+
return parser_parse(text, *args, **kwargs)
262+
parser.parse = parse
263+
parser.parse.__doc__ = parser_parse.__doc__
239264
if int_fields:
240265
parser.add_plugin(IntNodePlugin(int_fields))
241266
if yesno_fields:
@@ -245,26 +270,58 @@ def make_parser(default_field='text', plugins=DEFAULT_PLUGINS, schema=None,
245270
return parser
246271

247272

273+
def make_schema(fields, datefields=()):
    '''Build a whoosh.fields.Schema from plain lists of field names.

    Parameters:
        fields: names to register with whoosh.fields.TEXT
        datefields: optional names to register with whoosh.fields.DATETIME;
            a name listed in both arguments ends up as DATETIME

    '''
    schema_fields = {}
    for name in fields:
        schema_fields[name] = whoosh.fields.TEXT
    # Date fields are applied second so they win over a TEXT entry of the
    # same name; a falsy datefields value is treated as "none supplied".
    for name in datefields or ():
        schema_fields[name] = whoosh.fields.DATETIME
    return whoosh.fields.Schema(**schema_fields)
284+
285+
248286
def convert(query, parser):
    '''Parse ``query`` with ``parser`` and return the converted query text.

    The parsed clause is handed to walk_clause and the pieces it yields are
    joined into a single unicode string.

    '''
    return u''.join(walk_clause(parser.parse(query)))
252290

253291

254-
def __sample_parser():
292+
def __sample_parser(schema=None):
    '''Return the demo parser used by main(), optionally bound to ``schema``.'''
    sample_options = {
        "int_fields": ["count", "number"],
        "yesno_fields": ["active", "ready"],
        "aliases": {"alias": ["alias1", "alias2"]},
        "schema": schema,
    }
    return make_parser(**sample_options)
297+
298+
299+
def __sample_schema():
    '''Return the demo schema that accompanies __sample_parser().'''
    sample_fields = [
        "foo", "bar", "baz", "count", "number", "active",
        "text", "ready", "active", "alias", "alias1",
        "alias2",
    ]
    return make_schema(sample_fields)
258303

259304

260305
def main(args):
261-
'''For command line experimentation'''
262-
query = ' '.join(args[1:])
306+
'''For command line experimentation. Sample output:
307+
308+
$ python l2cs.py 'foo:bar AND baz:bork'
309+
Lucene input: foo:bar AND baz:bork
310+
Parsed representation: And([Term(u'foo', u'bar'), Term(u'baz', u'bork')])
311+
Lucene form: (foo:bar AND baz:bork)
312+
Cloudsearch form: (and (field foo 'bar') (field baz 'bork'))
313+
314+
'''
315+
args = [unicode(u, 'utf-8') for u in args[1:]]
316+
schema = __sample_schema() if "--schema" in args else None
317+
if schema:
318+
args.pop(args.index("--schema"))
319+
query = u' '.join(args)
263320
print "Lucene input:", query
264-
parser = __sample_parser()
321+
parser = __sample_parser(schema=schema)
265322
parsed = parser.parse(query)
266323
print "Parsed representation:", repr(parsed)
267-
print "Lucene form:", str(parsed)
324+
print "Lucene form:", unicode(parsed)
268325
cloudsearch_query = ''.join(walk_clause(parsed))
269326
print "Cloudsearch form:", cloudsearch_query
270327

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
use_setuptools()
88
from setuptools import setup
99

10-
version = "1.0.8"
10+
version = "2.0.0"
1111

1212
setup(
1313
name='l2cs',

test_l2cs.py

Lines changed: 65 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,115 +13,148 @@ def setUp(self):
1313
self.parser = l2cs.make_parser(int_fields=["count", "number"],
1414
yesno_fields=["active", "ready"],
1515
aliases={"alias": ["alias1", "alias2"]})
16+
self.schema = l2cs.make_schema(["foo", "bar", "baz", "count", "number",
17+
"active", "text", "ready", "active",
18+
"alias", "alias1", "alias2"],
19+
["timestamp", "date"])
20+
self.schema_parser = l2cs.make_parser(int_fields=["count", "number"],
21+
yesno_fields=["active", "ready"],
22+
aliases={"alias": ["alias1",
23+
"alias2"]},
24+
schema=self.schema)
1625

1726
def tearDown(self):
    '''Release the parser and schema fixtures after each test.'''
    self.parser = self.schema = self.schema_parser = None
1930

2031
def _run_test(self, input_, expected, parser=None):
    '''Parse ``input_``, convert it, and assert the result equals ``expected``.

    Uses ``self.parser`` when no (truthy) ``parser`` is given. The failure
    message shows the input, the parsed tree, the result and the expectation.

    '''
    if not parser:
        parser = self.parser
    parsed = parser.parse(input_, debug=DEBUG)
    result = u''.join(l2cs.walk_clause(parsed))
    errmsg = ("\ninput: %s\nparsed: %r\nresult: %s\nexpected: %s" %
              (input_, parsed, result, expected))
    self.assertEqual(result, expected, errmsg)
2839

2940
# basic fields
3041
def test_fields1(self):
31-
self._run_test("foo", "(field text 'foo')")
42+
self._run_test(u"foo", u"(field text 'foo')")
3243
def test_fields2(self):
33-
self._run_test("foo:bar", "(field foo 'bar')")
44+
self._run_test(u"foo:bar", u"(field foo 'bar')")
3445

3546
# phrases
3647
def test_phrases1(self):
37-
self._run_test('"foo bar baz"', "(field text 'foo bar baz')")
48+
self._run_test(u'"foo bar baz"', u"(field text 'foo bar baz')")
3849

3950
# AND clauses
4051
def test_and1(self):
41-
self._run_test("foo AND bar", "(and (field text 'foo') (field text 'bar'))")
52+
self._run_test(u"foo AND bar", u"(and (field text 'foo') (field text 'bar'))")
4253
def test_and2(self):
43-
self._run_test("foo AND bar:baz", "(and (field text 'foo') (field bar 'baz'))")
54+
self._run_test(u"foo AND bar:baz", u"(and (field text 'foo') (field bar 'baz'))")
4455

4556
# OR clauses
4657
def test_or1(self):
47-
self._run_test("foo OR bar", "(or (field text 'foo') (field text 'bar'))")
58+
self._run_test(u"foo OR bar", u"(or (field text 'foo') (field text 'bar'))")
4859
def test_or2(self):
49-
self._run_test("bar:baz OR foo", "(or (field bar 'baz') (field text 'foo'))")
60+
self._run_test(u"bar:baz OR foo", u"(or (field bar 'baz') (field text 'foo'))")
5061

5162
# NOT clauses
5263
def test_not1(self):
53-
self._run_test("NOT foo", "(not (field text 'foo'))")
64+
self._run_test(u"NOT foo", u"(not (field text 'foo'))")
5465
def test_not2(self):
55-
self._run_test("baz NOT bar", "(and (field text 'baz') (not (field text 'bar')))")
66+
self._run_test(u"baz NOT bar", u"(and (field text 'baz') (not (field text 'bar')))")
5667
def test_not3(self):
57-
self._run_test("foo:bar NOT foo:baz", "(and (field foo 'bar') (not (field foo 'baz')))")
68+
self._run_test(u"foo:bar NOT foo:baz", u"(and (field foo 'bar') (not (field foo 'baz')))")
5869
def test_not4(self):
59-
self._run_test("bar AND foo:-baz", "(and (field text 'bar') (not (field text 'baz')))")
70+
self._run_test(u"bar AND foo:-baz", u"(and (field text 'bar') (not (field text 'baz')))")
6071
def test_not5(self):
6172
'''Stray hyphens at the end should not count as NOTs'''
62-
self._run_test("foo:bar -", "(and (field foo 'bar') (field text '-'))")
73+
self._run_test(u"foo:bar -", u"(and (field foo 'bar') (field text '-'))")
6374
def test_not6(self):
6475
'''Stray hyphens at the end should not NOT, even with spaces'''
65-
self._run_test("foo:bar - ", "(and (field foo 'bar') (field text '-'))")
76+
self._run_test(u"foo:bar - ", u"(and (field foo 'bar') (field text '-'))")
6677
def test_not7(self):
6778
'''Duplicate hyphens should be smooshed into one not clause'''
68-
self._run_test("test --foo", "(and (field text 'test') (not (field text 'foo')))")
79+
self._run_test(u"test --foo", u"(and (field text 'test') (not (field text 'foo')))")
6980
def test_not8(self):
7081
'''Duplicate hyphens hanging around in the middle of nowhere'''
71-
self._run_test("test -- foo", "(and (field text 'test') (field text '--') (field text 'foo'))")
82+
self._run_test(u"test -- foo", u"(and (field text 'test') (field text '--') (field text 'foo'))")
7283
def test_not9(self):
7384
'''Duplicate hyphens, spaced out'''
74-
self._run_test("test - - foo", "(and (field text 'test') (field text '-') (field text 'foo'))")
85+
self._run_test(u"test - - foo", u"(and (field text 'test') (field text '-') (field text 'foo'))")
7586

7687
# quotes
7788
def test_quote1(self):
78-
self._run_test("hello:\"goodbye you're sir\"", "(field hello 'goodbye you\\'re sir')")
89+
self._run_test(u"hello:\"goodbye you're sir\"", u"(field hello 'goodbye you\\'re sir')")
7990
def test_quote2(self):
80-
self._run_test("hello:\"goodbye you''re sir\"", "(field hello 'goodbye you\\'\\'re sir')")
91+
self._run_test(u"hello:\"goodbye you''re sir\"", u"(field hello 'goodbye you\\'\\'re sir')")
8192

8293
# int fields
8394
def test_int1(self):
84-
self._run_test("count:12", "count:12")
95+
self._run_test(u"count:12", u"count:12")
8596
def test_int2(self):
86-
self._run_test("count:foo number:12 foo:bar", "(and number:12 (field foo 'bar'))")
97+
self._run_test(u"count:foo number:12 foo:bar", u"(and number:12 (field foo 'bar'))")
8798

8899
# yes/no fields
89100
def test_yesno1(self):
90-
self._run_test("ready:yes active:n", "(and ready:1 active:0)")
101+
self._run_test(u"ready:yes active:n", u"(and ready:1 active:0)")
91102

92103
# prefixes
93104
def test_prefix1(self):
94-
self._run_test("foo:bar*", "(field foo 'bar*')")
105+
self._run_test(u"foo:bar*", u"(field foo 'bar*')")
95106

96107
# Aliases
97108
def test_alias1(self):
98-
self._run_test("alias1:foo", "(field alias 'foo')")
109+
self._run_test(u"alias1:foo", u"(field alias 'foo')")
99110
def test_alias2(self):
100111
'''Make sure that referencing the base of the alias still works'''
101-
self._run_test("alias:foo", "(field alias 'foo')")
112+
self._run_test(u"alias:foo", u"(field alias 'foo')")
102113

103114
# NullQueries
104115
def test_null1(self):
105-
self._run_test('""', '')
116+
self._run_test(u'""', u'')
106117
def test_null2(self):
107-
self._run_test('foo:""', '')
118+
self._run_test(u'foo:""', u'')
108119
def test_null3(self):
109-
self._run_test('foo:"" bar:baz', "(field bar 'baz')")
120+
self._run_test(u'foo:"" bar:baz', u"(field bar 'baz')")
121+
122+
# Schema
123+
def test_schema1(self):
124+
self._run_test(u"foo:bar", u"(field foo 'bar')", self.schema_parser)
125+
def test_schema2(self):
126+
self._run_test(u"foo:bar notfoo:something", u"(and (field foo 'bar') (field text 'notfoo') (field text 'something'))", self.schema_parser)
127+
128+
# Unicode checks
129+
def test_unicode1(self):
130+
'''Non-unicode ASCII input should raise AssertionError'''
131+
self.assertRaises(AssertionError, self._run_test, 'foo:bar', u"(field foo 'bar')")
132+
def test_unicode2(self):
133+
'''Non-unicode UTF-8 input should raise AssertionError'''
134+
self.assertRaises(AssertionError, self._run_test, 'foo:\xe0\xb2\xa0_\xe0\xb2\xa0', u"(field foo '\u0ca0_\u0ca0')")
135+
def test_unicode4(self):
136+
'''Result of l2cs.convert should be unicode'''
137+
result = l2cs.convert(u'foo:bar', self.parser)
138+
self.assertIsInstance(result, unicode)
139+
def test_unicode3(self):
140+
'''Result of l2cs.convert should be unicode, part 2'''
141+
result = l2cs.convert(u'foo:\u0ca0_\u0ca0', self.parser)
142+
self.assertIsInstance(result, unicode)
110143

111144
### Test cases from resolved issues ###
112145
# The remaining test cases protect against issues that have been resolved
113146

114147
# Unsupported "+" syntax gets ignored, AndMaybe clauses are avoided
115148
def test_plus1(self):
116-
self._run_test("learn c++ programming", "(and (field text 'learn') (field text 'c++') (field text 'programming'))")
149+
self._run_test(u"learn c++ programming", u"(and (field text 'learn') (field text 'c++') (field text 'programming'))")
117150
def test_plus2(self):
118-
self._run_test("learn c++", "(and (field text 'learn') (field text 'c++'))")
151+
self._run_test(u"learn c++", u"(and (field text 'learn') (field text 'c++'))")
119152

120153
def test_minus_in_parentheses(self):
121-
self._run_test("text:baz AND url:(-foo AND bar)", "(and (field text 'baz') (not (field url 'foo')) (field url 'bar'))")
154+
self._run_test(u"text:baz AND url:(-foo AND bar)", u"(and (field text 'baz') (not (field url 'foo')) (field url 'bar'))")
122155

123156
def test_minus_midword(self):
124-
self._run_test("baz:foo-bar", "(field baz 'foo-bar')")
157+
self._run_test(u"baz:foo-bar", u"(field baz 'foo-bar')")
125158

126159

127160
if __name__ == '__main__':

0 commit comments

Comments
 (0)