Issue #1: Add support for whoosh Schemas

kemitche · kemitche · commit 4e7c98896549 · 2012-08-31T12:29:55.000-07:00
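Note that when using a Schema, whoosh requires Unicode objects to function,
so parsers built by make_parser() now assert that the text passed to parse()
is a unicode object; byte strings raise AssertionError, with or without a Schema.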
diff --git a/l2cs.py b/l2cs.py
@@ -9,14 +9,15 @@
 
 import sys
 
+import whoosh.fields
 import whoosh.qparser.default
 import whoosh.qparser.plugins
 import whoosh.qparser.syntax
 import whoosh.qparser.taggers
 import whoosh.query
 
 
-__version__ = "1.0.8"
+__version__ = "2.0.0"
 
 
 HANDLERS = {}
@@ -234,8 +235,32 @@ def do_minus(self, parser, group):
 
 def make_parser(default_field='text', plugins=DEFAULT_PLUGINS, schema=None,
                 int_fields=None, yesno_fields=None, aliases=None):
+    '''Helper function to create a QueryParser.
+    
+    Parameters:
+        default_field: the default field to search against for non-field
+                        queries
+        plugins: a list of plugins to use when parsing
+        schema: If provided, a schema to check fieldnames against. If not
+                provided, any query of the form "foo:bar" will yield searches
+                against the "foo" field; if provided and "foo" is not a field,
+                then the search will look for "foo bar" in the default_field.
+                NOTE: If provided, search queries MUST use unicode
+        int_fields: A list of fields that expect integer values from
+                    CloudSearch
+        yesno_fields: A list of fields to convert "yes" and "no" queries to
+                      boolean 1 / 0 searches
+        aliases: A dictionary of aliases to use for the AliasPlugin
+    
+    '''
     parser = whoosh.qparser.default.QueryParser(default_field, schema,
                                                 plugins=plugins)
+    parser_parse = parser.parse
+    def parse(text, *args, **kwargs):
+        assert isinstance(text, unicode), 'Cannot parse non-unicode objects (%r)' % text
+        return parser_parse(text, *args, **kwargs)
+    parser.parse = parse
+    parser.parse.__doc__ = parser_parse.__doc__
     if int_fields:
         parser.add_plugin(IntNodePlugin(int_fields))
     if yesno_fields:
@@ -245,26 +270,58 @@ def make_parser(default_field='text', plugins=DEFAULT_PLUGINS, schema=None,
     return parser
 
 
+def make_schema(fields, datefields=()):
+    '''Create a whoosh.fields.Schema object from a list of field names.
+    All fields will be set as TEXT fields. If datefields is supplied,
+    additionally create DATETIME fields with those names
+    
+    '''
+    fields = dict.fromkeys(fields, whoosh.fields.TEXT)
+    if datefields:
+        datefields = dict.fromkeys(datefields, whoosh.fields.DATETIME)
+        fields.update(datefields)
+    return whoosh.fields.Schema(**fields)
+
+
 def convert(query, parser):
     parsed = parser.parse(query)
     pieces = walk_clause(parsed)
-    return ''.join(pieces)
+    return u''.join(pieces)
 
 
-def __sample_parser():
+def __sample_parser(schema=None):
     return make_parser(int_fields=["count", "number"],
                        yesno_fields=["active", "ready"],
-                       aliases={"alias": ["alias1", "alias2"]})
+                       aliases={"alias": ["alias1", "alias2"]},
+                       schema=schema)
+
+
+def __sample_schema():
+    return make_schema(["foo", "bar", "baz", "count", "number", "active",
+                        "text", "ready", "active", "alias", "alias1",
+                        "alias2"])
 
 
 def main(args):
-    '''For command line experimentation'''
-    query = ' '.join(args[1:])
+    '''For command line experimentation. Sample output:
+    
+    $ python l2cs.py 'foo:bar AND baz:bork'
+    Lucene input: foo:bar AND baz:bork
+    Parsed representation: And([Term(u'foo', u'bar'), Term(u'baz', u'bork')])
+    Lucene form: (foo:bar AND baz:bork)
+    Cloudsearch form: (and (field foo 'bar') (field baz 'bork'))
+    
+    '''
+    args = [unicode(u, 'utf-8') for u in args[1:]]
+    schema = __sample_schema() if "--schema" in args else None
+    if schema:
+        args.pop(args.index("--schema"))
+    query = u' '.join(args)
     print "Lucene input:", query
-    parser = __sample_parser()
+    parser = __sample_parser(schema=schema)
     parsed = parser.parse(query)
     print "Parsed representation:", repr(parsed)
-    print "Lucene form:", str(parsed)
+    print "Lucene form:", unicode(parsed)
     cloudsearch_query = ''.join(walk_clause(parsed))
     print "Cloudsearch form:", cloudsearch_query
 
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
     use_setuptools()
     from setuptools import setup
 
-version = "1.0.8"
+version = "2.0.0"
 
 setup(
     name='l2cs',
diff --git a/test_l2cs.py b/test_l2cs.py
@@ -13,115 +13,148 @@ def setUp(self):
         self.parser = l2cs.make_parser(int_fields=["count", "number"],
                                        yesno_fields=["active", "ready"],
                                        aliases={"alias": ["alias1", "alias2"]})
+        self.schema = l2cs.make_schema(["foo", "bar", "baz", "count", "number",
+                                        "active", "text", "ready", "active",
+                                        "alias", "alias1", "alias2"],
+                                       ["timestamp", "date"])
+        self.schema_parser = l2cs.make_parser(int_fields=["count", "number"],
+                                              yesno_fields=["active", "ready"],
+                                              aliases={"alias": ["alias1",
+                                                                 "alias2"]},
+                                              schema=self.schema)
     
     def tearDown(self):
         self.parser = None
+        self.schema = None
+        self.schema_parser = None
     
     def _run_test(self, input_, expected, parser=None):
         parser = parser or self.parser
         parsed = parser.parse(input_, debug=DEBUG)
         pieces = l2cs.walk_clause(parsed)
-        result = ''.join(pieces)
+        result = u''.join(pieces)
         errmsg = ("\ninput: %s\nparsed: %r\nresult: %s\nexpected: %s" %
                   (input_, parsed, result, expected))
         self.assertEqual(result, expected, errmsg)
     
     # basic fields
     def test_fields1(self):
-        self._run_test("foo", "(field text 'foo')")
+        self._run_test(u"foo", u"(field text 'foo')")
     def test_fields2(self):
-        self._run_test("foo:bar", "(field foo 'bar')")
+        self._run_test(u"foo:bar", u"(field foo 'bar')")
     
     # phrases
     def test_phrases1(self):
-        self._run_test('"foo bar baz"', "(field text 'foo bar baz')")
+        self._run_test(u'"foo bar baz"', u"(field text 'foo bar baz')")
     
     # AND clauses
     def test_and1(self):
-        self._run_test("foo AND bar", "(and (field text 'foo') (field text 'bar'))")
+        self._run_test(u"foo AND bar", u"(and (field text 'foo') (field text 'bar'))")
     def test_and2(self):
-        self._run_test("foo AND bar:baz", "(and (field text 'foo') (field bar 'baz'))")
+        self._run_test(u"foo AND bar:baz", u"(and (field text 'foo') (field bar 'baz'))")
     
     # OR clauses
     def test_or1(self):
-        self._run_test("foo OR bar", "(or (field text 'foo') (field text 'bar'))")
+        self._run_test(u"foo OR bar", u"(or (field text 'foo') (field text 'bar'))")
     def test_or2(self):
-        self._run_test("bar:baz OR foo", "(or (field bar 'baz') (field text 'foo'))")
+        self._run_test(u"bar:baz OR foo", u"(or (field bar 'baz') (field text 'foo'))")
     
     # NOT clauses
     def test_not1(self):
-        self._run_test("NOT foo", "(not (field text 'foo'))")
+        self._run_test(u"NOT foo", u"(not (field text 'foo'))")
     def test_not2(self):
-        self._run_test("baz NOT bar", "(and (field text 'baz') (not (field text 'bar')))")
+        self._run_test(u"baz NOT bar", u"(and (field text 'baz') (not (field text 'bar')))")
     def test_not3(self):
-        self._run_test("foo:bar NOT foo:baz", "(and (field foo 'bar') (not (field foo 'baz')))")
+        self._run_test(u"foo:bar NOT foo:baz", u"(and (field foo 'bar') (not (field foo 'baz')))")
     def test_not4(self):
-        self._run_test("bar AND foo:-baz", "(and (field text 'bar') (not (field text 'baz')))")
+        self._run_test(u"bar AND foo:-baz", u"(and (field text 'bar') (not (field text 'baz')))")
     def test_not5(self):
         '''Stray hyphens at the end should not count as NOTs'''
-        self._run_test("foo:bar -", "(and (field foo 'bar') (field text '-'))")
+        self._run_test(u"foo:bar -", u"(and (field foo 'bar') (field text '-'))")
     def test_not6(self):
         '''Stray hyphens at the end should not NOT, even with spaces'''
-        self._run_test("foo:bar -  ", "(and (field foo 'bar') (field text '-'))")
+        self._run_test(u"foo:bar -  ", u"(and (field foo 'bar') (field text '-'))")
     def test_not7(self):
         '''Duplicate hyphens should be smooshed into one not clause'''
-        self._run_test("test --foo", "(and (field text 'test') (not (field text 'foo')))")
+        self._run_test(u"test --foo", u"(and (field text 'test') (not (field text 'foo')))")
     def test_not8(self):
         '''Duplicate hyphens hanging around in the middle of nowhere'''
-        self._run_test("test -- foo", "(and (field text 'test') (field text '--') (field text 'foo'))")
+        self._run_test(u"test -- foo", u"(and (field text 'test') (field text '--') (field text 'foo'))")
     def test_not9(self):
         '''Duplicate hyphens, spaced out'''
-        self._run_test("test - - foo", "(and (field text 'test') (field text '-') (field text 'foo'))")
+        self._run_test(u"test - - foo", u"(and (field text 'test') (field text '-') (field text 'foo'))")
     
     # quotes
     def test_quote1(self):
-        self._run_test("hello:\"goodbye you're sir\"", "(field hello 'goodbye you\\'re sir')")
+        self._run_test(u"hello:\"goodbye you're sir\"", u"(field hello 'goodbye you\\'re sir')")
     def test_quote2(self):
-        self._run_test("hello:\"goodbye you''re sir\"", "(field hello 'goodbye you\\'\\'re sir')")
+        self._run_test(u"hello:\"goodbye you''re sir\"", u"(field hello 'goodbye you\\'\\'re sir')")
     
     # int fields
     def test_int1(self):
-        self._run_test("count:12", "count:12")
+        self._run_test(u"count:12", u"count:12")
     def test_int2(self):
-        self._run_test("count:foo number:12 foo:bar", "(and number:12 (field foo 'bar'))")
+        self._run_test(u"count:foo number:12 foo:bar", u"(and number:12 (field foo 'bar'))")
     
     # yes/no fields
     def test_yesno1(self):
-        self._run_test("ready:yes active:n", "(and ready:1 active:0)")
+        self._run_test(u"ready:yes active:n", u"(and ready:1 active:0)")
     
     # prefixes
     def test_prefix1(self):
-        self._run_test("foo:bar*", "(field foo 'bar*')")
+        self._run_test(u"foo:bar*", u"(field foo 'bar*')")
     
     # Aliases
     def test_alias1(self):
-        self._run_test("alias1:foo", "(field alias 'foo')")
+        self._run_test(u"alias1:foo", u"(field alias 'foo')")
     def test_alias2(self):
         '''Make sure that referencing the base of the alias still works'''
-        self._run_test("alias:foo", "(field alias 'foo')")
+        self._run_test(u"alias:foo", u"(field alias 'foo')")
     
     # NullQueries
     def test_null1(self):
-        self._run_test('""', '')
+        self._run_test(u'""', u'')
     def test_null2(self):
-        self._run_test('foo:""', '')
+        self._run_test(u'foo:""', u'')
     def test_null3(self):
-        self._run_test('foo:"" bar:baz', "(field bar 'baz')")
+        self._run_test(u'foo:"" bar:baz', u"(field bar 'baz')")
+    
+    # Schema
+    def test_schema1(self):
+        self._run_test(u"foo:bar", u"(field foo 'bar')", self.schema_parser)
+    def test_schema2(self):
+        self._run_test(u"foo:bar notfoo:something", u"(and (field foo 'bar') (field text 'notfoo') (field text 'something'))", self.schema_parser)
+    
+    # Unicode checks
+    def test_unicode1(self):
+        '''Non-unicode ASCII input should raise AssertionError'''
+        self.assertRaises(AssertionError, self._run_test, 'foo:bar', u"(field foo 'bar')")
+    def test_unicode2(self):
+        '''Non-unicode UTF-8 input should raise AssertionError'''
+        self.assertRaises(AssertionError, self._run_test, 'foo:\xe0\xb2\xa0_\xe0\xb2\xa0', u"(field foo '\u0ca0_\u0ca0')")
+    def test_unicode4(self):
+        '''Result of l2cs.convert should be unicode'''
+        result = l2cs.convert(u'foo:bar', self.parser)
+        self.assertIsInstance(result, unicode)
+    def test_unicode3(self):
+        '''Result of l2cs.convert should be unicode, part 2'''
+        result = l2cs.convert(u'foo:\u0ca0_\u0ca0', self.parser)
+        self.assertIsInstance(result, unicode)
     
     ### Test cases from resolved issues ###
     # The remaining test cases protect against issues that have been resolved
     
     # Unsupported "+" syntax gets ignored, AndMaybe clauses are avoided
     def test_plus1(self):
-        self._run_test("learn c++ programming", "(and (field text 'learn') (field text 'c++') (field text 'programming'))")
+        self._run_test(u"learn c++ programming", u"(and (field text 'learn') (field text 'c++') (field text 'programming'))")
     def test_plus2(self):
-        self._run_test("learn c++", "(and (field text 'learn') (field text 'c++'))")
+        self._run_test(u"learn c++", u"(and (field text 'learn') (field text 'c++'))")
     
     def test_minus_in_parentheses(self):
-        self._run_test("text:baz AND url:(-foo AND bar)", "(and (field text 'baz') (not (field url 'foo')) (field url 'bar'))")
+        self._run_test(u"text:baz AND url:(-foo AND bar)", u"(and (field text 'baz') (not (field url 'foo')) (field url 'bar'))")
     
     def test_minus_midword(self):
-        self._run_test("baz:foo-bar", "(field baz 'foo-bar')")
+        self._run_test(u"baz:foo-bar", u"(field baz 'foo-bar')")
 
 
 if __name__ == '__main__':