@@ -1484,3 +1484,176 @@ def test_milvus_native_from_feast_data() -> None:
14841484
14851485 # Clean up the collection
14861486 client .drop_collection (collection_name = COLLECTION_NAME )
1487+
1488+
def test_milvus_keyword_search() -> None:
    """
    Test retrieving documents from the Milvus online store using keyword search.

    The test registers a ``text_documents`` feature view, writes ten short
    documents (random embedding, ``content`` and ``title``) through the
    provider, and then checks three calls to ``retrieve_online_documents_v2``:

    1. a keyword query for ``"Milvus"``,
    2. a keyword query for ``"machine learning"``,
    3. a hybrid query combining a random embedding with the keyword ``"Feast"``.

    Each call must return at least one document whose content contains the
    queried term; the hybrid call must additionally return the ``vector``
    feature.
    """
    # Seed both generators: the embeddings below are drawn from ``np.random``,
    # which ``random.seed`` alone does not affect.
    random.seed(42)
    np.random.seed(42)
    n = 10  # number of samples
    vector_length = 10
    runner = CliRunner()
    with runner.local_repo(
        example_repo_py=get_example_repo("example_rag_feature_repo.py"),
        offline_store="file",
        online_store="milvus",
        apply=False,
        teardown=False,
    ) as store:
        from datetime import timedelta

        from feast import Entity, FeatureView, Field, FileSource
        from feast.types import Array, Float32, Int64, String, UnixTimestamp

        rag_documents_source = FileSource(
            path="data/embedded_documents.parquet",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        item = Entity(
            name="item_id",
            join_keys=["item_id"],
            value_type=ValueType.INT64,
        )
        author = Entity(
            name="author_id",
            join_keys=["author_id"],
            value_type=ValueType.STRING,
        )

        document_embeddings = FeatureView(
            name="text_documents",
            entities=[item, author],
            schema=[
                Field(
                    name="vector",
                    dtype=Array(Float32),
                    vector_index=True,
                    vector_search_metric="COSINE",
                ),
                Field(name="item_id", dtype=Int64),
                Field(name="author_id", dtype=String),
                Field(name="content", dtype=String),
                Field(name="title", dtype=String),
                Field(name="created_timestamp", dtype=UnixTimestamp),
                Field(name="event_timestamp", dtype=UnixTimestamp),
            ],
            source=rag_documents_source,
            ttl=timedelta(hours=24),
        )

        # NOTE(review): ``author`` is referenced by the feature view but is not
        # part of this apply list -- confirm that this is intentional.
        store.apply([rag_documents_source, item, document_embeddings])

        # Write some data with specific text content for keyword search
        document_embeddings_fv = store.get_feature_view(name="text_documents")
        provider = store._get_provider()

        contents = [
            "Feast is an open source feature store for machine learning",
            "Feature stores solve the problem of coordinating features for training and serving",
            "Milvus is a vector database that can be used with Feast",
            "Keyword search uses BM25 algorithm for relevance ranking",
            "Vector search uses embeddings for semantic similarity",
            "Python is a popular programming language for machine learning",
            "Feast supports multiple storage backends for online and offline use cases",
            "Online stores are used for low-latency feature serving",
            "Offline stores are used for batch feature retrieval during training",
            "Feast enables data scientists to define, manage, and share features",
        ]

        titles = [
            "Introduction to Feast",
            "Feature Store Benefits",
            "Using Milvus with Feast",
            "Keyword Search Fundamentals",
            "Vector Search Overview",
            "Python for ML",
            "Feast Storage Options",
            "Online Serving with Feast",
            "Offline Training Support",
            "Feast for Data Scientists",
        ]

        # Guard against the fixtures drifting out of sync with ``n``; a
        # mismatch would otherwise surface as an IndexError or silently
        # dropped rows.
        assert len(contents) == len(titles) == n

        item_keys = [
            EntityKeyProto(
                join_keys=["item_id", "author_id"],
                entity_values=[
                    ValueProto(int64_val=i),
                    ValueProto(string_val=f"author_{i}"),
                ],
            )
            for i in range(n)
        ]

        # One row per document: (entity key, feature values, event timestamp,
        # created timestamp).
        data = [
            (
                item_key,
                {
                    "vector": ValueProto(
                        float_list_val=FloatListProto(
                            val=np.random.random(vector_length)
                        )
                    ),
                    "content": ValueProto(string_val=content),
                    "title": ValueProto(string_val=title),
                },
                _utc_now(),
                _utc_now(),
            )
            for item_key, content, title in zip(item_keys, contents, titles)
        ]

        provider.online_write_batch(
            config=store.config,
            table=document_embeddings_fv,
            data=data,
            progress=None,
        )

        # Test keyword search for "Milvus"
        result_milvus = store.retrieve_online_documents_v2(
            features=[
                "text_documents:content",
                "text_documents:title",
            ],
            query_string="Milvus",
            top_k=3,
        ).to_dict()

        # Verify that documents containing "Milvus" are returned
        assert len(result_milvus["content"]) > 0
        assert any("Milvus" in content for content in result_milvus["content"])

        # Test keyword search for "machine learning"
        result_ml = store.retrieve_online_documents_v2(
            features=[
                "text_documents:content",
                "text_documents:title",
            ],
            query_string="machine learning",
            top_k=3,
        ).to_dict()

        # Verify that documents containing "machine learning" are returned
        assert len(result_ml["content"]) > 0
        assert any(
            "machine learning" in content.lower() for content in result_ml["content"]
        )

        # Test hybrid search (vector + keyword)
        query_embedding = np.random.random(vector_length).tolist()
        result_hybrid = store.retrieve_online_documents_v2(
            features=[
                "text_documents:content",
                "text_documents:title",
                "text_documents:vector",
            ],
            query=query_embedding,
            query_string="Feast",
            top_k=3,
        ).to_dict()

        # Verify hybrid search results
        assert len(result_hybrid["content"]) > 0
        assert any("Feast" in content for content in result_hybrid["content"])
        assert len(result_hybrid["vector"]) > 0
0 commit comments