test: Add keyword and hybrid search tests for Milvus online store

YassinNouh21 · YassinNouh21 · commit f52d5e78c639 · 2025-03-30T22:21:10.000+02:00
Signed-off-by: Yassin Nouh &lt;70436855+YassinNouh21@users.noreply.github.com&gt;
diff --git a/sdk/python/tests/unit/online_store/test_online_retrieval.py b/sdk/python/tests/unit/online_store/test_online_retrieval.py
@@ -1484,3 +1484,176 @@ def test_milvus_native_from_feast_data() -> None:
 
     # Clean up the collection
     client.drop_collection(collection_name=COLLECTION_NAME)
+
+
+def test_milvus_keyword_search() -> None:
+    """
+    Test retrieving documents from the Milvus online store using keyword search.
+    """
+    random.seed(42)
+    n = 10  # number of samples
+    vector_length = 10
+    runner = CliRunner()
+    with runner.local_repo(
+        example_repo_py=get_example_repo("example_rag_feature_repo.py"),
+        offline_store="file",
+        online_store="milvus",
+        apply=False,
+        teardown=False,
+    ) as store:
+        from datetime import timedelta
+    
+        from feast import Entity, FeatureView, Field, FileSource
+        from feast.types import Array, Float32, Int64, String, UnixTimestamp
+    
+        rag_documents_source = FileSource(
+            path="data/embedded_documents.parquet",
+            timestamp_field="event_timestamp",
+            created_timestamp_column="created_timestamp",
+        )
+    
+        item = Entity(
+            name="item_id",
+            join_keys=["item_id"],
+            value_type=ValueType.INT64,
+        )
+        author = Entity(
+            name="author_id",
+            join_keys=["author_id"],
+            value_type=ValueType.STRING,
+        )
+    
+        document_embeddings = FeatureView(
+            name="text_documents",
+            entities=[item, author],
+            schema=[
+                Field(
+                    name="vector",
+                    dtype=Array(Float32),
+                    vector_index=True,
+                    vector_search_metric="COSINE",
+                ),
+                Field(name="item_id", dtype=Int64),
+                Field(name="author_id", dtype=String),
+                Field(name="content", dtype=String),
+                Field(name="title", dtype=String),
+                Field(name="created_timestamp", dtype=UnixTimestamp),
+                Field(name="event_timestamp", dtype=UnixTimestamp),
+            ],
+            source=rag_documents_source,
+            ttl=timedelta(hours=24),
+        )
+    
+        store.apply([rag_documents_source, item, document_embeddings])
+    
+        # Write some data with specific text content for keyword search
+        document_embeddings_fv = store.get_feature_view(name="text_documents")
+        provider = store._get_provider()
+    
+        contents = [
+            "Feast is an open source feature store for machine learning",
+            "Feature stores solve the problem of coordinating features for training and serving",
+            "Milvus is a vector database that can be used with Feast",
+            "Keyword search uses BM25 algorithm for relevance ranking",
+            "Vector search uses embeddings for semantic similarity",
+            "Python is a popular programming language for machine learning",
+            "Feast supports multiple storage backends for online and offline use cases",
+            "Online stores are used for low-latency feature serving",
+            "Offline stores are used for batch feature retrieval during training",
+            "Feast enables data scientists to define, manage, and share features",
+        ]
+    
+        titles = [
+            "Introduction to Feast",
+            "Feature Store Benefits",
+            "Using Milvus with Feast",
+            "Keyword Search Fundamentals",
+            "Vector Search Overview",
+            "Python for ML",
+            "Feast Storage Options",
+            "Online Serving with Feast",
+            "Offline Training Support",
+            "Feast for Data Scientists",
+        ]
+    
+        item_keys = [
+            EntityKeyProto(
+                join_keys=["item_id", "author_id"],
+                entity_values=[
+                    ValueProto(int64_val=i),
+                    ValueProto(string_val=f"author_{i}"),
+                ],
+            )
+            for i in range(n)
+        ]
+        data = []
+        for i, item_key in enumerate(item_keys):
+            data.append(
+                (
+                    item_key,
+                    {
+                        "vector": ValueProto(
+                            float_list_val=FloatListProto(
+                                val=np.random.random(vector_length)
+                            )
+                        ),
+                        "content": ValueProto(string_val=contents[i]),
+                        "title": ValueProto(string_val=titles[i]),
+                    },
+                    _utc_now(),
+                    _utc_now(),
+                )
+            )
+    
+        provider.online_write_batch(
+            config=store.config,
+            table=document_embeddings_fv,
+            data=data,
+            progress=None,
+        )
+    
+        # Test keyword search for "Milvus"
+        result_milvus = store.retrieve_online_documents_v2(
+            features=[
+                "text_documents:content",
+                "text_documents:title",
+            ],
+            query_string="Milvus",
+            top_k=3,
+        ).to_dict()
+        
+        # Verify that documents containing "Milvus" are returned
+        assert len(result_milvus["content"]) > 0
+        assert any("Milvus" in content for content in result_milvus["content"])
+        
+        # Test keyword search for "machine learning"
+        result_ml = store.retrieve_online_documents_v2(
+            features=[
+                "text_documents:content",
+                "text_documents:title",
+            ],
+            query_string="machine learning",
+            top_k=3,
+        ).to_dict()
+        
+        # Verify that documents containing "machine learning" are returned
+        assert len(result_ml["content"]) > 0
+        assert any("machine learning" in content.lower() for content in result_ml["content"])
+        
+        # Test hybrid search (vector + keyword)
+        query_embedding = np.random.random(vector_length).tolist()
+        result_hybrid = store.retrieve_online_documents_v2(
+            features=[
+                "text_documents:content",
+                "text_documents:title",
+                "text_documents:vector",
+            ],
+            query=query_embedding,
+            query_string="Feast",
+            top_k=3,
+        ).to_dict()
+        
+        # Verify hybrid search results
+        assert len(result_hybrid["content"]) > 0
+        assert any("Feast" in content for content in result_hybrid["content"])
+        assert len(result_hybrid["vector"]) > 0