Skip to content

Commit f52d5e7

Browse files
committed
test: Add keyword and hybrid search tests for Milvus online store
Signed-off-by: Yassin Nouh <[email protected]>
1 parent c96e228 commit f52d5e7

File tree

1 file changed

+173
-0
lines changed

1 file changed

+173
-0
lines changed

sdk/python/tests/unit/online_store/test_online_retrieval.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,3 +1484,176 @@ def test_milvus_native_from_feast_data() -> None:
14841484

14851485
# Clean up the collection
14861486
client.drop_collection(collection_name=COLLECTION_NAME)
1487+
1488+
1489+
def test_milvus_keyword_search() -> None:
    """
    Test retrieving documents from the Milvus online store using keyword search.

    The test registers a ``text_documents`` feature view, writes ``n`` rows
    (random embedding plus fixed ``content`` / ``title`` strings) through the
    provider, and then calls ``retrieve_online_documents_v2`` three times:

    * with ``query_string="Milvus"`` only,
    * with ``query_string="machine learning"`` only,
    * with both a query embedding and ``query_string="Feast"`` (hybrid search).

    Each call must return at least one row whose content contains the
    searched term; the hybrid call must also return the ``vector`` feature.
    """
    random.seed(42)
    # The embeddings below are drawn from NumPy, which `random.seed` does not
    # affect. A locally seeded generator makes the fixture reproducible
    # without touching NumPy's global RNG state.
    rng = np.random.default_rng(42)
    n = 10  # number of samples
    vector_length = 10
    runner = CliRunner()
    with runner.local_repo(
        example_repo_py=get_example_repo("example_rag_feature_repo.py"),
        offline_store="file",
        online_store="milvus",
        apply=False,
        teardown=False,
    ) as store:
        from datetime import timedelta

        from feast import Entity, FeatureView, Field, FileSource
        from feast.types import Array, Float32, Int64, String, UnixTimestamp

        rag_documents_source = FileSource(
            path="data/embedded_documents.parquet",
            timestamp_field="event_timestamp",
            created_timestamp_column="created_timestamp",
        )

        item = Entity(
            name="item_id",
            join_keys=["item_id"],
            value_type=ValueType.INT64,
        )
        author = Entity(
            name="author_id",
            join_keys=["author_id"],
            value_type=ValueType.STRING,
        )

        document_embeddings = FeatureView(
            name="text_documents",
            entities=[item, author],
            schema=[
                Field(
                    name="vector",
                    dtype=Array(Float32),
                    vector_index=True,
                    vector_search_metric="COSINE",
                ),
                Field(name="item_id", dtype=Int64),
                Field(name="author_id", dtype=String),
                Field(name="content", dtype=String),
                Field(name="title", dtype=String),
                Field(name="created_timestamp", dtype=UnixTimestamp),
                Field(name="event_timestamp", dtype=UnixTimestamp),
            ],
            source=rag_documents_source,
            ttl=timedelta(hours=24),
        )

        # NOTE(review): `author` is referenced by the feature view but is not
        # passed to apply() here -- confirm this omission is intentional.
        store.apply([rag_documents_source, item, document_embeddings])

        # Write some data with specific text content for keyword search
        document_embeddings_fv = store.get_feature_view(name="text_documents")
        provider = store._get_provider()

        contents = [
            "Feast is an open source feature store for machine learning",
            "Feature stores solve the problem of coordinating features for training and serving",
            "Milvus is a vector database that can be used with Feast",
            "Keyword search uses BM25 algorithm for relevance ranking",
            "Vector search uses embeddings for semantic similarity",
            "Python is a popular programming language for machine learning",
            "Feast supports multiple storage backends for online and offline use cases",
            "Online stores are used for low-latency feature serving",
            "Offline stores are used for batch feature retrieval during training",
            "Feast enables data scientists to define, manage, and share features",
        ]

        titles = [
            "Introduction to Feast",
            "Feature Store Benefits",
            "Using Milvus with Feast",
            "Keyword Search Fundamentals",
            "Vector Search Overview",
            "Python for ML",
            "Feast Storage Options",
            "Online Serving with Feast",
            "Offline Training Support",
            "Feast for Data Scientists",
        ]

        # Composite entity key: one (item_id, author_id) pair per document.
        item_keys = [
            EntityKeyProto(
                join_keys=["item_id", "author_id"],
                entity_values=[
                    ValueProto(int64_val=i),
                    ValueProto(string_val=f"author_{i}"),
                ],
            )
            for i in range(n)
        ]

        # Each row is (entity key, feature values, event ts, created ts).
        data = [
            (
                item_key,
                {
                    "vector": ValueProto(
                        float_list_val=FloatListProto(
                            val=rng.random(vector_length)
                        )
                    ),
                    "content": ValueProto(string_val=contents[i]),
                    "title": ValueProto(string_val=titles[i]),
                },
                _utc_now(),
                _utc_now(),
            )
            for i, item_key in enumerate(item_keys)
        ]

        provider.online_write_batch(
            config=store.config,
            table=document_embeddings_fv,
            data=data,
            progress=None,
        )

        # Test keyword search for "Milvus"
        result_milvus = store.retrieve_online_documents_v2(
            features=[
                "text_documents:content",
                "text_documents:title",
            ],
            query_string="Milvus",
            top_k=3,
        ).to_dict()

        # Verify that documents containing "Milvus" are returned
        assert len(result_milvus["content"]) > 0
        assert any("Milvus" in content for content in result_milvus["content"])

        # Test keyword search for "machine learning"
        result_ml = store.retrieve_online_documents_v2(
            features=[
                "text_documents:content",
                "text_documents:title",
            ],
            query_string="machine learning",
            top_k=3,
        ).to_dict()

        # Verify that documents containing "machine learning" are returned
        assert len(result_ml["content"]) > 0
        assert any(
            "machine learning" in content.lower()
            for content in result_ml["content"]
        )

        # Test hybrid search (vector + keyword)
        query_embedding = rng.random(vector_length).tolist()
        result_hybrid = store.retrieve_online_documents_v2(
            features=[
                "text_documents:content",
                "text_documents:title",
                "text_documents:vector",
            ],
            query=query_embedding,
            query_string="Feast",
            top_k=3,
        ).to_dict()

        # Verify hybrid search results
        assert len(result_hybrid["content"]) > 0
        assert any("Feast" in content for content in result_hybrid["content"])
        assert len(result_hybrid["vector"]) > 0

0 commit comments

Comments
 (0)