Skip to content

Commit 608d8dd

Browse files
vanitabhagwat, vbhagwatEXPE, and bdodla
authored
feat: Add centralized rate limiter (#305)
* add centralized rate limiter * fixed linting errors * fixed formatting * fixed formatting * fix comments * fix formatting * Added comments * Address review comments * updated based on the review comments * removed accidentally added file * removed accidentally added file * fixed formating * fix type error * fix formatting * Updated the logs * Update sdk/python/feast/rate_limiter.py Co-authored-by: Bhargav Dodla <[email protected]> * Address review comments * fix linitng * Improve error logging in integration test utils * Clean up blank lines in go_integration_test_utils.go Removed unnecessary blank lines in the integration test utility functions. * Restore copyright notice and reformat imports * Restore copyright notice and imports in cli.py * Restore copyright notice and imports in cli.py * Restore copyright notice and imports in cli.py * Add back __init__.py file from master Co-Authored-By: Claude Opus 4.6 <[email protected]> * Added some integration tests * No op commit * No op commit --------- Co-authored-by: vbhagwat <[email protected]> Co-authored-by: Bhargav Dodla <[email protected]>
1 parent f2cd866 commit 608d8dd

6 files changed

Lines changed: 1037 additions & 43 deletions

File tree

sdk/python/feast/infra/online_stores/cassandra_online_store/cassandra_online_store.py

Lines changed: 5 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,6 @@
4848
from feast.protos.feast.core.SortedFeatureView_pb2 import SortOrder
4949
from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto
5050
from feast.protos.feast.types.Value_pb2 import Value as ValueProto
51-
from feast.rate_limiter import SlidingWindowRateLimiter
5251
from feast.repo_config import FeastConfigBaseModel
5352
from feast.sorted_feature_view import SortedFeatureView
5453
from feast.types import (
@@ -418,9 +417,7 @@ def on_failure(exc, concurrent_queue):
418417
ttl_feature_view = table.ttl or timedelta(seconds=0)
419418
ttl_online_store_config = online_store_config.key_ttl_seconds or 0
420419
write_concurrency = online_store_config.write_concurrency
421-
write_rate_limit = online_store_config.write_rate_limit
422420
concurrent_queue: Queue = Queue(maxsize=write_concurrency)
423-
rate_limiter = SlidingWindowRateLimiter(write_rate_limit, 1)
424421
feast_array_types = [
425422
"bytes_list_val",
426423
"string_list_val",
@@ -540,7 +537,6 @@ def on_failure(exc, concurrent_queue):
540537
and 0 < online_store_config.write_batch_size <= batch_count
541538
):
542539
CassandraOnlineStore._apply_batch(
543-
rate_limiter,
544540
batch,
545541
progress,
546542
session,
@@ -553,7 +549,6 @@ def on_failure(exc, concurrent_queue):
553549

554550
if batch_count > 0:
555551
CassandraOnlineStore._apply_batch(
556-
rate_limiter,
557552
batch,
558553
progress,
559554
session,
@@ -592,7 +587,6 @@ def on_failure(exc, concurrent_queue):
592587
and 0 < online_store_config.write_batch_size <= batch_count
593588
):
594589
CassandraOnlineStore._apply_batch(
595-
rate_limiter,
596590
batch,
597591
progress,
598592
session,
@@ -605,7 +599,6 @@ def on_failure(exc, concurrent_queue):
605599

606600
if batch_count > 0:
607601
CassandraOnlineStore._apply_batch(
608-
rate_limiter,
609602
batch,
610603
progress,
611604
session,
@@ -952,9 +945,11 @@ def _build_sorted_table_cql(
952945
"""
953946
sort_key_names = [sk.name for sk in table.sort_keys]
954947
feature_columns = ", ".join(
955-
f"{feature.name} {self._get_cql_type(feature.dtype)}"
956-
if feature.name in sort_key_names
957-
else f"{feature.name} BLOB"
948+
(
949+
f"{feature.name} {self._get_cql_type(feature.dtype)}"
950+
if feature.name in sort_key_names
951+
else f"{feature.name} BLOB"
952+
)
958953
for feature in table.features
959954
)
960955

@@ -1023,19 +1018,13 @@ def _get_cql_statement(
10231018

10241019
@staticmethod
10251020
def _apply_batch(
1026-
rate_limiter: SlidingWindowRateLimiter,
10271021
batch: BatchStatement,
10281022
progress: Optional[Callable[[int], Any]],
10291023
session: Session,
10301024
concurrent_queue: Queue,
10311025
on_success,
10321026
on_failure,
10331027
):
1034-
# Wait until the rate limiter allows
1035-
if not rate_limiter.acquire():
1036-
while not rate_limiter.acquire():
1037-
time.sleep(0.001)
1038-
10391028
future = session.execute_async(batch)
10401029
concurrent_queue.put(future)
10411030
future.add_callbacks(

sdk/python/feast/infra/passthrough_provider.py

Lines changed: 98 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto
4747
from feast.protos.feast.types.Value_pb2 import RepeatedValue
4848
from feast.protos.feast.types.Value_pb2 import Value as ValueProto
49+
from feast.rate_limiter import TokenBucketRateLimiter # provider-level write limiter
4950
from feast.repo_config import BATCH_ENGINE_CLASS_FOR_TYPE, RepoConfig
5051
from feast.saved_dataset import SavedDataset
5152
from feast.sorted_feature_view import SortedFeatureView
@@ -62,15 +63,14 @@
6263

6364

6465
class PassthroughProvider(Provider):
65-
"""
66-
The passthrough provider delegates all operations to the underlying online and offline stores.
67-
"""
66+
"""The passthrough provider delegates all operations to the underlying online and offline stores."""
6867

6968
def __init__(self, config: RepoConfig):
7069
self.repo_config = config
7170
self._offline_store = None
7271
self._online_store = None
7372
self._batch_engine: Optional[ComputeEngine] = None
73+
self._write_token_limiters: Dict[str, TokenBucketRateLimiter] = {}
7474

7575
@property
7676
def online_store(self):
@@ -199,8 +199,66 @@ def online_write_batch(
199199
],
200200
progress: Optional[Callable[[int], Any]],
201201
) -> None:
202-
if self.online_store:
203-
self.online_store.online_write_batch(config, table, data, progress)
202+
"""
203+
Write data to the online store in rate-limited batches.
204+
Uses TokenBucketRateLimiter to throttle writes.
205+
"""
206+
207+
# Resolve configured rate limit
208+
rate_limit = self._resolve_write_rate_limit(config, table)
209+
fv_name = getattr(table, "name", "global") if table is not None else "global"
210+
limiter_key = f"{config.project}:{fv_name}"
211+
212+
# If rate limit is 0 or unset, bypass limiter
213+
if rate_limit <= 0:
214+
if self.online_store:
215+
self.online_store.online_write_batch(config, table, data, progress)
216+
return
217+
218+
# Create or reuse per-feature-view limiter
219+
# Calculate percent_usage based on available CPU cores
220+
# More processes = lower percent_usage to reduce token contention
221+
num_spark_driver_cores = int(os.environ.get("SPARK_DRIVER_CORES", 1))
222+
223+
if num_spark_driver_cores > 2:
224+
num_processes = num_spark_driver_cores - 1
225+
# Decrease percent_usage as processes increase to allow fair sharing
226+
# 2 processes -> 0.50, 4 processes -> 0.40, 8 processes -> 0.30
227+
percent_usage = max(0.6 / (num_processes / 2), 0.25)
228+
else:
229+
# Single process - can use more tokens per batch
230+
percent_usage = 0.9
231+
232+
interval = 1.0 # seconds
233+
234+
limiter = self._write_token_limiters.get(limiter_key)
235+
if limiter is None or limiter.rate != rate_limit:
236+
limiter = TokenBucketRateLimiter(
237+
rate=rate_limit, interval=interval, percent_usage=percent_usage
238+
)
239+
self._write_token_limiters[limiter_key] = limiter
240+
logger.info(
241+
f"[Limiter] Initialized rate limiter for {limiter_key} at {rate_limit} writes/sec"
242+
)
243+
244+
# Process data in dynamically sized batches based on token availability
245+
total_records = len(data)
246+
index = 0
247+
248+
while index < total_records:
249+
available = limiter.get_available_tokens()
250+
# Ensure we always make progress (at least 1 record)
251+
batch_size = min(max(available, 1), total_records - index)
252+
253+
batch = data[index : index + batch_size]
254+
limiter.wait_for_tokens(len(batch)) # blocks until tokens available
255+
256+
if self.online_store:
257+
self.online_store.online_write_batch(config, table, batch, progress)
258+
259+
index += batch_size
260+
if progress:
261+
progress(batch_size)
204262

205263
async def online_write_batch_async(
206264
self,
@@ -216,6 +274,41 @@ async def online_write_batch_async(
216274
config, table, data, progress
217275
)
218276

277+
def _resolve_write_rate_limit(
    self,
    config: RepoConfig,
    table: Union[FeatureView, BaseFeatureView, OnDemandFeatureView],
) -> int:
    """Resolve the effective write rate limit (writes/sec) for a feature view.

    Precedence:
        1. Feature view tag ``write_rate_limit``.
        2. ``config.online_store.write_rate_limit``.
        3. Fallback ``0`` (no rate limiting).

    Returns:
        The resolved limit as an int; 0 disables rate limiting.
    """
    # 1) Feature view tag override.
    if table is not None and hasattr(table, "tags") and table.tags:
        tag_val = table.tags.get("write_rate_limit")
        if tag_val is not None:
            try:
                return int(tag_val)
            except (TypeError, ValueError):
                # Malformed tag value; fall through to store-level config.
                logger.warning(
                    "Invalid write_rate_limit on feature view %s: %s; falling back",
                    getattr(table, "name", "<unknown>"),
                    tag_val,
                )

    # 2) Project / online store level config. getattr with a default
    # replaces the redundant hasattr + getattr pair; AttributeError is
    # kept for an absent/odd online_store object.
    try:
        if config.online_store:
            return int(getattr(config.online_store, "write_rate_limit", 0) or 0)
    except (TypeError, ValueError, AttributeError):
        logger.warning(
            "Invalid write_rate_limit on online_store config; falling back to 0"
        )

    # 3) Fallback to 0 (no rate limit).
    return 0
311+
219312
def offline_write_batch(
220313
self,
221314
config: RepoConfig,
@@ -407,7 +500,6 @@ def ingest_df(
407500

408501
# Input table is split into smaller chunks and processed in parallel
409502
chunks = self.split_table(num_processes, table)
410-
411503
chunks_to_parallelize = [
412504
(chunk, feature_view, join_keys) for chunk in chunks
413505
]
@@ -465,7 +557,6 @@ def ingest_df_to_offline_store(self, feature_view: FeatureView, table: pa.Table)
465557
table = _run_pyarrow_field_mapping(
466558
table, feature_view.batch_source.field_mapping
467559
)
468-
469560
self.offline_write_batch(self.repo_config, feature_view, table, None)
470561

471562
def materialize_single_feature_view(
@@ -544,7 +635,6 @@ def get_historical_features(
544635
full_feature_names=full_feature_names,
545636
**kwargs,
546637
)
547-
548638
return job
549639

550640
def retrieve_saved_dataset(
@@ -554,10 +644,8 @@ def retrieve_saved_dataset(
554644
ref.replace(":", "__") if dataset.full_feature_names else ref.split(":")[1]
555645
for ref in dataset.features
556646
]
557-
558647
# ToDo: replace hardcoded value
559648
event_ts_column = "event_timestamp"
560-
561649
return self.offline_store.pull_all_from_table_or_query(
562650
config=config,
563651
data_source=dataset.storage.to_data_source(),
@@ -578,7 +666,6 @@ def write_feature_service_logs(
578666
assert feature_service.logging_config is not None, (
579667
"Logging should be configured for the feature service before calling this function"
580668
)
581-
582669
self.offline_store.write_logged_features(
583670
config=config,
584671
data=logs,
@@ -598,7 +685,6 @@ def retrieve_feature_service_logs(
598685
assert feature_service.logging_config is not None, (
599686
"Logging should be configured for the feature service before calling this function"
600687
)
601-
602688
logging_source = FeatureServiceLoggingSource(feature_service, config.project)
603689
schema = logging_source.get_schema(registry)
604690
logging_config = feature_service.logging_config

sdk/python/feast/rate_limiter.py

Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,70 @@
1+
import math
import threading
import time
from typing import Optional


class TokenBucketRateLimiter:
    """Thread-safe token-bucket rate limiter.

    Tokens are replenished continuously at ``rate`` tokens per ``interval``
    seconds, up to a burst capacity of ``rate`` tokens. Only a fraction
    (``percent_usage``) of the bucket is exposed to callers, leaving headroom
    when several processes share the same downstream write quota.
    """

    def __init__(self, rate: float, interval: float = 1.0, percent_usage: float = 0.6):
        """
        Args:
            rate: Maximum tokens added per interval (writes per interval).
            interval: Refill interval in seconds.
            percent_usage: Fraction of available tokens allowed for writing.
        """
        self.rate = float(rate)
        self.interval = float(interval)
        # Burst capacity equals one interval's worth of tokens.
        self.max_tokens = float(rate)
        # Start with a full bucket so the first batch is not throttled.
        self.tokens = float(rate)
        self.last_refill = time.monotonic()
        self.lock = threading.Lock()
        self.cond = threading.Condition(self.lock)
        self.percent_usage = float(percent_usage)

    def _refill(self) -> None:
        """Add tokens for the time elapsed since the last refill.

        Must be called with ``self.lock`` held.
        """
        now = time.monotonic()
        elapsed = now - self.last_refill
        if elapsed <= 0:
            return

        added = (self.rate * elapsed) / self.interval
        if added > 0:
            self.tokens = min(self.max_tokens, self.tokens + added)
            self.last_refill = now

    def get_available_tokens(self) -> int:
        """Return the number of tokens currently usable for writing.

        Applies ``percent_usage`` and never returns a negative count, even
        if a prior oversized request drove the balance below zero.
        """
        with self.lock:
            self._refill()
            return max(0, math.floor(self.tokens * self.percent_usage))

    def wait_for_tokens(self, num: int, timeout: Optional[float] = None) -> bool:
        """Block until ``num`` tokens are available, then consume them.

        Args:
            num: Tokens to consume; values <= 0 succeed immediately.
            timeout: Maximum seconds to wait; ``None`` waits indefinitely.

        Returns:
            True once the tokens were consumed, False on timeout.
        """
        if num <= 0:
            return True

        end_time = None if timeout is None else (time.monotonic() + timeout)
        with self.cond:
            # Bug fix: a request larger than the usable capacity
            # (max_tokens * percent_usage) could previously never be
            # satisfied, so the caller blocked forever (or always timed
            # out). Clamp the wait threshold to the usable capacity so
            # callers always make progress; consuming the full `num`
            # below may drive the balance negative, which simply delays
            # later requests and preserves the long-run average rate.
            required = min(num, self.max_tokens * self.percent_usage)
            while True:
                self._refill()
                if self.tokens * self.percent_usage >= required:
                    # Consume atomically while holding the lock.
                    self.tokens -= num
                    self.cond.notify_all()
                    return True

                if end_time is not None:
                    remaining = end_time - time.monotonic()
                    if remaining <= 0:
                        return False
                    wait_time = min(0.05, remaining)
                else:
                    wait_time = 0.05
                # Sleep briefly (releases the lock) before re-checking.
                self.cond.wait(wait_time)

0 commit comments

Comments
 (0)