-
Notifications
You must be signed in to change notification settings - Fork 1.2k
feat: Add non-entity retrieval support for ClickHouse offline store #6066
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import logging | ||
| import threading | ||
| from datetime import datetime, timedelta, timezone | ||
| from unittest.mock import MagicMock, patch | ||
|
|
||
| import pytest | ||
|
|
@@ -133,3 +134,109 @@ def test_clickhouse_config_handles_none_additional_client_args(): | |
| config = ClickhouseConfig(**raw_config) | ||
|
|
||
| assert config.additional_client_args is None | ||
|
|
||
|
|
||
| class TestNonEntityRetrieval: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tests seem over-mocked
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You're right, removed the heavy mocking. For proper coverage, an integration test against a real ClickHouse instance would be more valuable than over-mocked unit tests — the unit tests here may not be necessary at all. |
||
| """Test the non-entity retrieval logic (entity_df=None) for ClickHouse.""" | ||
|
|
||
| _MODULE = "feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse" | ||
|
|
||
| def _call_get_historical_features(self, feature_views, **kwargs): | ||
| """Call get_historical_features with entity_df=None, mocking the pipeline.""" | ||
| from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse import ( | ||
| ClickhouseOfflineStore, | ||
| ClickhouseOfflineStoreConfig, | ||
| ) | ||
| from feast.repo_config import RepoConfig | ||
|
|
||
| config = RepoConfig( | ||
| project="test_project", | ||
| registry="test_registry", | ||
| provider="local", | ||
| offline_store=ClickhouseOfflineStoreConfig( | ||
| type="clickhouse", | ||
| host="localhost", | ||
| port=9000, | ||
| database="test_db", | ||
| user="default", | ||
| password="password", | ||
| ), | ||
| ) | ||
|
|
||
| end = kwargs.get("end_date", datetime(2023, 1, 7, tzinfo=timezone.utc)) | ||
|
|
||
| with ( | ||
| patch.multiple( | ||
| self._MODULE, | ||
| _upload_entity_df=MagicMock(), | ||
| _get_entity_schema=MagicMock( | ||
| return_value={"event_timestamp": "timestamp"} | ||
| ), | ||
| _get_entity_df_event_timestamp_range=MagicMock( | ||
| return_value=(end - timedelta(days=1), end) | ||
| ), | ||
| ), | ||
| patch( | ||
| f"{self._MODULE}.offline_utils.get_expected_join_keys", | ||
| return_value=[], | ||
| ), | ||
| patch( | ||
| f"{self._MODULE}.offline_utils.assert_expected_columns_in_entity_df", | ||
| ), | ||
| patch( | ||
| f"{self._MODULE}.offline_utils.get_feature_view_query_context", | ||
| return_value=[], | ||
| ), | ||
| ): | ||
| refs = [f"{fv.name}:feature1" for fv in feature_views] | ||
| return ClickhouseOfflineStore.get_historical_features( | ||
| config=config, | ||
| feature_views=feature_views, | ||
| feature_refs=refs, | ||
| entity_df=None, | ||
| registry=MagicMock(), | ||
| project="test_project", | ||
| **kwargs, | ||
| ) | ||
|
|
||
| @staticmethod | ||
| def _make_feature_view(name, ttl=None): | ||
| from feast.entity import Entity | ||
| from feast.feature_view import FeatureView, Field | ||
| from feast.infra.offline_stores.contrib.clickhouse_offline_store.clickhouse_source import ( | ||
| ClickhouseSource, | ||
| ) | ||
| from feast.types import Float32 | ||
|
|
||
| return FeatureView( | ||
| name=name, | ||
| entities=[Entity(name="driver_id", join_keys=["driver_id"])], | ||
| ttl=ttl, | ||
| source=ClickhouseSource( | ||
| name=f"{name}_source", | ||
| table=f"{name}_table", | ||
| timestamp_field="event_timestamp", | ||
| ), | ||
| schema=[ | ||
| Field(name="feature1", dtype=Float32), | ||
| ], | ||
| ) | ||
|
|
||
| def test_non_entity_mode_with_end_date(self): | ||
| """entity_df=None with explicit end_date produces a valid RetrievalJob.""" | ||
| from feast.infra.offline_stores.offline_store import RetrievalJob | ||
|
|
||
| fv = self._make_feature_view("test_fv") | ||
| job = self._call_get_historical_features( | ||
| [fv], | ||
| end_date=datetime(2023, 1, 7, tzinfo=timezone.utc), | ||
| ) | ||
| assert isinstance(job, RetrievalJob) | ||
|
|
||
| def test_non_entity_mode_defaults_end_date(self): | ||
| """entity_df=None without end_date defaults to now.""" | ||
| from feast.infra.offline_stores.offline_store import RetrievalJob | ||
|
|
||
| fv = self._make_feature_view("test_fv") | ||
| job = self._call_get_historical_features([fv]) | ||
| assert isinstance(job, RetrievalJob) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 Non-entity retrieval silently ignores the `start_date` kwarg, unlike its Postgres counterpart.

When `entity_df` is None, the ClickHouse `get_historical_features` only reads `end_date` from `kwargs` (clickhouse.py:59) and completely ignores `start_date`. The caller (feature_store.py:1369-1370) passes `start_date` as a kwarg when the user provides it. The Postgres implementation (postgres.py:132-160), which this code is modeled after, uses `start_date` to compute the entity_df timestamp and to bound the TTL-based data scan window. In the ClickHouse version, a user-provided `start_date` is silently dropped, meaning the point-in-time join will use `end_date` as the sole entity timestamp regardless of the user's intent — potentially returning different (and unexpected) feature data compared to the Postgres offline store for the same inputs.

Prompt for agents
Was this helpful? React with 👍 or 👎 to provide feedback.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@YassinNouh21 This seems like a critical issue.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not a bug — this is intentional. The PIT join uses MAX(entity_timestamp) as the upper bound, so the timestamp in the synthetic entity_df IS the query upper bound. Using [end_date] gives the window [end_date - TTL, end_date], which is correct. The Postgres implementation using pd.date_range(start=start_date, ...)[:1] actually has the bug — it takes start_date as the sole timestamp, making end_date unreachable. Our implementation matches Dask and is the correct behavior.
@ntkathole
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ntkathole sounds like there's a bug in postgres???
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes I noticed that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It will get resolved via #6057 cc @Vperiodt