Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/sphinx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,10 @@ jobs:
path: skore/venv
key: ${{ steps.cache-python-venv.outputs.cache-primary-key }}

- name: Install graphviz
  run: |
    # Refresh the package index first: the runner image's cached index can be
    # stale, which makes `apt-get install` fail on packages that have moved.
    sudo apt-get update
    sudo apt-get install -y graphviz

- name: Install all packages from local branch
run: |
python -m pip install --force-reinstall --no-deps skore/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ graphql-core==3.2.8
# graphql-relay
graphql-relay==3.2.0
# via graphene
greenlet==3.3.2
# via sqlalchemy
gunicorn==25.1.0
# via mlflow
h11==0.16.0
Expand Down Expand Up @@ -394,7 +396,7 @@ skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skore-mlflow-project==0.0.2
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
smmap==5.0.3
# via gitdb
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ six==1.17.0
# via python-dateutil
skore-local-project==0.0.5
# via skore (skore/pyproject.toml)
skrub==0.7.2
skrub==0.8.0
# via skore (skore/pyproject.toml)
stack-data==0.6.3
# via ipython
Expand Down
94 changes: 94 additions & 0 deletions examples/use_cases/plot_fraud_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Tracking all the data processing
================================

To track all operations and be able to apply the fitted estimator to unseen
data, we need to include all the data wrangling in the estimator used for our
skore report. In very simple cases this can be done with a scikit-learn
Pipeline. When we have transformations not supported by the Pipeline (such as
transformations that change the number of rows, or that involve multiple tables
such as joins), skore allows us to use a skrub DataOp instead.

In this example we consider a dataset that is simple, but still requires some
data wrangling (encoding, aggregation and joining) which could not be performed
in a regular scikit-learn estimator.

To track those operations, we use a skrub DataOp, which can perform richer
transformations than normal estimators, and also has built-in support from
skore.

The dataset contains a list of online transactions (each corresponding to a cart,
or "basket"), each linked to one or more products for which we have a description.
The task is to predict which transactions involved credit fraud.
"""

# %%
# We start by defining our data-processing pipeline. Note that it contains
# operations, such as aggregating and joining the product information after
# vectorizing the text it contains, that would not be possible in a normal
# estimator.

# %%
import skore
import skrub
from sklearn.ensemble import HistGradientBoostingClassifier

# Fetch the credit-fraud tables. NOTE(review): split="all" presumably returns
# the full, unsplit dataset -- confirm against the skrub fetcher documentation.
dataset = skrub.datasets.fetch_credit_fraud(split="all")

# Wrap both tables as named skrub variables, with the fetched tables attached
# as their values.
products = skrub.var("products", dataset.products)
baskets = skrub.var("baskets", dataset.baskets)

# The basket identifiers are marked as the features (X) and the fraud flag as
# the target (y); everything else is derived from them in the steps below.
basket_ids = baskets[["ID"]].skb.mark_as_X()
fraud_flags = baskets["fraud_flag"].skb.mark_as_y()


def filter_products(products, basket_ids):
    """Keep only the product rows that belong to one of the given baskets.

    Parameters
    ----------
    products : DataFrame
        Product table with a ``"basket_ID"`` column.
    basket_ids : DataFrame
        Basket table with an ``"ID"`` column.

    Returns
    -------
    DataFrame
        The rows of ``products`` whose ``"basket_ID"`` appears in
        ``basket_ids["ID"]``.
    """
    wanted_ids = basket_ids["ID"]
    belongs_to_basket = products["basket_ID"].isin(wanted_ids)
    return products[belongs_to_basket]


# Keep only the products of the selected baskets, then vectorize them with a
# TableVectorizer. "basket_ID" is excluded from the transformation: it is the
# key that join_product_info groups and merges on.
vectorized_products = products.skb.apply_func(filter_products, basket_ids).skb.apply(
skrub.TableVectorizer(), exclude_cols="basket_ID"
)


def join_product_info(basket_ids, vectorized_products):
    """Attach per-basket averages of the vectorized product features.

    Parameters
    ----------
    basket_ids : DataFrame
        Basket table with an ``"ID"`` column.
    vectorized_products : DataFrame
        Product features with a ``"basket_ID"`` column linking each product
        row to its basket.

    Returns
    -------
    DataFrame
        One row per row of ``basket_ids``, in the same order, holding the mean
        of every product feature over the basket's products. Baskets with no
        product row get missing values. The ``"ID"`` and ``"basket_ID"`` key
        columns are dropped.
    """
    product_means = vectorized_products.groupby("basket_ID").agg("mean").reset_index()
    # A left join keeps every basket, so the features stay aligned row-for-row
    # with the target. The default inner join would silently drop any basket
    # that has no product, leaving X and y with different lengths.
    joined = basket_ids.merge(
        product_means,
        how="left",
        left_on="ID",
        right_on="basket_ID",
    )
    return joined.drop(columns=["ID", "basket_ID"])


# Join the aggregated product features onto the baskets, then apply a
# gradient-boosting classifier to the result, with the fraud flags as target.
pred = basket_ids.skb.apply_func(join_product_info, vectorized_products).skb.apply(
HistGradientBoostingClassifier(), y=fraud_flags
)

# This would generate a report with previews of intermediate results & fitted
# estimators:
#
# pred.skb.full_report()

pred

# %%
# Above we see a preview on the whole dataset. Click the "show graph" toggle to
# see a drawing of the pipeline we have built.
#
# Just like a normal estimator, a skrub DataOp can be used with skore reports.
# We can either pass a SkrubLearner together with separate training and testing
# data, or pass our DataOp with the data it already contains and rely on the
# default train/test split:

# %%
report = skore.EstimatorReport(pred, pos_label=1)
report.metrics.roc_auc()

# %%
report.metrics.precision_recall().plot()

# %%
# Note that the preprocessing operations are captured in the skrub DataOp,
# hence in our report -- so we can replay them later on unseen data.

# %%
report.estimator_.data_op
2 changes: 1 addition & 1 deletion skore/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies = [
"rich>=14.2",
"scikit-learn",
"skore[local]",
"skrub",
"skrub>=0.8",
"seaborn",
"jinja2",
]
Expand Down
38 changes: 14 additions & 24 deletions skore/src/skore/_sklearn/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ def _rich_repr(self, class_name: str) -> str:
)
return string_buffer.getvalue()

def _get_X_y(
def _get_data_and_y_true(
self,
*,
data_source: Literal["test", "train"],
) -> tuple[ArrayLike, ArrayLike]:
) -> tuple[dict, ArrayLike]:
"""Get the requested dataset.

Parameters
Expand All @@ -86,41 +86,31 @@ def _get_X_y(

Returns
-------
X : array-like of shape (n_samples, n_features)
data : dict of input data
The requested dataset.

y : array-like of shape (n_samples,)
The requested dataset.
The target labels.
"""
if data_source == "test":
if self._parent._X_test is None or self._parent._y_test is None:
missing_data = "X_test and y_test"
raise ValueError(
f"No {data_source} data (i.e. {missing_data}) were provided "
f"when creating the report. Please provide the {data_source} "
"data when creating the report."
)
return self._parent._X_test, self._parent._y_test
elif data_source == "train":
if self._parent._X_train is None or self._parent._y_train is None:
missing_data = "X_train and y_train"
raise ValueError(
f"No {data_source} data (i.e. {missing_data}) were provided "
f"when creating the report. Please provide the {data_source} "
"data when creating the report."
)
return self._parent._X_train, self._parent._y_train
else:
if data_source not in ["train", "test"]:
raise ValueError(
f"Invalid data source: {data_source}. Possible values are: test, train."
)
if getattr(self._parent, f"{data_source}_data") is None:
raise ValueError(
f"No {data_source} data were provided when creating the report."
)
if data_source == "test":
return self._parent.test_data, self._parent.y_test
assert data_source == "train"
return self._parent.train_data, self._parent.y_train


def _get_cached_response_values(
*,
cache: Cache,
estimator: BaseEstimator,
X: ArrayLike | None,
X: ArrayLike | dict | None,
response_method: str | list[str] | tuple[str, ...],
pos_label: PositiveLabel | None = None,
data_source: Literal["test", "train"] = "test",
Expand Down
12 changes: 6 additions & 6 deletions skore/src/skore/_sklearn/_comparison/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,14 +455,14 @@ def create_estimator_report(

estimator_report = cast(EstimatorReport, self.reports_[report_key])
X_concat = (
pd.concat([estimator_report._X_train, estimator_report._X_test])
if isinstance(estimator_report._X_train, pd.DataFrame)
else np.concatenate([estimator_report._X_train, estimator_report._X_test])
pd.concat([estimator_report.X_train, estimator_report.X_test])
if isinstance(estimator_report.X_train, pd.DataFrame)
else np.concatenate([estimator_report.X_train, estimator_report.X_test])
)
y_concat = (
pd.concat([estimator_report._y_train, estimator_report._y_test])
if isinstance(estimator_report._y_train, (pd.DataFrame, pd.Series))
else np.concatenate([estimator_report._y_train, estimator_report._y_test])
pd.concat([estimator_report.y_train, estimator_report.y_test])
if isinstance(estimator_report.y_train, (pd.DataFrame, pd.Series))
else np.concatenate([estimator_report.y_train, estimator_report.y_test])
)
report = EstimatorReport(
estimator_report.estimator,
Expand Down
Loading
Loading