diff --git a/bfabric_app_runner/src/bfabric_app_runner/cli/validate.py b/bfabric_app_runner/src/bfabric_app_runner/cli/validate.py index 5a6645b59..88b8bd74b 100644 --- a/bfabric_app_runner/src/bfabric_app_runner/cli/validate.py +++ b/bfabric_app_runner/src/bfabric_app_runner/cli/validate.py @@ -28,5 +28,5 @@ def cmd_validate_inputs_spec(yaml_file: Path) -> None: def cmd_validate_outputs_spec(yaml_file: Path) -> None: """Validate an outputs spec file.""" - outputs_spec = OutputsSpec.model_validate(yaml.safe_load(yaml_file.read_text())) + outputs_spec = OutputsSpec.read_yaml(yaml_file) pprint(outputs_spec) diff --git a/bfabric_app_runner/src/bfabric_app_runner/output_registration/annotation_table.py b/bfabric_app_runner/src/bfabric_app_runner/output_registration/annotation_table.py new file mode 100644 index 000000000..98797a872 --- /dev/null +++ b/bfabric_app_runner/src/bfabric_app_runner/output_registration/annotation_table.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, assert_never + +import polars as pl + +if TYPE_CHECKING: + from pathlib import Path + + from bfabric_app_runner.specs.outputs.annotations import BfabricOutputDataset, IncludeDatasetRef, IncludeResourceRef + + +def _validate_table_schema(df: pl.DataFrame) -> None: + """Validate that required columns exist and have correct types.""" + if "Resource" not in df.columns: + raise ValueError("Missing required column: Resource") + + if not df["Resource"].dtype.is_integer(): + raise ValueError(f"Column 'Resource' must be integer type, got {df['Resource'].dtype}") + + if df["Resource"].null_count() > 0: + raise ValueError("Column 'Resource' cannot contain null values") + + if "Anchor" in df.columns and df["Anchor"].dtype not in (pl.Null, pl.String): + raise ValueError(f"Column 'Anchor' must be String type, got {df['Anchor'].dtype}") + + +def _load_table(ref: IncludeDatasetRef) -> pl.DataFrame: + format = ref.get_format() + # TODO maybe this should be a generic function somewhere, it's duplicated with input specs probably! + match format: + case "csv": + df = pl.read_csv(ref.local_path) + case "tsv": + df = pl.read_csv(ref.local_path, separator="\t") + case "parquet": + df = pl.read_parquet(ref.local_path) + case _: + assert_never(format) + _validate_table_schema(df) + return df + + +def _get_resource_row(resource: IncludeResourceRef, resource_mapping: dict[Path, int]) -> dict[str, int | str | None]: + # TODO check if accessing store_entry_path like this is robust enough (see comment below) + resource_id = resource_mapping[resource.store_entry_path] + return {"Resource": resource_id, "Anchor": resource.anchor, **resource.metadata} + + +# TODO the resource mapping has some challenges if it's not a model regarding mis-use non-canonical paths, +# e.g. they should never start with `/` but this could also be validated in the originating model + + +def generate_output_table(config: BfabricOutputDataset, resource_mapping: dict[Path, int]) -> pl.DataFrame: + """Generates the output table contents for the specified output dataset configuration. + + :param config: the output dataset configuration + :param resource_mapping: a mapping of store_entry_path to resource_id of the resources which were created + :return: the output table contents + """ + parts: list[pl.DataFrame] = [_load_table(ref) for ref in config.include_tables] + if config.include_resources: + resource_rows = [_get_resource_row(r, resource_mapping) for r in config.include_resources] + resources_df = pl.from_dicts(resource_rows, strict=False) + _validate_table_schema(resources_df) + parts.append(resources_df) + if not parts: + raise ValueError("BfabricOutputDataset requires at least one include_tables or include_resources entry") + return pl.concat(parts, how="diagonal_relaxed") diff --git a/bfabric_app_runner/src/bfabric_app_runner/output_registration/register.py b/bfabric_app_runner/src/bfabric_app_runner/output_registration/register.py index 7f406aa16..d2b1773ec 100644 --- a/bfabric_app_runner/src/bfabric_app_runner/output_registration/register.py +++ b/bfabric_app_runner/src/bfabric_app_runner/output_registration/register.py @@ -1,7 +1,8 @@ from __future__ import annotations import hashlib -from typing import TYPE_CHECKING +from pathlib import Path # noqa: TC003 # used at runtime by RegisterAllOutputs pydantic model +from typing import TYPE_CHECKING, assert_never import polars as pl import yaml @@ -9,7 +10,10 @@ from bfabric.operations.dataset import CreateDatasetParams, create_dataset from bfabric.utils.table_lint import check_for_invalid_characters from loguru import logger +from pydantic import BaseModel +from bfabric_app_runner.output_registration.annotation_table import generate_output_table +from bfabric_app_runner.specs.outputs.annotations import AnnotationType, BfabricOutputDataset from bfabric_app_runner.specs.outputs_spec import ( CopyResourceSpec, OutputsSpec, @@ -21,8 +25,6 @@ from bfabric_app_runner.util.scp import scp if TYPE_CHECKING: - from pathlib import Path - from bfabric import Bfabric from bfabric.experimental.workunit_definition import WorkunitDefinition @@ -39,8 +41,8 @@ def register_file_in_workunit( client: Bfabric, workunit_definition: WorkunitDefinition, resource_id: int | None = None, -) -> None: - """Registers a file in the workunit.""" +) -> int: + """Registers a file in the workunit and returns the resource ID.""" existing_id = _identify_existing_resource_id(client, spec, workunit_definition) if resource_id is not None and existing_id is not None and resource_id != existing_id: raise ValueError(f"Resource id {resource_id} does not match existing resource id {existing_id}") @@ -62,7 +64,8 @@ def register_file_in_workunit( if existing_id is not None: resource_data["id"] = existing_id - _ = client.save("resource", resource_data) + result = client.save("resource", resource_data) + return result[0]["id"] def _identify_existing_resource_id( @@ -172,6 +175,11 @@ def find_default_resource_id(workunit_definition: WorkunitDefinition, client: Bf return None +class RegisterAllOutputs(BaseModel): + resources_mapping: dict[Path, int] = {} + """This maps the store_entry_path to the ids of the resources which were created or updated""" + + def register_all( client: Bfabric, workunit_definition: WorkunitDefinition, @@ -179,13 +187,15 @@ def register_all( ssh_user: str | None, reuse_default_resource: bool, force_storage: Path | None, -) -> None: +) -> RegisterAllOutputs: """Registers all the output specs to the workunit.""" default_resource_was_reused = not reuse_default_resource storage = _get_storage(client, force_storage, specs_list, workunit_definition) logger.info(f"Using storage: {storage}") + outputs = RegisterAllOutputs() + for spec in specs_list: logger.debug(f"Registering {spec}") if isinstance(spec, CopyResourceSpec): @@ -200,7 +210,8 @@ def register_all( default_resource_was_reused = True else: resource_id = None - register_file_in_workunit( + + outputs.resources_mapping[spec.store_entry_path] = register_file_in_workunit( spec, client=client, workunit_definition=workunit_definition, @@ -211,7 +222,9 @@ def register_all( elif isinstance(spec, SaveLinkSpec): _save_link(spec, client, workunit_definition=workunit_definition) else: - raise ValueError(f"Unknown spec type: {type(spec)}") + assert_never(type(spec)) + + return outputs def _get_storage( @@ -227,6 +240,36 @@ def _get_storage( return None +def _save_annotations( + outputs: RegisterAllOutputs, + annotations: list[AnnotationType], + client: Bfabric, + workunit_definition: WorkunitDefinition, +) -> None: + # Create-only for now: if an output dataset already exists on the workunit, `create_dataset` will + # surface the SOAP error. Updating an existing dataset is a follow-up (no Workunit.output_dataset + # field today, so locating the prior dataset requires its own design). + registration = workunit_definition.registration + if registration is None: + raise ValueError("workunit_definition has no registration; cannot save annotations") + for annotation in annotations: + if isinstance(annotation, BfabricOutputDataset): + output_df = generate_output_table(config=annotation, resource_mapping=outputs.resources_mapping) + check_for_invalid_characters(table=output_df, invalid_characters="") + dataset = create_dataset( + client, + output_df, + CreateDatasetParams( + name=annotation.name or f"Output Dataset (workunit {registration.workunit_id})", + container_id=registration.container_id, + workunit_id=registration.workunit_id, + ), + ) + logger.info(f"Output dataset saved with id {dataset.id} for workunit {registration.workunit_id}") + else: + assert_never(type(annotation)) + + def register_outputs( outputs_yaml: Path, workunit_definition: WorkunitDefinition, @@ -236,12 +279,13 @@ def register_outputs( force_storage: Path | None, ) -> None: """Registers outputs to the workunit.""" - specs_list = OutputsSpec.read_yaml(outputs_yaml) - register_all( + spec = OutputsSpec.read_yaml(outputs_yaml) + outputs = register_all( client=client, workunit_definition=workunit_definition, - specs_list=specs_list, + specs_list=spec.outputs, ssh_user=ssh_user, reuse_default_resource=reuse_default_resource, force_storage=force_storage, ) + _save_annotations(outputs, spec.annotations, client=client, workunit_definition=workunit_definition) diff --git a/bfabric_app_runner/src/bfabric_app_runner/specs/inputs/static_file_spec.py b/bfabric_app_runner/src/bfabric_app_runner/specs/inputs/static_file_spec.py index aea8c9fc5..fbeba2903 100644 --- a/bfabric_app_runner/src/bfabric_app_runner/specs/inputs/static_file_spec.py +++ b/bfabric_app_runner/src/bfabric_app_runner/specs/inputs/static_file_spec.py @@ -11,6 +11,7 @@ class StaticFileSpec(BaseModel): type: Literal["static_file"] = "static_file" content: str | bytes + """The text or binary content to write.""" filename: str """The target filename to write to.""" diff --git a/bfabric_app_runner/src/bfabric_app_runner/specs/outputs/__init__.py b/bfabric_app_runner/src/bfabric_app_runner/specs/outputs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/bfabric_app_runner/src/bfabric_app_runner/specs/outputs/annotations.py b/bfabric_app_runner/src/bfabric_app_runner/specs/outputs/annotations.py new file mode 100644 index 000000000..48b0383dd --- /dev/null +++ b/bfabric_app_runner/src/bfabric_app_runner/specs/outputs/annotations.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import typing +from pathlib import Path +from typing import ClassVar, Literal + +from pydantic import BaseModel, model_validator + + +class IncludeDatasetRef(BaseModel): + Formats: ClassVar = Literal["csv", "tsv", "parquet"] + + local_path: Path + format: Formats | None = None + + def get_format(self) -> Formats: + """Returns the format inferring the type from the filename if not specified explicitly.""" + if self.format is not None: + return self.format + return self.local_path.suffix.removeprefix(".") + + @model_validator(mode="after") + def _check_format_or_correct_suffix(self) -> IncludeDatasetRef: + allowed_formats = typing.get_args(self.Formats) + if self.format is None and self.local_path.suffix.removeprefix(".") not in allowed_formats: + msg = f"When format is not specified, the file extension must be one of {allowed_formats}" + raise ValueError(msg) + return self + + +class IncludeResourceRef(BaseModel): + store_entry_path: Path + anchor: str = "" + metadata: dict[str, str] = {} + + +class BfabricOutputDataset(BaseModel): + # Single discriminator value for now; when a second annotation type lands, add a default and migrate consumers. + type: Literal["bfabric_output_dataset"] + name: str | None = None + include_tables: list[IncludeDatasetRef] = [] + include_resources: list[IncludeResourceRef] = [] + + +AnnotationType = BfabricOutputDataset diff --git a/bfabric_app_runner/src/bfabric_app_runner/specs/outputs_spec.py b/bfabric_app_runner/src/bfabric_app_runner/specs/outputs_spec.py index 7ab7d07a6..5afc6da1f 100644 --- a/bfabric_app_runner/src/bfabric_app_runner/specs/outputs_spec.py +++ b/bfabric_app_runner/src/bfabric_app_runner/specs/outputs_spec.py @@ -1,12 +1,15 @@ from __future__ import annotations import enum +from collections import Counter from pathlib import Path # noqa: TCH003 -from typing import Literal, Annotated +from typing import Annotated, Literal import yaml from pydantic import BaseModel, ConfigDict, Field, model_validator +from bfabric_app_runner.specs.outputs.annotations import AnnotationType + class UpdateExisting(enum.Enum): NO = "no" @@ -45,6 +48,7 @@ class SaveDatasetSpec(BaseModel): invalid_characters: str = "" +# TODO deprecate! class SaveLinkSpec(BaseModel): """Saves a link to the workunit, or, if desired to an arbitrary entity of type entity_type with id entity_id.""" @@ -74,13 +78,27 @@ def require_entity_id_if_not_workunit(self) -> SaveLinkSpec: class OutputsSpec(BaseModel): model_config = ConfigDict(extra="forbid") outputs: list[SpecType] + annotations: list[AnnotationType] = [] @classmethod - def read_yaml(cls, path: Path) -> list[SpecType]: - model = cls.model_validate(yaml.safe_load(path.read_text())) - return model.outputs + def read_yaml(cls, path: Path) -> OutputsSpec: + return cls.model_validate(yaml.safe_load(path.read_text())) @classmethod def write_yaml(cls, specs: list[SpecType], path: Path) -> None: model = cls.model_validate(dict(outputs=specs)) path.write_text(yaml.dump(model.model_dump(mode="json"))) + + @model_validator(mode="after") + def _no_duplicate_store_entry_paths(self) -> OutputsSpec: + # arguably, this is a bit strict and might be relaxed in the future + # TODO although the main scenario where this could be a problem is when store_folder_path is used and maybe we + # should handle this differently? + resource_specs = list(filter(lambda x: isinstance(x, CopyResourceSpec), self.outputs)) + store_entry_paths = [spec.store_entry_path for spec in resource_specs] + # check for duplicates + duplicates = [path for path, count in Counter(store_entry_paths).items() if count > 1] + if duplicates: + msg = f"Duplicate store entry paths: {duplicates}" + raise ValueError(msg) + return self diff --git a/tests/bfabric_app_runner/output_registration/test_annotation_table.py b/tests/bfabric_app_runner/output_registration/test_annotation_table.py new file mode 100644 index 000000000..c4a3a227d --- /dev/null +++ b/tests/bfabric_app_runner/output_registration/test_annotation_table.py @@ -0,0 +1,96 @@ +from unittest.mock import MagicMock + +import pytest +import polars as pl +import polars.testing +from pathlib import Path + +from bfabric.results.result_container import ResultContainer +from bfabric_app_runner.output_registration.annotation_table import generate_output_table +from bfabric_app_runner.output_registration.register import RegisterAllOutputs, _save_annotations +from bfabric_app_runner.specs.outputs.annotations import BfabricOutputDataset, IncludeDatasetRef, IncludeResourceRef + + +class TestGenerateOutputTable: + @pytest.fixture + def input_table_path(self, tmp_path): + table = pl.DataFrame( + {"Resource": [100, 200], "Anchor": [None, "#example"], "My custom column": ["value1", "value2"]} + ) + path = tmp_path / "input" / "table.parquet" + path.parent.mkdir() + table.write_parquet(path) + return path + + @pytest.fixture() + def spec(self, input_table_path): + return BfabricOutputDataset( + type="bfabric_output_dataset", + include_tables=[IncludeDatasetRef(local_path=input_table_path)], + include_resources=[IncludeResourceRef(store_entry_path="custom.txt", metadata={"prop": "value"})], + ) + + @pytest.fixture() + def resource_mapping(self) -> dict[Path, int]: + return {Path("custom.txt"): 300} + + def test_generate_output_table(self, spec, resource_mapping): + table = generate_output_table(spec, resource_mapping) + assert table.columns == ["Resource", "Anchor", "My custom column", "prop"] + polars.testing.assert_frame_equal( + table, + pl.DataFrame( + { + "Resource": [100, 200, 300], + "Anchor": [None, "#example", ""], + "My custom column": ["value1", "value2", None], + "prop": [None, None, "value"], + } + ), + ) + + +class TestSaveAnnotations: + @pytest.fixture() + def mock_client(self) -> MagicMock: + client = MagicMock() + client.config.base_url = "https://example.bfabric" + client.save.return_value = ResultContainer(results=[{"id": 999}], total_pages_api=None, errors=[]) + return client + + @pytest.fixture() + def mock_workunit_definition(self) -> MagicMock: + mock = MagicMock() + mock.registration.workunit_id = 42 + mock.registration.container_id = 7 + return mock + + @pytest.fixture() + def outputs(self) -> RegisterAllOutputs: + return RegisterAllOutputs(resources_mapping={Path("custom.txt"): 300}) + + def test_creates_dataset_with_default_name(self, mock_client, mock_workunit_definition, outputs): + annotation = BfabricOutputDataset( + type="bfabric_output_dataset", + include_resources=[IncludeResourceRef(store_entry_path="custom.txt", metadata={"prop": "value"})], + ) + _save_annotations(outputs, [annotation], client=mock_client, workunit_definition=mock_workunit_definition) + mock_client.save.assert_called_once() + endpoint, payload = mock_client.save.call_args[0] + assert endpoint == "dataset" + assert payload["name"] == "Output Dataset (workunit 42)" + assert payload["containerid"] == 7 + assert payload["workunitid"] == 42 + + def test_uses_explicit_name(self, mock_client, mock_workunit_definition, outputs): + annotation = BfabricOutputDataset( + type="bfabric_output_dataset", + name="My Results", + include_resources=[IncludeResourceRef(store_entry_path="custom.txt")], + ) + _save_annotations(outputs, [annotation], client=mock_client, workunit_definition=mock_workunit_definition) + assert mock_client.save.call_args[0][1]["name"] == "My Results" + + def test_no_annotations_no_save(self, mock_client, mock_workunit_definition, outputs): + _save_annotations(outputs, [], client=mock_client, workunit_definition=mock_workunit_definition) + mock_client.save.assert_not_called() diff --git a/tests/bfabric_app_runner/specs/outputs/test_annotations.py b/tests/bfabric_app_runner/specs/outputs/test_annotations.py new file mode 100644 index 000000000..08f857754 --- /dev/null +++ b/tests/bfabric_app_runner/specs/outputs/test_annotations.py @@ -0,0 +1,25 @@ +from pathlib import Path +import pytest +from bfabric_app_runner.specs.outputs.annotations import IncludeDatasetRef + + +class TestIncludeDatasetRef: + @pytest.mark.parametrize("format", ["csv", "tsv", "parquet"]) + def test_get_format_when_unspecified(self, format): + ref = IncludeDatasetRef(local_path=Path(f"dummy.{format}"), format=None) + assert ref.format is None + assert ref.get_format() == format + + @pytest.mark.parametrize("format", ["csv", "tsv", "parquet"]) + def test_get_format_when_specified(self, format): + ref = IncludeDatasetRef(local_path=Path("dummy.png"), format=format) + assert ref.format == format + assert ref.get_format() == format + + def test_validate_invalid_suffix_when_format_unspecified(self): + with pytest.raises(ValueError) as err: + _ = IncludeDatasetRef(local_path=Path("dummy.png"), format=None) + assert ( + err.value.errors()[0]["msg"] + == "Value error, When format is not specified, the file extension must be one of ('csv', 'tsv', 'parquet')" + ) diff --git a/tests/bfabric_app_runner/specs/test_outputs_spec.py b/tests/bfabric_app_runner/specs/test_outputs_spec.py index ade870ea3..376a85bea 100644 --- a/tests/bfabric_app_runner/specs/test_outputs_spec.py +++ b/tests/bfabric_app_runner/specs/test_outputs_spec.py @@ -2,6 +2,7 @@ import yaml from bfabric_app_runner.specs.outputs_spec import OutputsSpec, CopyResourceSpec, SaveDatasetSpec +from pathlib import Path @pytest.fixture() @@ -28,7 +29,8 @@ def parsed() -> OutputsSpec: @pytest.fixture() def serialized() -> str: - return """outputs: + return """annotations: [] +outputs: - local_path: local_path protocol: scp store_entry_path: store_entry_path @@ -49,3 +51,9 @@ def test_serialize(parsed, serialized): def test_parse(parsed, serialized): assert OutputsSpec.model_validate(yaml.safe_load(serialized)) == parsed + + +def test_read_yaml(tmp_path: Path, parsed: OutputsSpec, serialized: str) -> None: + path = tmp_path / "outputs.yml" + path.write_text(serialized) + assert OutputsSpec.read_yaml(path) == parsed