Merge pull request #81 from EnzymeML/measurement-docstrings

JR-1991 · web-flow · commit 5f6cdac5e8e9 · 2025-08-21T16:56:31.000+02:00
Update docstrings in `io.py`
diff --git a/pyenzyme/tabular.py b/pyenzyme/tabular.py
@@ -84,9 +84,9 @@ def read_excel(
     time_unit: str,
     data_type: DataTypes = DataTypes.CONCENTRATION,
 ):
-    """Reads a CSV file from the specified path into a measurement.
+    """Reads an Excel file from the specified path into a measurement.
 
-    This function expects the CSV file to have the following structure:
+    This function expects the Excel file to have the following structure:
 
         - time: The time points of the measurements. Should start at 0.
         - id: The ID of the measurement. Only needed in case of multiple measurements.
@@ -98,11 +98,10 @@ def read_excel(
     file, you need to have an 'id' column. Otherwise it will return a single measurement.
 
     Args:
-        path (str, pathlib.Path): The path to the CSV file.
+        path (str, pathlib.Path): The path to the Excel file.
         data_unit (str): The unit of the data.
         time_unit (str): The unit of the time.
         data_type (DataTypes): The type of the data. Default is DataTypes.CONCENTRATION.
-        sep (str): The separator of the CSV file. Default is ';'.
 
     Returns:
         list[Measurement]: A list of measurements.
@@ -156,7 +155,7 @@ def read_csv(
         data_unit (str): The unit of the data.
         time_unit (str): The unit of the time.
         data_type (DataTypes): The type of the data. Default is DataTypes.CONCENTRATION.
-        sep (str): The separator of the CSV file. Default is ';'.
+        sep (str): The separator of the CSV file. Default is '\t'.
 
     Returns:
         list[Measurement]: A list of measurements.
diff --git a/pyenzyme/versions/io.py b/pyenzyme/versions/io.py
@@ -86,15 +86,30 @@ def from_dataframe(
         data_unit: str,
         time_unit: str,
     ) -> list[v2.Measurement]:  # noqa: F405
-        """Create measurements from a pandas DataFrame.
+        """Parse a pandas DataFrame into a list of measurements.
+
+        This function expects the DataFrame to have the following structure:
+
+            - time: The time points of the measurements. Should start at 0.
+            - id: The ID of the measurement. Only needed in case of multiple measurements.
+            - [species_id]: Per column, the data of a species.
+
+        If there is no 'id' column, the function assumes that there is only one measurement
+        in the file. If there is an 'id' column, the function assumes that there are multiple
+        measurements in the file. Hence, if you want to have multiple measurements in the same
+        file, you need to have an 'id' column. Otherwise, it will return a single measurement.
 
         Args:
-            df: DataFrame containing measurement data
-            data_unit: Unit for the measurement data
-            time_unit: Unit for the time data
+            df (pd.DataFrame): The DataFrame to parse.
+            data_unit (str): The unit of the data.
+            time_unit (str): The unit of the time.
 
         Returns:
-            List of Measurement objects
+            list[Measurement]: A list of Measurement objects.
+
+        Raises:
+            FileNotFoundError: If the file does not exist.
+            ValueError: If the path is not a file.
         """
         return from_dataframe(df, data_unit, time_unit)
 
@@ -136,32 +151,72 @@ def to_sbml(
         cls,
         enzmldoc: v2.EnzymeMLDocument,
         path: Path | str | None = None,
+        verbose: bool = False,
     ) -> tuple[str, pd.DataFrame | None]:  # noqa: F405
-        """Convert an EnzymeML document to SBML format and write to a file.
+        """Convert an EnzymeML document to SBML format and write it to a file.
+
+        The systems biology markup language (SBML) is a machine-readable format for
+        representing models of biochemical reaction networks. This function converts
+        an EnzymeML document to an SBML document. Prior to serialization the EnzymeML
+        document is validated for SBML export.
+
+        Example:
+            >> import pyenzyme as pe
+            >> doc = pe.EnzymeMLDocument()
+            >> [add entities to doc]
+            >> to_sbml(doc, "example.xml")
 
         Args:
-            enzmldoc: The EnzymeML document to convert
-            path: Path to write the SBML document to
+            enzmldoc (pe.EnzymeMLDocument): The EnzymeML document to convert.
+            path (Path | str | None, optional): The output file to write the SBML document to. Defaults to None.
+            verbose (bool, optional): Whether to print warnings during SBML validation. Defaults to False.
 
         Returns:
-            Tuple of the SBML document and the measurement data, or None if path is None
+            tuple[str, pd.DataFrame]: The SBML document as a string, and a DataFrame with the measurement data.
+
+        Raises:
+            ValueError: If the EnzymeML document is not valid for SBML export.
         """
-        return to_sbml(enzmldoc, path)
+        return to_sbml(enzmldoc, path, verbose)
 
     @classmethod
     def to_petab(
         cls,
         enzmldoc: v2.EnzymeMLDocument,
         path: Path | str,
     ) -> PEtab:  # noqa: F405
-        """Convert an EnzymeML document to PEtab format and write to a file.
-
-        Args:
-            enzmldoc: The EnzymeML document to convert
-            path: Path to write the PEtab document to
-
-        Returns:
-            The PEtab object
+        """
+        Convert an EnzymeML document to a PEtab parameter estimation problem and write to file.
+
+        This function exports an EnzymeML document to the PEtab format, which is a
+        standardized format for specifying parameter estimation problems in systems biology.
+        The function creates all necessary PEtab files:
+
+        1. SBML model file: Contains the mathematical model specification
+        2. Condition table: Specifies experimental conditions
+        3. Observable table: Defines model outputs that correspond to measurements
+        4. Measurement table: Contains experimental data points
+        5. Parameter table: Defines model parameters and their estimation settings
+        6. YAML configuration file: Links all files together in a PEtab problem definition
+
+        Args
+        ----
+        enzmldoc : v2.EnzymeMLDocument
+            The EnzymeML document to convert, containing all model information,
+            measurements, and parameters.
+        path : Union[Path, str]
+            Directory path where PEtab files will be written. If the directory
+            doesn't exist, it will be created.
+
+        Returns
+        -------
+        None
+            Files are written to the specified path.
+
+        Notes
+        -----
+        The file naming convention is based on the EnzymeML document name,
+        with spaces replaced by underscores and converted to lowercase.
         """
         return to_petab(enzmldoc, path)
 
@@ -170,13 +225,20 @@ def from_sbml(
         cls,
         path: Path | str,
     ) -> v2.EnzymeMLDocument:  # noqa: F405
-        """Read an EnzymeML document from an SBML file.
+        """
+        Read an SBML file and initialize an EnzymeML document.
+
+        This function reads an SBML file from an OMEX archive, extracts all relevant
+        information, and creates an EnzymeML document with the extracted data. It handles
+        different versions of the EnzymeML format and maps SBML elements to their
+        corresponding EnzymeML entities.
 
         Args:
-            path: Path to the SBML file
+            path (Path | str): The path to the OMEX archive containing the SBML file.
 
         Returns:
-            An EnzymeMLDocument object
+            An initialized EnzymeMLDocument object with extracted units, species, vessels,
+            equations, parameters, reactions, and measurements.
         """
         return read_sbml(v2.EnzymeMLDocument, path)
 
@@ -188,11 +250,20 @@ def to_pandas(
     ) -> pd.DataFrame | dict[str, pd.DataFrame]:  # noqa: F405
         """Convert an EnzymeML document to a pandas DataFrame.
 
+        The resulting DataFrame contains the following columns:
+
+            - time: The time values of the measurement.
+            - id: The ID of the measurement.
+            - [species]: The species data of the measurement per column.
+
         Args:
-            enzmldoc: The EnzymeML document to convert
+            enzmldoc (EnzymeMLDocument): The EnzymeMLDocument object to convert
+            per_measurement (bool): If True, returns a dictionary of DataFrames keyed by measurement ID.
+                If False, returns a single DataFrame containing all measurements.
 
         Returns:
-            DataFrame containing the measurement data, or None if no measurements exist
+            pd.DataFrame or dictionary of DataFrames containing the measurement data,
+                or None if no measurements exist
         """
         df = to_pandas(enzmldoc)
 
@@ -213,17 +284,32 @@ def from_csv(
         data_type: v2.DataTypes = v2.DataTypes.CONCENTRATION,
         sep: str = "\t",
     ) -> list[v2.Measurement]:  # noqa: F405
-        """Create measurements from a CSV file.
+        """Reads a CSV file from the specified path into a measurement.
+
+        This function expects the CSV file to have the following structure:
+
+            - time: The time points of the measurements. Should start at 0.
+            - id: The ID of the measurement. Only needed in case of multiple measurements.
+            - [species_id]: Per column, the data of a species.
+
+        If there is no 'id' column, the function assumes that there is only one measurement
+        in the file. If there is an 'id' column, the function assumes that there are multiple
+        measurements in the file. Hence, if you want to have multiple measurements in the same
+        file, you need to have an 'id' column. Otherwise it will return a single measurement.
 
         Args:
-            path: Path to the CSV file
-            data_unit: Unit for the measurement data
-            time_unit: Unit for the time data
-            data_type: Type of data (default: CONCENTRATION)
-            sep: Separator used in the CSV file (default: tab)
+            path (str, pathlib.Path): The path to the CSV file.
+            data_unit (str): The unit of the data.
+            time_unit (str): The unit of the time.
+            data_type (DataTypes): The type of the data. Default is DataTypes.CONCENTRATION.
+            sep (str): The separator of the CSV file. Default is '\t'.
 
         Returns:
-            List of Measurement objects
+            list[Measurement]: A list of Measurement objects.
+
+        Raises:
+            FileNotFoundError: If the file does not exist.
+            ValueError: If the path is not a file.
         """
         return read_csv(path, data_unit, time_unit, data_type, sep)
 
@@ -235,16 +321,31 @@ def from_excel(
         time_unit: str,
         data_type: v2.DataTypes = v2.DataTypes.CONCENTRATION,
     ) -> list[v2.Measurement]:  # noqa: F405
-        """Create measurements from an Excel file.
+        """Reads an Excel file from the specified path into Measurement objects.
+
+        This function expects the Excel file to have the following structure:
+
+            - time: The time points of the measurements. Should start at 0.
+            - id: The ID of the measurement. Only needed in case of multiple measurements.
+            - [species_id]: Per column, the data of a species.
+
+        If there is no 'id' column, the function assumes that there is only one measurement
+        in the file. If there is an 'id' column, the function assumes that there are multiple
+        measurements in the file. Hence, if you want to have multiple measurements in the same
+        file, you need to have an 'id' column. Otherwise it will return a single measurement.
 
         Args:
-            path: Path to the Excel file
-            data_unit: Unit for the measurement data
-            time_unit: Unit for the time data
-            data_type: Type of data (default: CONCENTRATION)
+            path (str, pathlib.Path): The path to the Excel file.
+            data_unit (str): The unit of the data.
+            time_unit (str): The unit of the time.
+            data_type (DataTypes): The type of the data. Default is DataTypes.CONCENTRATION.
 
         Returns:
-            List of Measurement objects
+            list[Measurement]: A list of measurements.
+
+        Raises:
+            FileNotFoundError: If the file does not exist.
+            ValueError: If the path is not a file.
         """
         return read_excel(path, data_unit, time_unit, data_type)