Fix pyright type errors and AWS Bedrock semantic tag bug (#47)

pragyan-amp · web-flow · commit bee191d44c24 · 2025-12-08T22:33:07.000+05:30
## Summary

- Resolved all 74 pyright type errors across the codebase
- Fixed a bug where the AWS Bedrock LLM assigned semantic tags to numeric
  columns (LONG, BIGINT, etc.), causing Stitch jobs to fail with
  "Semantics on fields of type LONG are not supported"

## Changes

**Type fixes:**
- Aligned `ModelInfo` TypedDict across LLM providers
- Fixed `WizardState.models` type hint
- Added proper None handling and type narrowing throughout
- Fixed except clause ordering and Callable imports

**Stitch/PII fix:**
- Added a `NUMERIC_TYPES` filter in `stitch_tools.py` to strip semantics
  from numeric columns
- Updated the PII detection prompt to instruct the LLM not to tag numeric
  columns

## Test plan

- [x] All 626 unit tests pass
- [x] Pyright reports 0 errors
- [x] Ruff linting passes
diff --git a/chuck_data/agent/manager.py b/chuck_data/agent/manager.py
@@ -210,29 +210,36 @@ def process_with_tools(self, tools, max_iterations: int = 20):
             if response_message.tool_calls:
                 # Add the assistant's response (requesting tool calls) to history
                 # Convert ChatCompletionMessage to dict format for consistency
+                tool_calls_list = []
+                for tc in response_message.tool_calls:
+                    func = getattr(tc, "function", None)
+                    if func is not None:
+                        tool_calls_list.append(
+                            {
+                                "id": tc.id,
+                                "type": getattr(tc, "type", "function"),
+                                "function": {
+                                    "name": getattr(func, "name", ""),
+                                    "arguments": getattr(func, "arguments", "{}"),
+                                },
+                            }
+                        )
                 assistant_msg = {
                     "role": "assistant",
                     "content": response_message.content,
-                    "tool_calls": [
-                        {
-                            "id": tc.id,
-                            "type": getattr(tc, "type", "function"),
-                            "function": {
-                                "name": tc.function.name,
-                                "arguments": tc.function.arguments,
-                            },
-                        }
-                        for tc in response_message.tool_calls
-                    ],
+                    "tool_calls": tool_calls_list,
                 }
                 self.conversation_history.append(assistant_msg)
 
                 # Execute each tool call
                 for tool_call in response_message.tool_calls:
-                    tool_name = tool_call.function.name
+                    func = getattr(tool_call, "function", None)
+                    if func is None:
+                        continue
+                    tool_name = getattr(func, "name", "")
                     tool_id = tool_call.id
                     try:
-                        tool_args = json.loads(tool_call.function.arguments)
+                        tool_args = json.loads(getattr(func, "arguments", "{}"))
                         tool_result = execute_tool(
                             self.api_client,
                             tool_name,
@@ -276,7 +283,7 @@ def process_with_tools(self, tools, max_iterations: int = 20):
                 continue
             else:
                 # No tool calls, this is the final response
-                final_content = response_message.content
+                final_content = response_message.content or ""
                 # remove all lines with any <function> tags
                 final_content = "\n".join(
                     line
diff --git a/chuck_data/agent/tool_executor.py b/chuck_data/agent/tool_executor.py
@@ -23,7 +23,8 @@
 from chuck_data.clients.databricks import (
     DatabricksAPIClient,
 )  # For type hinting api_client
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional, List, Callable
+from jsonschema.exceptions import ValidationError
 
 
 # The display_to_user utility and individual tool implementation functions
@@ -48,7 +49,7 @@ def execute_tool(
     api_client: Optional[DatabricksAPIClient],
     tool_name: str,
     tool_args: Dict[str, Any],
-    output_callback: Optional[callable] = None,
+    output_callback: Optional[Callable[..., Any]] = None,
 ) -> Dict[str, Any]:
     """Execute a tool (command) by its name with the provided arguments.
 
@@ -87,7 +88,7 @@ def execute_tool(
     try:
         jsonschema.validate(instance=tool_args, schema=schema_to_validate)
         logging.debug(f"Tool arguments for '{tool_name}' validated successfully.")
-    except jsonschema.exceptions.ValidationError as ve:
+    except ValidationError as ve:
         logging.error(
             f"Validation error for tool '{tool_name}' args {tool_args}: {ve.message}"
         )
diff --git a/chuck_data/api_client.py b/chuck_data/api_client.py
@@ -135,6 +135,8 @@ def upload_file(self, path, file_path=None, content=None, overwrite=False):
                 binary_data = f.read()
         else:
             # Convert string content to bytes
+            # content is guaranteed non-None by the validation above
+            assert content is not None
             binary_data = content.encode("utf-8")
 
         try:
diff --git a/chuck_data/clients/amperity.py b/chuck_data/clients/amperity.py
@@ -8,6 +8,7 @@
 import webbrowser
 import readchar
 import json
+from typing import Optional
 from rich.console import Console
 
 from chuck_data.config import set_amperity_token
@@ -113,7 +114,7 @@ def get_auth_status(self) -> dict:
         return {"state": self.state, "nonce": self.nonce, "has_token": bool(self.token)}
 
     def wait_for_auth_completion(
-        self, poll_interval: int = 1, timeout: int = None
+        self, poll_interval: int = 1, timeout: Optional[int] = None
     ) -> tuple[bool, str]:
         """Wait for authentication to complete in a blocking manner."""
         if not self.nonce:
diff --git a/chuck_data/clients/databricks.py b/chuck_data/clients/databricks.py
@@ -724,6 +724,8 @@ def upload_file(self, path, file_path=None, content=None, overwrite=False):
                 binary_data = f.read()
         else:
             # Convert string content to bytes
+            # content is guaranteed non-None by the validation above
+            assert content is not None
             binary_data = content.encode("utf-8")
 
         try:
diff --git a/chuck_data/command_output.py b/chuck_data/command_output.py
@@ -10,7 +10,7 @@
 
 from chuck_data.ui.table_formatter import display_table
 
-from chuck_data.command_result import CommandResult
+from chuck_data.commands.base import CommandResult
 from chuck_data.ui.theme import (
     SUCCESS,
     WARNING,
@@ -132,7 +132,7 @@ def format_for_agent(result: CommandResult) -> Dict[str, Any]:
             }
 
         # Start with a base response
-        response = {"success": True}
+        response: Dict[str, Any] = {"success": True}
 
         # Add the message if available
         if result.message:
diff --git a/chuck_data/commands/catalog_selection.py b/chuck_data/commands/catalog_selection.py
@@ -64,7 +64,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
         client: API client instance
         **kwargs: catalog (str) - catalog name, tool_output_callback (optional)
     """
-    catalog: str = kwargs.get("catalog")
+    catalog = kwargs.get("catalog")
     tool_output_callback = kwargs.get("tool_output_callback")
 
     if not catalog:
diff --git a/chuck_data/commands/job_status.py b/chuck_data/commands/job_status.py
@@ -639,7 +639,7 @@ def handle_list_jobs(client=None, **kwargs) -> CommandResult:
         cached_job_data = job_entry.get("job_data")
 
         # If we have cached data for a terminal state, use it
-        if cached_job_data:
+        if cached_job_data and isinstance(cached_job_data, dict):
             state = (cached_job_data.get("state") or "").lower().replace(":", "")
             # Only use cache for terminal states (succeeded, failed, unknown)
             if state in ["succeeded", "success", "failed", "error", "unknown"]:
diff --git a/chuck_data/commands/jobs.py b/chuck_data/commands/jobs.py
@@ -13,8 +13,8 @@ def handle_launch_job(client: Optional[DatabricksAPIClient], **kwargs) -> Comman
         client: API client instance
         **kwargs: config_path (str), init_script_path (str), run_name (str, optional), tool_output_callback (callable, optional)
     """
-    config_path: str = kwargs.get("config_path")
-    init_script_path: str = kwargs.get("init_script_path")
+    config_path = kwargs.get("config_path")
+    init_script_path = kwargs.get("init_script_path")
     run_name: Optional[str] = kwargs.get("run_name")
     tool_output_callback = kwargs.get("tool_output_callback")
     policy_id: Optional[str] = kwargs.get("policy_id")
diff --git a/chuck_data/commands/model_selection.py b/chuck_data/commands/model_selection.py
@@ -24,7 +24,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
         client: API client instance (used for Databricks provider)
         **kwargs: model_name (str)
     """
-    model_name: str = kwargs.get("model_name")
+    model_name = kwargs.get("model_name")
     if not model_name:
         return CommandResult(False, message="model_name parameter is required.")
 
@@ -45,7 +45,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
         models_list = provider.list_models(tool_calling_only=False)
 
         # Extract model IDs (field name varies by provider)
-        model_ids = [m.get("model_id") or m.get("name") for m in models_list]
+        model_ids = [m.get("model_id") or m.get("name") or "" for m in models_list]
 
         # Validate model exists
         if model_name not in model_ids:
diff --git a/chuck_data/commands/pii_tools.py b/chuck_data/commands/pii_tools.py
@@ -26,17 +26,16 @@ def _helper_tag_pii_columns_logic(
     response_content_for_error = ""
     try:
         # Resolve full table name using APIs directly instead of handler
-        table_details_kwargs = {"full_name": table_name_param}
+        resolved_table_name = table_name_param
         if catalog_name_context and schema_name_context and "." not in table_name_param:
             # Only a table name was provided, construct full name
-            full_name = (
+            resolved_table_name = (
                 f"{catalog_name_context}.{schema_name_context}.{table_name_param}"
             )
-            table_details_kwargs = {"full_name": full_name}
 
         try:
             # Use direct API call instead of handle_table
-            table_info = databricks_client.get_table(**table_details_kwargs)
+            table_info = databricks_client.get_table(full_name=resolved_table_name)
             if not table_info:
                 error_msg = f"Failed to retrieve table details for PII tagging: {table_name_param}"
                 return {
@@ -88,6 +87,8 @@ def _helper_tag_pii_columns_logic(
             "and assign a PII semantic tag to each column if applicable. Use ONLY the following PII semantic tags: "
             "address, address2, birthdate, city, country, create-dt, email, full-name, gender, generational-suffix, "
             "given-name, phone, postal, state, surname, title, update-dt. If a column does not contain PII, assign null. "
+            "IMPORTANT: Do NOT assign semantic tags to numeric columns (types: LONG, BIGINT, INT, INTEGER, SMALLINT, "
+            "TINYINT, DOUBLE, FLOAT, DECIMAL, NUMERIC). Always assign null to numeric columns. "
             "Respond ONLY with a valid JSON list of objects, where each object represents a column and has the following structure: "
             '{"name": "column_name", "semantic": "pii_tag_or_null"}. '
             "Maintain original order. No explanations or introductory text."
@@ -100,9 +101,9 @@ def _helper_tag_pii_columns_logic(
                 {"role": "user", "content": user_prompt},
             ]
         )
-        response_content_for_error = llm_response_obj.choices[
-            0
-        ].message.content  # Store for potential error reporting
+        response_content_for_error = (
+            llm_response_obj.choices[0].message.content or ""
+        )  # Store for potential error reporting
         response_content_clean = response_content_for_error.strip()
         if response_content_clean.startswith("```json"):
             response_content_clean = response_content_clean[7:-3].strip()
diff --git a/chuck_data/commands/schema_selection.py b/chuck_data/commands/schema_selection.py
@@ -65,7 +65,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
         client: API client instance
         **kwargs: schema (str) - schema name, tool_output_callback (optional)
     """
-    schema: str = kwargs.get("schema")
+    schema = kwargs.get("schema")
     tool_output_callback = kwargs.get("tool_output_callback")
 
     if not schema:
diff --git a/chuck_data/commands/setup_stitch.py b/chuck_data/commands/setup_stitch.py
@@ -736,7 +736,7 @@ def _build_post_launch_guidance_message(launch_result, metadata, client=None):
         )
 
     # Get workspace URL for constructing browser links
-    workspace_url = get_workspace_url()
+    workspace_url = get_workspace_url() or ""
     # If workspace_url is already a full URL, normalize it to get just the workspace ID
     # If it's just the workspace ID, this will return it as-is
     workspace_id = normalize_workspace_url(workspace_url)
diff --git a/chuck_data/commands/setup_wizard.py b/chuck_data/commands/setup_wizard.py
@@ -248,7 +248,9 @@ def _clear_context(self):
 
 
 def handle_command(
-    client: Optional[DatabricksAPIClient], interactive_input: str = None, **kwargs: Any
+    client: Optional[DatabricksAPIClient],
+    interactive_input: Optional[str] = None,
+    **kwargs: Any,
 ) -> CommandResult:
     """
     Setup wizard command handler using the new architecture.
diff --git a/chuck_data/commands/sql_external_data.py b/chuck_data/commands/sql_external_data.py
@@ -125,7 +125,7 @@ def get_paginated_rows(
         if start_row < chunk_end and current_row < start_row + num_rows:
             # We need some data from this chunk
             try:
-                chunk_data = fetch_chunk_data([link], link.get("chunk_index"))
+                chunk_data = fetch_chunk_data([link], link.get("chunk_index", 0))
                 if chunk_data:
                     # Calculate which rows from this chunk we need
                     local_start = max(0, start_row - chunk_start)
diff --git a/chuck_data/commands/stitch_tools.py b/chuck_data/commands/stitch_tools.py
@@ -28,6 +28,20 @@
     "GEOMETRY",
 ]
 
+# Numeric types that don't support semantic tags in Stitch
+NUMERIC_TYPES = [
+    "LONG",
+    "BIGINT",
+    "INT",
+    "INTEGER",
+    "SMALLINT",
+    "TINYINT",
+    "DOUBLE",
+    "FLOAT",
+    "DECIMAL",
+    "NUMERIC",
+]
+
 
 def validate_multi_location_access(
     client: DatabricksAPIClient, locations: List[Dict[str, str]]
@@ -201,7 +215,11 @@ def _helper_prepare_stitch_config(
                     "type": col_data["type"],
                     "semantics": [],
                 }
-                if col_data.get("semantic"):  # Only add non-null/empty semantics
+                # Only add semantics for non-numeric types (Stitch doesn't support semantics on LONG, etc.)
+                if (
+                    col_data.get("semantic")
+                    and col_data["type"].upper() not in NUMERIC_TYPES
+                ):
                     field_cfg["semantics"].append(col_data["semantic"])
                 table_cfg["fields"].append(field_cfg)
             else:
diff --git a/chuck_data/commands/tag_pii.py b/chuck_data/commands/tag_pii.py
@@ -31,7 +31,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
             table_name (str): Name of the table to tag
             pii_columns (list): List of columns with PII semantic info
     """
-    table_name: str = kwargs.get("table_name")
+    table_name = kwargs.get("table_name")
     pii_columns: List[Dict[str, Any]] = kwargs.get("pii_columns", [])
 
     if not table_name:
diff --git a/chuck_data/commands/upload_file.py b/chuck_data/commands/upload_file.py
@@ -64,6 +64,8 @@ def handle_command(
                     path=destination_path, contents=contents, overwrite=overwrite
                 )
             else:
+                # local_path is guaranteed non-None by validation above
+                assert local_path is not None
                 with open(local_path, "r") as file:
                     file_contents = file.read()
                 client.store_dbfs_file(
diff --git a/chuck_data/commands/warehouse_selection.py b/chuck_data/commands/warehouse_selection.py
@@ -65,7 +65,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
         client: API client instance
         **kwargs: warehouse (str) - warehouse ID or name, tool_output_callback (optional)
     """
-    warehouse: str = kwargs.get("warehouse")
+    warehouse = kwargs.get("warehouse")
     tool_output_callback = kwargs.get("tool_output_callback")
 
     # Must provide warehouse parameter
@@ -125,6 +125,8 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
                 _report_step(f"Found warehouse '{selected_name}'", tool_output_callback)
 
         # Set the active warehouse
+        # target_warehouse is guaranteed to be a dict at this point
+        assert isinstance(target_warehouse, dict)
         warehouse_id_to_set = target_warehouse.get("id")
         warehouse_display_name = target_warehouse.get("name", "Unknown")
         warehouse_state = target_warehouse.get("state", "Unknown")
diff --git a/chuck_data/commands/wizard/renderer.py b/chuck_data/commands/wizard/renderer.py
@@ -5,11 +5,12 @@
 import platform
 import subprocess
 import logging
-from typing import List, Dict, Any
+from typing import List
 from rich.console import Console
 from rich.table import Table
 from rich import box
 
+from chuck_data.llm.provider import ModelInfo
 from .state import WizardState, WizardStep
 from .steps import SetupStep
 
@@ -105,7 +106,7 @@ def render_completion(self):
         self.console.print("You are now ready to use Chuck with all features enabled.")
         self.console.print("Type /help to see available commands.")
 
-    def _render_models_list(self, models: List[Dict[str, Any]]):
+    def _render_models_list(self, models: List[ModelInfo]):
         """Render the list of available models."""
         if not models:
             self.render_warning("No models available.")
diff --git a/chuck_data/commands/wizard/state.py b/chuck_data/commands/wizard/state.py
@@ -6,6 +6,8 @@
 from enum import Enum
 from typing import Dict, List, Optional, Any
 
+from chuck_data.llm.provider import ModelInfo
+
 
 class WizardStep(Enum):
     """Steps in the setup wizard."""
@@ -38,7 +40,7 @@ class WizardState:
     workspace_url: Optional[str] = None
     token: Optional[str] = None
     llm_provider: Optional[str] = None
-    models: List[Dict[str, Any]] = field(default_factory=list)
+    models: List[ModelInfo] = field(default_factory=list)
     selected_model: Optional[str] = None
     usage_consent: Optional[bool] = None
     error_message: Optional[str] = None
diff --git a/chuck_data/commands/workspace_selection.py b/chuck_data/commands/workspace_selection.py
@@ -22,7 +22,7 @@ def handle_command(client: Optional[DatabricksAPIClient], **kwargs) -> CommandRe
         client: API client instance (not used by this handler)
         **kwargs: workspace_url (str)
     """
-    workspace_url: str = kwargs.get("workspace_url")
+    workspace_url = kwargs.get("workspace_url")
     if not workspace_url:
         return CommandResult(False, message="workspace_url parameter is required.")
 
diff --git a/chuck_data/interactive_context.py b/chuck_data/interactive_context.py
diff --git a/chuck_data/llm/factory.py b/chuck_data/llm/factory.py
diff --git a/chuck_data/llm/provider.py b/chuck_data/llm/provider.py
diff --git a/chuck_data/llm/providers/aws_bedrock.py b/chuck_data/llm/providers/aws_bedrock.py
diff --git a/chuck_data/llm/providers/databricks.py b/chuck_data/llm/providers/databricks.py
diff --git a/chuck_data/metrics_collector.py b/chuck_data/metrics_collector.py
diff --git a/chuck_data/service.py b/chuck_data/service.py
diff --git a/chuck_data/ui/table_formatter.py b/chuck_data/ui/table_formatter.py
diff --git a/chuck_data/ui/tui.py b/chuck_data/ui/tui.py
diff --git a/tests/unit/llm/providers/test_aws_bedrock.py b/tests/unit/llm/providers/test_aws_bedrock.py