|
7 | 7 | import time |
8 | 8 | import random |
9 | 9 | import re |
| 10 | +import uuid |
10 | 11 | from dataclasses import dataclass, field |
11 | 12 | from traceback import format_exc |
12 | 13 | from jinja2 import Environment, FileSystemLoader |
|
17 | 18 | from kubernetes.client.api.custom_objects_api import CustomObjectsApi |
18 | 19 | from . import cerberus |
19 | 20 |
|
| 21 | +from krkn.rollback.config import RollbackContent |
| 22 | +from krkn.rollback.handler import RollbackHandler |
| 23 | +from krkn.rollback.serialization import Serializer |
| 24 | +from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift |
| 25 | + |
| 26 | +SCENARIO_TYPE = "pod_network_outage" |
| 27 | +ROLLBACK_RESOURCE_IDENTIFIER = "pod_network_chaos" |
| 28 | +def _resolve_run_uuid(cfg: typing.Optional[dict[str, typing.Any]]) -> str: |
| 29 | + """Resolve run_uuid from kraken config or fall back to a timestamp.""" |
| 30 | + if cfg and isinstance(cfg, dict): |
| 31 | + return cfg.get("performance_monitoring", {}).get("uuid", str(uuid.uuid4())) |
| 32 | + return str(uuid.uuid4()) |
| 33 | + |
| 34 | +# Cache rollback handlers and track registrations per run |
| 35 | +_rollback_handler_cache: dict[tuple[str, str], RollbackHandler] = {} |
| 36 | +_registered_rollbacks: set[tuple[str, str, str]] = set() |
| 37 | + |
def _get_or_create_rollback_handler(run_uuid: str, scenario_type: str) -> RollbackHandler:
    """Return the rollback handler cached under (run_uuid, scenario_type).

    On the first request for a given key a new handler (with its serializer
    and run context) is built, cached, and logged; later calls reuse it.
    """
    cache_key = (run_uuid, scenario_type)
    handler = _rollback_handler_cache.get(cache_key)
    if handler is None:
        handler = RollbackHandler(scenario_type, Serializer(scenario_type=scenario_type))
        handler.set_context(run_uuid)
        _rollback_handler_cache[cache_key] = handler
        logging.info(f"[Rollback] Created handler for run_uuid={run_uuid}, scenario_type={scenario_type}")
    return handler
| 48 | + |
def _register_rollback_once(
    handler: RollbackHandler,
    run_uuid: str,
    scenario_type: str,
    content: RollbackContent,
) -> None:
    """Attach the rollback callable to *handler* at most once per resource.

    Registrations are tracked per (run_uuid, scenario_type,
    resource_identifier), so repeated scenario invocations within the same
    run do not register the callback twice.
    """
    registration_key = (run_uuid, scenario_type, content.resource_identifier)
    if registration_key in _registered_rollbacks:
        logging.info(f"[Rollback] Rollback already registered for resource_identifier={content.resource_identifier}, skipping")
        return
    handler.set_rollback_callable(rollback_pod_network_outage, content)
    _registered_rollbacks.add(registration_key)
    logging.info(f"[Rollback] Registered rollback for resource_identifier={content.resource_identifier}")
20 | 63 |
|
def _clear_rollback_caches() -> None:
    """Clear rollback caches (useful for testing between runs)."""
    # .clear() mutates the module-level containers in place, so no
    # ``global`` declaration is required.
    for cache in (_rollback_handler_cache, _registered_rollbacks):
        cache.clear()
| 69 | + |
21 | 70 | def get_test_pods( |
22 | 71 | pod_name: str, |
23 | 72 | pod_label: str, |
@@ -145,8 +194,8 @@ def wait_for_job( |
145 | 194 | ): |
146 | 195 | count += 1 |
147 | 196 | job_list.remove(job_name) |
148 | | - except Exception: |
149 | | - logging.warning("Exception in getting job status") |
| 197 | + except Exception as e: |
| 198 | + logging.warning(f"Exception in getting job status: {e}") |
150 | 199 | if time.time() > wait_time: |
151 | 200 | raise Exception( |
152 | 201 | "Jobs did not complete within " |
@@ -364,7 +413,6 @@ def apply_ingress_policy( |
364 | 413 | break |
365 | 414 |
|
366 | 415 | for job_body in yml_list: |
367 | | - print('jbo body' + str(job_body)) |
368 | 416 | api_response = kubecli.create_job(job_body) |
369 | 417 | if api_response is None: |
370 | 418 | raise Exception("Error creating job") |
@@ -435,7 +483,7 @@ def apply_net_policy( |
435 | 483 | yml_list = [] |
436 | 484 |
|
437 | 485 | for pod_ip in set(ips): |
438 | | - pod_inf = get_pod_interface(node, pod_ip, pod_template, bridge_name, kubecli) |
| 486 | + pod_inf = get_pod_interface(node, pod_ip, pod_template, bridge_name, kubecli, image) #added a missing argument 'image' |
439 | 487 | exec_cmd = get_egress_cmd( |
440 | 488 | test_execution, pod_inf, mod, network_params, duration |
441 | 489 | ) |
@@ -1057,12 +1105,12 @@ def pod_outage( |
1057 | 1105 | ) -> typing.Tuple[str, typing.Union[PodOutageSuccessOutput, PodOutageErrorOutput]]: |
1058 | 1106 | """ |
1059 | 1107 | Function that performs pod outage chaos scenario based |
1060 | | - on the provided confiapply_net_policyguration |
| 1108 | + on the provided configuration |
1061 | 1109 |
|
1062 | 1110 | Args: |
1063 | 1111 | params (InputParams,) |
1064 | 1112 | - The object containing the configuration for the scenario |
1065 | | -
|
| 1113 | + |
1066 | 1114 | Returns |
1067 | 1115 | A 'success' or 'error' message along with their details |
1068 | 1116 | """ |
@@ -1118,8 +1166,20 @@ def pod_outage( |
1118 | 1166 | list(node_dict.keys())[0], pod_module_template, br_name, kubecli, test_image |
1119 | 1167 | ) |
1120 | 1168 |
|
| 1169 | + # Register rollback before applying chaos |
| 1170 | + run_uuid = _resolve_run_uuid(params.kraken_config) |
| 1171 | + rollback_handler = _get_or_create_rollback_handler(run_uuid, SCENARIO_TYPE) |
| 1172 | + _register_rollback_once( |
| 1173 | + rollback_handler, |
| 1174 | + run_uuid, |
| 1175 | + SCENARIO_TYPE, |
| 1176 | + RollbackContent( |
| 1177 | + namespace=test_namespace, |
| 1178 | + resource_identifier=ROLLBACK_RESOURCE_IDENTIFIER, |
| 1179 | + ), |
| 1180 | + ) |
1121 | 1181 | for direction, ports in filter_dict.items(): |
1122 | | - pass |
| 1182 | + # This is where the actual chaos is applied, so before this we need to add content to rollback handler |
1123 | 1183 | job_list.extend( |
1124 | 1184 | apply_outage_policy( |
1125 | 1185 | node_dict, |
@@ -1388,7 +1448,18 @@ def pod_egress_shaping( |
1388 | 1448 | check_bridge_interface( |
1389 | 1449 | list(node_dict.keys())[0], pod_module_template, br_name, kubecli, test_image |
1390 | 1450 | ) |
1391 | | - |
| 1451 | + # Register rollback before applying chaos |
| 1452 | + run_uuid = _resolve_run_uuid(params.kraken_config) |
| 1453 | + rollback_handler = _get_or_create_rollback_handler(run_uuid, SCENARIO_TYPE) |
| 1454 | + _register_rollback_once( |
| 1455 | + rollback_handler, |
| 1456 | + run_uuid, |
| 1457 | + SCENARIO_TYPE, |
| 1458 | + RollbackContent( |
| 1459 | + namespace=test_namespace, |
| 1460 | + resource_identifier=ROLLBACK_RESOURCE_IDENTIFIER, |
| 1461 | + ), |
| 1462 | + ) |
1392 | 1463 | for mod in mod_lst: |
1393 | 1464 | for node, ips in node_dict.items(): |
1394 | 1465 | job_list.extend( |
@@ -1676,7 +1747,18 @@ def pod_ingress_shaping( |
1676 | 1747 | check_bridge_interface( |
1677 | 1748 | list(node_dict.keys())[0], pod_module_template, br_name, kubecli, test_image |
1678 | 1749 | ) |
1679 | | - |
| 1750 | + # Register rollback before applying chaos |
| 1751 | + run_uuid = _resolve_run_uuid(params.kraken_config) |
| 1752 | + rollback_handler = _get_or_create_rollback_handler(run_uuid, SCENARIO_TYPE) |
| 1753 | + _register_rollback_once( |
| 1754 | + rollback_handler, |
| 1755 | + run_uuid, |
| 1756 | + SCENARIO_TYPE, |
| 1757 | + RollbackContent( |
| 1758 | + namespace=test_namespace, |
| 1759 | + resource_identifier=ROLLBACK_RESOURCE_IDENTIFIER, |
| 1760 | + ), |
| 1761 | + ) |
1680 | 1762 | for mod in mod_lst: |
1681 | 1763 | for node, ips in node_dict.items(): |
1682 | 1764 | job_list.extend( |
@@ -1731,3 +1813,174 @@ def pod_ingress_shaping( |
1731 | 1813 | delete_virtual_interfaces(kubecli, node_dict.keys(), pod_module_template, test_image) |
1732 | 1814 | logging.info("Deleting jobs(if any)") |
1733 | 1815 | delete_jobs(kubecli, job_list[:]) |
| 1816 | + |
| 1817 | + |
| 1818 | +def _sanitize_name(value: str, max_len: int = 63) -> str: |
| 1819 | + """ |
| 1820 | + Function to create a random cleanup pod name that is safe to use as a K8s resource name. |
| 1821 | + DNS-1123–safe name, lowered, non-alnum => '-', trimmed to max_len. |
| 1822 | + """ |
| 1823 | + safe = re.sub(r"[^a-z0-9-]", "-", value.lower()) |
| 1824 | + safe = re.sub(r"-+", "-", safe).strip("-") |
| 1825 | + return safe[:max_len] or "pod" |
| 1826 | + |
| 1827 | +DNS_1123_RE = re.compile(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$") |
| 1828 | + |
| 1829 | +def _is_valid_node_name(name: str) -> bool: |
| 1830 | + """Validate node name matches K8s DNS-1123 rules.""" |
| 1831 | + return ( |
| 1832 | + bool(name) |
| 1833 | + and len(name) <= 63 |
| 1834 | + and DNS_1123_RE.match(name) is not None |
| 1835 | + ) |
| 1836 | + |
def _build_cleanup_pod_body(pod_name: str, node: str) -> dict:
    """Spec for a privileged host pod used to run node-level cleanup commands.

    hostPID/hostNetwork plus a '/' hostPath mount let the container chroot
    into the node filesystem and manipulate OVS/IFB state.
    """
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {"name": pod_name},
        "spec": {
            "hostPID": True,
            "hostNetwork": True,
            "nodeName": node,
            "containers": [{
                "name": "cleanup",
                "image": "quay.io/krkn-chaos/krkn:tools",
                "securityContext": {"privileged": True},
                "command": ["sleep", "3600"],
                "volumeMounts": [{"name": "host", "mountPath": "/host"}],
            }],
            "volumes": [{"name": "host", "hostPath": {"path": "/"}}],
            "restartPolicy": "Never",
        },
    }


def _wait_for_cleanup_pod_ready(
    kubecli, pod_name: str, namespace: str, max_wait: int = 60, wait_interval: int = 2
) -> None:
    """Poll until *pod_name* is Running with all containers ready.

    Raises:
        Exception: if the pod enters Failed/Unknown state, or is not ready
            within *max_wait* seconds.
    """
    elapsed = 0
    while elapsed < max_wait:
        pod_status = None
        try:
            pod_status = kubecli.read_pod(pod_name, namespace)
        except Exception as e:
            # Transient API errors: keep polling until the deadline.
            logging.warning(f"[Rollback] Waiting for pod readiness: {e}")
        if pod_status is not None:
            phase = pod_status.status.phase
            if phase == "Running":
                statuses = pod_status.status.container_statuses
                if statuses and all(c.ready for c in statuses):
                    logging.info(f"[Rollback] Cleanup pod {pod_name} is ready")
                    return
            elif phase in ("Failed", "Unknown"):
                # Fail fast on a dead pod.  NOTE: in the original code this
                # raise sat inside the same try whose except only warned, so
                # a Failed pod was silently retried until the timeout.
                raise Exception(f"Cleanup pod entered {phase} state")
        time.sleep(wait_interval)
        elapsed += wait_interval
    raise Exception(f"Cleanup pod {pod_name} did not become ready within {max_wait}s")


def _clean_node_network_state(kubecli, cleanup_pod: str, node: str) -> None:
    """Remove chaos-injected OpenFlow rules and IFB interfaces on *node*.

    Each step is best-effort: failures are logged and the remaining steps
    still run, since partial cleanup is better than none.
    """
    # Chaos flows are installed at priority 65535; try both bridge names.
    for bridge in ["br-int", "br0"]:
        try:
            logging.info(f"[Rollback] Removing flows on bridge '{bridge}' (node '{node}')")
            kubecli.exec_cmd_in_pod(
                ["/host", "ovs-ofctl", "-O", "OpenFlow13", "del-flows", bridge, "priority=65535"],
                cleanup_pod,
                "default",
                base_command="chroot"
            )
        except Exception:
            logging.exception(
                f"[Rollback] Failed to remove flows on bridge '{bridge}' (node '{node}'). "
                f"Command: ovs-ofctl del-flows {bridge} priority=65535"
            )
    # Unloading the ifb module tears down any leftover IFB interfaces.
    try:
        logging.info(f"[Rollback] Unloading IFB module (node '{node}')")
        kubecli.exec_cmd_in_pod(
            ["/host", "modprobe", "-r", "ifb"],
            cleanup_pod,
            "default",
            base_command="chroot"
        )
    except Exception:
        logging.exception(
            f"[Rollback] Failed to unload IFB module (node '{node}', pod '{cleanup_pod}')"
        )


def _delete_leftover_modtools_pods(kubecli) -> None:
    """Delete any leftover 'modtools-*' helper pods in the default namespace."""
    try:
        logging.info("[Rollback] Searching for leftover modtools pods...")
        pods = kubecli.list_pods(label_selector="", namespace="default")
        modtools_pods = [p for p in pods if p.startswith("modtools-")]
        if not modtools_pods:
            logging.info("[Rollback] No leftover modtools pods found")
            return
        logging.info(f"[Rollback] Found {len(modtools_pods)} modtools pods to clean")
        for pod_name in modtools_pods:
            try:
                logging.info(f"[Rollback] Deleting modtools pod: {pod_name}")
                kubecli.delete_pod(pod_name, "default")
            except Exception:
                logging.exception(f"[Rollback] Failed to delete modtools pod '{pod_name}'")
    except Exception:
        logging.exception("[Rollback] Failed to list/delete modtools pods")


def rollback_pod_network_outage(
    rollback_content: RollbackContent,
    telemetry_ocp: KrknTelemetryOpenshift
):
    """
    Stateless rollback for pod_network_outage scenario.

    Args:
        rollback_content (RollbackContent): The content needed for rollback.
        telemetry_ocp (KrknTelemetryOpenshift): The telemetry OpenShift client.

    Cleans all OpenFlow rules and IFB interfaces created during the pod
    network outage chaos by spawning a short-lived privileged pod on each
    node, then deletes leftover modtools pods.  Per-node failures are
    logged and do not stop cleanup of the remaining nodes; only a failure
    outside the per-node loop is re-raised.
    """
    kubecli = telemetry_ocp.get_lib_kubernetes()

    try:
        logging.info("[Rollback] Starting pod network outage rollback...")

        all_nodes = kubecli.list_nodes(label_selector="")
        if not all_nodes:
            logging.warning("[Rollback] No nodes found in cluster")
            return

        for node in all_nodes:
            if not _is_valid_node_name(node):
                logging.warning(f"[Rollback] Skipping invalid node name: {node!r}")
                continue

            logging.info(f"[Rollback] Cleaning node: {node}")
            # Timestamp + random suffix keeps names unique across retries.
            base_name = f"rollback-clean-{rollback_content.resource_identifier}-{int(time.time())}"
            cleanup_pod = _sanitize_name(f"{base_name}-{random.randint(1000, 9999)}")

            pod_created = False
            try:
                kubecli.create_pod(
                    _build_cleanup_pod_body(cleanup_pod, node), namespace="default", timeout=300
                )
                pod_created = True
                _wait_for_cleanup_pod_ready(kubecli, cleanup_pod, "default")
                _clean_node_network_state(kubecli, cleanup_pod, node)
            except Exception:
                # Best effort per node: log and continue with the next node.
                logging.exception(f"[Rollback] Failed during cleanup operations on node '{node}'")
            finally:
                # Delete the cleanup pod only if it was actually created.
                if pod_created:
                    try:
                        logging.info(f"[Rollback] Deleting cleanup pod: {cleanup_pod}")
                        kubecli.delete_pod(cleanup_pod, namespace="default")
                    except Exception:
                        logging.exception(f"[Rollback] Failed to delete cleanup pod '{cleanup_pod}'")

        _delete_leftover_modtools_pods(kubecli)

        logging.info("[Rollback] Pod network outage rollback completed")

    except Exception:
        logging.exception("[Rollback] Critical failure during rollback")
        raise  # Re-raise to signal rollback failure
0 commit comments