Skip to content

Commit fa3ebf4

Browse files
committed
feat: implement rollback support for pod network outage scenarios with testing
- Add rollback handler caching to prevent duplicate registrations across pod_outage, pod_egress_shaping, and pod_ingress_shaping - Implement stateless rollback_pod_network_outage function with pod readiness checks (60s timeout) - Validate node names (DNS-1123) before creating privileged cleanup pods - Clean OpenFlow rules (priority=65535) on br-int and br0 bridges - Unload IFB kernel module to remove ingress shaping virtual interfaces - Delete leftover modtools pods created during chaos execution - Add UUID-based fallback for run_uuid generation - Add missing image argument to apply_net_policy function - Improve error handling with exception logging and tracebacks - Add comprehensive unit tests for rollback logic with mock pod readiness - Update .gitignore to ignore generated CI/config YAML files (keep only common_test_config.yaml) Fixes #911 Signed-off-by: Lokesh2Arvind <lokiarvstud.gn@gmail.com>
1 parent b3a5fc2 commit fa3ebf4

File tree

4 files changed

+348
-10
lines changed

4 files changed

+348
-10
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ inspect.local.*
6060

6161
# Tests
6262
!CI/config/common_test_config.yaml
63+
CI/config/*.yaml
6364
CI/out/*
6465
CI/ci_results
6566
CI/legacy/*node.yaml

krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py

Lines changed: 262 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import time
88
import random
99
import re
10+
import uuid
1011
from dataclasses import dataclass, field
1112
from traceback import format_exc
1213
from jinja2 import Environment, FileSystemLoader
@@ -17,7 +18,55 @@
1718
from kubernetes.client.api.custom_objects_api import CustomObjectsApi
1819
from . import cerberus
1920

21+
from krkn.rollback.config import RollbackContent
22+
from krkn.rollback.handler import RollbackHandler
23+
from krkn.rollback.serialization import Serializer
24+
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
25+
26+
SCENARIO_TYPE = "pod_network_outage"
27+
ROLLBACK_RESOURCE_IDENTIFIER = "pod_network_chaos"
28+
def _resolve_run_uuid(cfg: typing.Optional[dict[str, typing.Any]]) -> str:
29+
"""Resolve run_uuid from kraken config or fall back to a timestamp."""
30+
if cfg and isinstance(cfg, dict):
31+
return cfg.get("performance_monitoring", {}).get("uuid", str(uuid.uuid4()))
32+
return str(uuid.uuid4())
33+
34+
# Cache rollback handlers and track registrations per run, so repeated
# scenario invocations within a single run reuse one handler and never
# register the same rollback callback twice.
# Keyed by (run_uuid, scenario_type).
_rollback_handler_cache: dict[tuple[str, str], RollbackHandler] = {}
# Entries are (run_uuid, scenario_type, resource_identifier).
_registered_rollbacks: set[tuple[str, str, str]] = set()
37+
38+
def _get_or_create_rollback_handler(run_uuid: str, scenario_type: str) -> RollbackHandler:
    """Return the rollback handler for (run_uuid, scenario_type), creating it on first use."""
    cache_key = (run_uuid, scenario_type)
    cached = _rollback_handler_cache.get(cache_key)
    if cached is not None:
        return cached
    # First request for this run/scenario pair: build, contextualize, cache.
    new_handler = RollbackHandler(scenario_type, Serializer(scenario_type=scenario_type))
    new_handler.set_context(run_uuid)
    _rollback_handler_cache[cache_key] = new_handler
    logging.info(f"[Rollback] Created handler for run_uuid={run_uuid}, scenario_type={scenario_type}")
    return new_handler
48+
49+
def _register_rollback_once(
    handler: RollbackHandler,
    run_uuid: str,
    scenario_type: str,
    content: RollbackContent,
) -> None:
    """Attach the rollback callable for this resource at most once per run."""
    registration_key = (run_uuid, scenario_type, content.resource_identifier)
    if registration_key in _registered_rollbacks:
        # Already wired up for this run — registering again would duplicate
        # the rollback work.
        logging.info(f"[Rollback] Rollback already registered for resource_identifier={content.resource_identifier}, skipping")
        return
    handler.set_rollback_callable(rollback_pod_network_outage, content)
    _registered_rollbacks.add(registration_key)
    logging.info(f"[Rollback] Registered rollback for resource_identifier={content.resource_identifier}")
2063

64+
def _clear_rollback_caches() -> None:
    """Clear rollback caches (useful for testing between runs)."""
    # .clear() mutates the existing dict/set in place; the module-level
    # names are never rebound, so no `global` declaration is needed.
    _rollback_handler_cache.clear()
    _registered_rollbacks.clear()
69+
2170
def get_test_pods(
2271
pod_name: str,
2372
pod_label: str,
@@ -145,8 +194,8 @@ def wait_for_job(
145194
):
146195
count += 1
147196
job_list.remove(job_name)
148-
except Exception:
149-
logging.warning("Exception in getting job status")
197+
except Exception as e:
198+
logging.warning(f"Exception in getting job status: {e}")
150199
if time.time() > wait_time:
151200
raise Exception(
152201
"Jobs did not complete within "
@@ -364,7 +413,6 @@ def apply_ingress_policy(
364413
break
365414

366415
for job_body in yml_list:
367-
print('jbo body' + str(job_body))
368416
api_response = kubecli.create_job(job_body)
369417
if api_response is None:
370418
raise Exception("Error creating job")
@@ -435,7 +483,7 @@ def apply_net_policy(
435483
yml_list = []
436484

437485
for pod_ip in set(ips):
438-
pod_inf = get_pod_interface(node, pod_ip, pod_template, bridge_name, kubecli)
486+
pod_inf = get_pod_interface(node, pod_ip, pod_template, bridge_name, kubecli, image) #added a missing argument 'image'
439487
exec_cmd = get_egress_cmd(
440488
test_execution, pod_inf, mod, network_params, duration
441489
)
@@ -1057,12 +1105,12 @@ def pod_outage(
10571105
) -> typing.Tuple[str, typing.Union[PodOutageSuccessOutput, PodOutageErrorOutput]]:
10581106
"""
10591107
Function that performs pod outage chaos scenario based
1060-
on the provided confiapply_net_policyguration
1108+
on the provided configuration
10611109
10621110
Args:
10631111
params (InputParams,)
10641112
- The object containing the configuration for the scenario
1065-
1113+
10661114
Returns
10671115
A 'success' or 'error' message along with their details
10681116
"""
@@ -1118,8 +1166,20 @@ def pod_outage(
11181166
list(node_dict.keys())[0], pod_module_template, br_name, kubecli, test_image
11191167
)
11201168

1169+
# Register rollback before applying chaos
1170+
run_uuid = _resolve_run_uuid(params.kraken_config)
1171+
rollback_handler = _get_or_create_rollback_handler(run_uuid, SCENARIO_TYPE)
1172+
_register_rollback_once(
1173+
rollback_handler,
1174+
run_uuid,
1175+
SCENARIO_TYPE,
1176+
RollbackContent(
1177+
namespace=test_namespace,
1178+
resource_identifier=ROLLBACK_RESOURCE_IDENTIFIER,
1179+
),
1180+
)
11211181
for direction, ports in filter_dict.items():
1122-
pass
1182+
# This is where the actual chaos is applied, so before this we need to add content to rollback handler
11231183
job_list.extend(
11241184
apply_outage_policy(
11251185
node_dict,
@@ -1388,7 +1448,18 @@ def pod_egress_shaping(
13881448
check_bridge_interface(
13891449
list(node_dict.keys())[0], pod_module_template, br_name, kubecli, test_image
13901450
)
1391-
1451+
# Register rollback before applying chaos
1452+
run_uuid = _resolve_run_uuid(params.kraken_config)
1453+
rollback_handler = _get_or_create_rollback_handler(run_uuid, SCENARIO_TYPE)
1454+
_register_rollback_once(
1455+
rollback_handler,
1456+
run_uuid,
1457+
SCENARIO_TYPE,
1458+
RollbackContent(
1459+
namespace=test_namespace,
1460+
resource_identifier=ROLLBACK_RESOURCE_IDENTIFIER,
1461+
),
1462+
)
13921463
for mod in mod_lst:
13931464
for node, ips in node_dict.items():
13941465
job_list.extend(
@@ -1676,7 +1747,18 @@ def pod_ingress_shaping(
16761747
check_bridge_interface(
16771748
list(node_dict.keys())[0], pod_module_template, br_name, kubecli, test_image
16781749
)
1679-
1750+
# Register rollback before applying chaos
1751+
run_uuid = _resolve_run_uuid(params.kraken_config)
1752+
rollback_handler = _get_or_create_rollback_handler(run_uuid, SCENARIO_TYPE)
1753+
_register_rollback_once(
1754+
rollback_handler,
1755+
run_uuid,
1756+
SCENARIO_TYPE,
1757+
RollbackContent(
1758+
namespace=test_namespace,
1759+
resource_identifier=ROLLBACK_RESOURCE_IDENTIFIER,
1760+
),
1761+
)
16801762
for mod in mod_lst:
16811763
for node, ips in node_dict.items():
16821764
job_list.extend(
@@ -1731,3 +1813,174 @@ def pod_ingress_shaping(
17311813
delete_virtual_interfaces(kubecli, node_dict.keys(), pod_module_template, test_image)
17321814
logging.info("Deleting jobs(if any)")
17331815
delete_jobs(kubecli, job_list[:])
1816+
1817+
1818+
def _sanitize_name(value: str, max_len: int = 63) -> str:
1819+
"""
1820+
Function to create a random cleanup pod name that is safe to use as a K8s resource name.
1821+
DNS-1123–safe name, lowered, non-alnum => '-', trimmed to max_len.
1822+
"""
1823+
safe = re.sub(r"[^a-z0-9-]", "-", value.lower())
1824+
safe = re.sub(r"-+", "-", safe).strip("-")
1825+
return safe[:max_len] or "pod"
1826+
1827+
DNS_1123_RE = re.compile(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$")
1828+
1829+
def _is_valid_node_name(name: str) -> bool:
1830+
"""Validate node name matches K8s DNS-1123 rules."""
1831+
return (
1832+
bool(name)
1833+
and len(name) <= 63
1834+
and DNS_1123_RE.match(name) is not None
1835+
)
1836+
1837+
def _build_cleanup_pod_body(pod_name: str, node: str) -> dict:
    """Build the manifest for a privileged host-chroot cleanup pod pinned to *node*."""
    return {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {"name": pod_name},
        "spec": {
            "hostPID": True,
            "hostNetwork": True,
            "nodeName": node,
            "containers": [{
                "name": "cleanup",
                "image": "quay.io/krkn-chaos/krkn:tools",
                "securityContext": {"privileged": True},
                # Keep the container alive long enough to exec cleanup commands.
                "command": ["sleep", "3600"],
                "volumeMounts": [{"name": "host", "mountPath": "/host"}],
            }],
            "volumes": [{"name": "host", "hostPath": {"path": "/"}}],
            "restartPolicy": "Never",
        },
    }


def _wait_for_pod_ready(kubecli, pod_name: str, namespace: str = "default", max_wait: int = 60) -> None:
    """Block until *pod_name* is Running with all containers ready.

    Args:
        kubecli: krkn-lib Kubernetes client.
        pod_name: Name of the pod to wait for.
        namespace: Namespace of the pod.
        max_wait: Maximum seconds to wait.

    Raises:
        Exception: when the pod enters Failed/Unknown, or is not ready
            within *max_wait* seconds.
    """
    wait_interval = 2
    deadline = time.time() + max_wait
    while time.time() < deadline:
        phase = None
        pod_status = None
        try:
            pod_status = kubecli.read_pod(pod_name, namespace)
            phase = pod_status.status.phase
        except Exception as e:
            # Transient API errors while the pod is starting are expected.
            logging.warning(f"[Rollback] Waiting for pod readiness: {e}")
        if phase == "Running":
            statuses = pod_status.status.container_statuses
            if statuses and all(c.ready for c in statuses):
                logging.info(f"[Rollback] Cleanup pod {pod_name} is ready")
                return
        elif phase in ("Failed", "Unknown"):
            # Fail fast on terminal phases. NOTE: the raise lives outside the
            # try above so it is not swallowed by the readiness except clause
            # (which previously forced a full timeout on Failed pods).
            raise Exception(f"Cleanup pod entered {phase} state")
        time.sleep(wait_interval)
    raise Exception(f"Cleanup pod {pod_name} did not become ready within {max_wait}s")


def _clean_node_network(kubecli, node: str, cleanup_pod: str) -> None:
    """Run the per-node network cleanup commands through the privileged pod.

    Best-effort: each sub-step logs and continues on failure so one broken
    bridge doesn't block the rest of the cleanup.
    """
    # Remove the high-priority flows installed by the chaos run on both
    # OVN-Kubernetes (br-int) and OpenShift SDN (br0) bridges.
    for bridge in ["br-int", "br0"]:
        try:
            logging.info(f"[Rollback] Removing flows on bridge '{bridge}' (node '{node}')")
            kubecli.exec_cmd_in_pod(
                ["/host", "ovs-ofctl", "-O", "OpenFlow13", "del-flows", bridge, "priority=65535"],
                cleanup_pod,
                "default",
                base_command="chroot"
            )
        except Exception:
            logging.exception(
                f"[Rollback] Failed to remove flows on bridge '{bridge}' (node '{node}'). "
                f"Command: ovs-ofctl del-flows {bridge} priority=65535"
            )
    # Unloading the ifb module removes all ingress-shaping virtual interfaces.
    try:
        logging.info(f"[Rollback] Unloading IFB module (node '{node}')")
        kubecli.exec_cmd_in_pod(
            ["/host", "modprobe", "-r", "ifb"],
            cleanup_pod,
            "default",
            base_command="chroot"
        )
    except Exception:
        logging.exception(
            f"[Rollback] Failed to unload IFB module (node '{node}', pod '{cleanup_pod}')"
        )


def _delete_modtools_pods(kubecli) -> None:
    """Best-effort deletion of leftover modtools-* pods in the default namespace."""
    try:
        logging.info("[Rollback] Searching for leftover modtools pods...")
        pods = kubecli.list_pods(label_selector="", namespace="default")
        modtools_pods = [p for p in pods if p.startswith("modtools-")]
        if not modtools_pods:
            logging.info("[Rollback] No leftover modtools pods found")
            return
        logging.info(f"[Rollback] Found {len(modtools_pods)} modtools pods to clean")
        for pod_name in modtools_pods:
            try:
                logging.info(f"[Rollback] Deleting modtools pod: {pod_name}")
                kubecli.delete_pod(pod_name, "default")
            except Exception:
                logging.exception(f"[Rollback] Failed to delete modtools pod '{pod_name}'")
    except Exception:
        logging.exception("[Rollback] Failed to list/delete modtools pods")


def rollback_pod_network_outage(
    rollback_content: RollbackContent,
    telemetry_ocp: KrknTelemetryOpenshift
):
    """
    Stateless rollback for pod_network_outage scenario.

    Spins up a privileged cleanup pod on every node, removes the
    priority-65535 OpenFlow rules on br-int/br0, unloads the ifb kernel
    module (removing ingress-shaping virtual interfaces), and deletes
    leftover modtools pods created during chaos execution.

    Args:
        rollback_content (RollbackContent): The content needed for rollback.
        telemetry_ocp (KrknTelemetryOpenshift): The telemetry OpenShift client.

    Raises:
        Exception: re-raised on a critical, non-recoverable failure so the
            rollback framework records the rollback as failed.
    """
    kubecli = telemetry_ocp.get_lib_kubernetes()

    try:
        logging.info("[Rollback] Starting pod network outage rollback...")

        all_nodes = kubecli.list_nodes(label_selector="")
        if not all_nodes:
            logging.warning("[Rollback] No nodes found in cluster")
            return

        for node in all_nodes:
            # Validate before embedding the name into a pod spec.
            if not _is_valid_node_name(node):
                logging.warning(f"[Rollback] Skipping invalid node name: {node!r}")
                continue

            logging.info(f"[Rollback] Cleaning node: {node}")
            # Timestamp + random suffix keeps names unique across retries.
            base_name = f"rollback-clean-{rollback_content.resource_identifier}-{int(time.time())}"
            cleanup_pod = _sanitize_name(f"{base_name}-{random.randint(1000, 9999)}")

            pod_created = False
            try:
                kubecli.create_pod(_build_cleanup_pod_body(cleanup_pod, node), namespace="default", timeout=300)
                pod_created = True
                _wait_for_pod_ready(kubecli, cleanup_pod)
                _clean_node_network(kubecli, node, cleanup_pod)
            except Exception:
                # Per-node failure is logged but does not abort the other nodes.
                logging.exception(f"[Rollback] Failed during cleanup operations on node '{node}'")
            finally:
                # Delete cleanup pod only if it was actually created.
                if pod_created:
                    try:
                        logging.info(f"[Rollback] Deleting cleanup pod: {cleanup_pod}")
                        kubecli.delete_pod(cleanup_pod, namespace="default")
                    except Exception:
                        logging.exception(f"[Rollback] Failed to delete cleanup pod '{cleanup_pod}'")

        # Modtools pods are cluster-scoped leftovers; one sweep suffices.
        _delete_modtools_pods(kubecli)

        logging.info("[Rollback] Pod network outage rollback completed")

    except Exception:
        logging.exception("[Rollback] Critical failure during rollback")
        raise  # Re-raise to signal rollback failure

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ zip_safe = False
1515
packages = find:
1616
include_package_data = True
1717
package_dir =
18-
=kraken
18+
=krkn
1919
# Add here dependencies of your project (semicolon/line-separated), e.g.
2020
install_requires = PyYAML
2121
# tests_require = pytest; pytest-cov

0 commit comments

Comments
 (0)