Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/scripts/workflow_rerun/log_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(self,

self.found_matching_error = False
self.found_error_ticket = None
self.matched_error_text = None

def _collect_errors_to_look_for(self) -> None:
with open(file=self._path_to_errors_file,
Expand Down Expand Up @@ -123,6 +124,7 @@ def analyze(self) -> None:
LOGGER.info(f'FOUND "{error["error_text"]}" ERROR IN {log_file["path"]}. TICKET: {error["ticket"]}')
self.found_matching_error = True
self.found_error_ticket = error['ticket']
self.matched_error_text = error['error_text']
return


Expand Down
86 changes: 48 additions & 38 deletions .github/scripts/workflow_rerun/rerunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,6 @@

def record_rerun_to_db(repository_full_name: str, run_id: int, ticket_number: int, rerunner_run_id: int, error_text: str):
"""Record the rerun event to the PostgreSQL database."""

if ticket_number is None:
LOGGER.error('No ticket number provided, cannot record rerun to database.')
raise ValueError('Ticket number is required to record rerun to database.')

db_username = os.environ.get('PGUSER')
db_password = os.environ.get('PGPASSWORD')
db_host = os.environ.get('PGHOST')
Expand Down Expand Up @@ -60,6 +55,53 @@ def record_rerun_to_db(repository_full_name: str, run_id: int, ticket_number: in
cursor.close()
conn.close()

def rerun_failed_jobs(repository_name: str, run_id: int, session: requests.Session):
    """Re-trigger the failed jobs of a workflow run via the GitHub REST API.

    PyGitHub does not expose the
    "/repos/{owner}/{repo}/actions/runs/RUN_ID/rerun-failed-jobs" endpoint,
    so we have to call it directly with requests.

    Args:
        repository_name: full repository name, e.g. "owner/repo".
        run_id: id of the workflow run whose failed jobs should be re-run.
        session: requests session used for the GitHub API call.

    Raises:
        requests.HTTPError: if GitHub rejects the re-run request.
    """
    response = session.post(
        url=f'https://api.github.com/repos/{repository_name}/actions/runs/{run_id}/rerun-failed-jobs',
        headers={'Authorization': f'Bearer {GITHUB_TOKEN}'}
    )

    response.raise_for_status()

    # BUG FIX: the original logged `run.html_url`, but `run` is not a parameter
    # of this function and is undefined here — the success-path log line would
    # raise NameError. Build the run URL from the arguments instead.
    LOGGER.info(f'RUN RETRIGGERED SUCCESSFULLY: https://github.com/{repository_name}/actions/runs/{run_id}')

def analyze_and_rerun(run, repository_name: str, run_id: int, rerunner_run_id: int,
                      errors_file: Path, is_dry_run: bool, session: requests.Session):
    """Collect the logs of `run`, scan them for known errors and, if one is
    found, re-trigger the failed jobs and record the rerun to the database.

    Args:
        run: workflow-run object (presumably a PyGitHub WorkflowRun — used for
            log collection and its `html_url`; TODO confirm against caller).
        repository_name: full repository name, e.g. "owner/repo".
        run_id: id of the workflow run being analyzed.
        rerunner_run_id: id of the rerunner's own workflow run (bookkeeping).
        errors_file: path to the JSON file describing known error patterns.
        is_dry_run: when True, analyze and log only — never re-trigger.
        session: requests session used for GitHub API calls.

    Raises:
        ValueError: a matching error was found and the run was re-triggered,
            but the ticket number or matched error text required for database
            recording is missing.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        logs_dir = Path(temp_dir)
        collect_logs_for_run(
            run=run,
            logs_dir=logs_dir,
            session=session
        )

        log_analyzer = LogAnalyzer(
            path_to_logs=logs_dir,
            path_to_errors_file=errors_file
        )
        log_analyzer.analyze()

        if log_analyzer.found_matching_error:
            LOGGER.info(f'FOUND MATCHING ERROR, RETRIGGERING {run.html_url}')
            if is_dry_run:
                # No placeholders — plain string instead of the original's f-string.
                LOGGER.info('RUNNING IN DRY RUN MODE, NOT RETRIGGERING, EXITING')
                return

            rerun_failed_jobs(repository_name, run_id, session)

            # Only record when both pieces of metadata are present; otherwise
            # fail loudly so the missing data is noticed (the run itself has
            # already been re-triggered at this point).
            if log_analyzer.found_error_ticket and log_analyzer.matched_error_text:
                record_rerun_to_db(repository_name, run_id,
                                   log_analyzer.found_error_ticket,
                                   rerunner_run_id,
                                   log_analyzer.matched_error_text)
            else:
                LOGGER.error('Cannot record to database: missing ticket_number or error_text')
                raise ValueError('Missing ticket_number or error_text for database recording.')
        else:
            LOGGER.info('NO ERROR WAS FOUND, NOT RETRIGGERING')

if __name__ == '__main__':
args = get_arguments()
run_id = args.run_id
Expand Down Expand Up @@ -91,36 +133,4 @@ def record_rerun_to_db(repository_full_name: str, run_id: int, ticket_number: in
LOGGER.info(f'THERE ARE {run.run_attempt} ATTEMPTS ALREADY. NOT CHECKING LOGS AND NOT RETRIGGERING. EXITING')
sys.exit(0)

with tempfile.TemporaryDirectory() as temp_dir:
logs_dir = Path(temp_dir)
collect_logs_for_run(
run=run,
logs_dir=logs_dir,
session=session
)

log_analyzer = LogAnalyzer(
path_to_logs=logs_dir,
path_to_errors_file=errors_file
)
log_analyzer.analyze()

if log_analyzer.found_matching_error:
LOGGER.info(f'FOUND MATCHING ERROR, RETRIGGERING {run.html_url}')
if is_dry_run:
LOGGER.info(f'RUNNING IN DRY RUN MODE, NOT RETRIGGERING, EXITING')
sys.exit(0)

# PyGitHub does not expose the "/repos/{owner}/{repo}/actions/runs/RUN_ID/rerun-failed-jobs" endpoint
# so we have to use requests
response = session.post(url=f'https://api.github.com/repos/{repository_name}/actions/runs/{run_id}/rerun-failed-jobs',
headers={'Authorization': f'Bearer {GITHUB_TOKEN}'})
response.raise_for_status()

LOGGER.info(f'RUN RETRIGGERED SUCCESSFULLY: {run.html_url}')
record_rerun_to_db(repository_name, run_id,
log_analyzer.found_error_ticket,
rerunner_run_id,
log_analyzer.matched_error_text)
else:
LOGGER.info(f'NO ERROR WAS FOUND, NOT RETRIGGERING')
analyze_and_rerun(run, repository_name, run_id, rerunner_run_id, errors_file, is_dry_run, session)
72 changes: 62 additions & 10 deletions .github/scripts/workflow_rerun/tests/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from datetime import datetime, timedelta
import os
import tempfile
import shutil
from unittest.mock import patch, MagicMock

import requests
from github import Github, Auth
Expand All @@ -18,6 +20,7 @@

from workflow_rerun.log_analyzer import LogAnalyzer
from workflow_rerun.log_collector import collect_logs_for_run
from workflow_rerun.rerunner import analyze_and_rerun


class IntegrationTest(unittest.TestCase):
Expand All @@ -31,6 +34,7 @@ def setUpClass(cls) -> None:
cls.errors_to_look_for_file = cls._cwd.parent.joinpath(
'errors_to_look_for.json'
)
cls.test_logs_with_error_dir = cls._cwd.joinpath('data', 'logs_with_error')

cls.session = requests.Session()
retry_strategy = Retry(
Expand All @@ -41,20 +45,23 @@ def setUpClass(cls) -> None:
)
cls.session.mount("https://github.com", HTTPAdapter(max_retries=retry_strategy))

cls.github = Github(auth=Auth.Token(token=os.environ.get('GITHUB_TOKEN')))
gh_repo = cls.github.get_repo(full_name_or_id='openvinotoolkit/openvino')
# Only create a GitHub client/run if token is available (otherwise tests should be offline)
cls.github = None
cls.wf_run = None
token = os.environ.get('GITHUB_TOKEN')
if token:
cls.github = Github(auth=Auth.Token(token=token))
gh_repo = cls.github.get_repo(full_name_or_id='openvinotoolkit/openvino')

# Even if we use "failure" for status we cannot guarantee logs containing any of the known error
# So these tests use the logs of the most recent failed pipeline
# Its "created_at" time should be within 60 days - the log retention window
oldest_allowed_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
cls.wf_run = gh_repo.get_workflow_runs(status='failure',
created=f">={oldest_allowed_date}")[0]
print(f'Workflow run for testing: {cls.wf_run}', flush=True)
oldest_allowed_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
cls.wf_run = gh_repo.get_workflow_runs(status='failure',
created=f">={oldest_allowed_date}")[0]
print(f'Workflow run for testing: {cls.wf_run}', flush=True)

def setUp(self):
print(f'\nIn test: "{self._testMethodName}"', flush=True)

@unittest.skipUnless(os.environ.get('GITHUB_TOKEN'), 'GITHUB_TOKEN not set; skipping live GitHub integration test')
def test_log_collection_and_analysis(self) -> None:
"""
Ensure logs collected by collect_logs_for_run are analyzed by LogAnalyzer
Expand All @@ -76,6 +83,51 @@ def test_log_collection_and_analysis(self) -> None:
if analyzer.found_matching_error:
print(f'Found matching error, ticket: {analyzer.found_error_ticket}')

    def test_analyze_and_rerun_records_to_db_offline(self) -> None:
        """Offline integration-style test: uses local logs and mocks all network/DB side effects."""

        def fake_collect_logs_for_run(*, run, logs_dir: Path, session):
            # Populate the temp logs_dir with our checked-in test logs.
            # Mirrors the directory structure under test_logs_with_error_dir
            # so LogAnalyzer sees the same layout a real download would produce.
            for p in self.test_logs_with_error_dir.rglob('*'):
                if p.is_file():
                    rel = p.relative_to(self.test_logs_with_error_dir)
                    dst = logs_dir / rel
                    dst.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(p, dst)

        # A stand-in for the PyGitHub run object: only html_url is read by
        # analyze_and_rerun's logging.
        mock_run = MagicMock()
        mock_run.html_url = 'https://github.com/example/repo/actions/runs/123'

        # No real HTTP calls are made: collect/rerun are patched below.
        mock_session = MagicMock()

        repository_name = 'openvinotoolkit/openvino'
        run_id = 123
        rerunner_run_id = 456

        # Patch every side-effecting collaborator in the rerunner module:
        # log download, the GitHub re-run POST, and the PostgreSQL insert.
        with patch('workflow_rerun.rerunner.collect_logs_for_run', side_effect=fake_collect_logs_for_run) as collect_mock, \
                patch('workflow_rerun.rerunner.rerun_failed_jobs') as rerun_mock, \
                patch('workflow_rerun.rerunner.record_rerun_to_db') as record_mock:
            analyze_and_rerun(
                run=mock_run,
                repository_name=repository_name,
                run_id=run_id,
                rerunner_run_id=rerunner_run_id,
                errors_file=self.errors_to_look_for_file,
                is_dry_run=False,
                session=mock_session
            )

        # The local logs contain a known error, so the full pipeline must run:
        # collect -> rerun -> record.
        collect_mock.assert_called_once()
        rerun_mock.assert_called_once_with(repository_name, run_id, mock_session)
        record_mock.assert_called_once()

        # Basic sanity on record_rerun_to_db args
        # (positional order: repository, run_id, ticket, rerunner_run_id, error_text;
        # ticket/error_text come from the analyzer so only the fixed args are pinned).
        args = record_mock.call_args[0]
        self.assertEqual(args[0], repository_name)
        self.assertEqual(args[1], run_id)
        self.assertEqual(args[3], rerunner_run_id)

@classmethod
def tearDownClass(cls) -> None:
cls.github.close()
if cls.github is not None:
cls.github.close()
3 changes: 3 additions & 0 deletions .github/scripts/workflow_rerun/tests/log_analyzer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def test_analyzer_with_error(self) -> None:
)
analyzer.analyze()
self.assertTrue(analyzer.found_matching_error)
self.assertEqual(analyzer.found_error_ticket, 130955)
self.assertEqual(analyzer.matched_error_text,
'Network is unreachable')

def test_analyzer_wo_error(self) -> None:
"""
Expand Down
Loading