Skip to content

Commit 1adb7d0

Browse files
committed
Refactor alert handling in monitoring scripts to improve reliability
- Updated multiple monitoring scripts to send alerts only when the `send_alert` function is available, so that a script does not fail when the function is unavailable or the alert call itself fails.
- Enhanced logging for various checks, including server resources, ETL job execution, and WMS service availability, to improve alert handling and robustness.
- Cleaned up code for better readability and consistency across the scripts, contributing to overall maintainability.
1 parent 109db73 commit 1adb7d0

11 files changed

+393
-183
lines changed

bin/monitor/checkPlanetNotes.sh

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,27 +98,33 @@ run_planet_check() {
9898
# Check exit code
9999
if [[ ${exit_code} -eq 0 ]]; then
100100
log_info "${COMPONENT}: Planet Notes check passed (duration: ${duration}s)"
101-
record_metric "${COMPONENT}" "planet_check_status" "1" "component=ingestion,check=processCheckPlanetNotes"
102-
record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes"
101+
# Record metrics (don't fail if record_metric is not available or fails)
102+
if command -v record_metric >/dev/null 2>&1; then
103+
record_metric "${COMPONENT}" "planet_check_status" "1" "component=ingestion,check=processCheckPlanetNotes" || true
104+
record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes" || true
105+
fi
103106

104107
# Check planet check duration threshold
105108
local planet_duration_threshold="${INGESTION_PLANET_CHECK_DURATION_THRESHOLD:-600}"
106109
if [[ ${duration} -gt ${planet_duration_threshold} ]]; then
107110
log_warning "${COMPONENT}: Planet check duration (${duration}s) exceeds threshold (${planet_duration_threshold}s)"
108-
# Only send alert if send_alert function is available and not mocked
109-
if command -v send_alert >/dev/null 2>&1 && [[ "${TEST_MODE:-false}" != "true" ]]; then
110-
send_alert "WARNING" "${COMPONENT}" "Planet Notes check took too long: ${duration}s (threshold: ${planet_duration_threshold}s)"
111+
# Only send alert if send_alert function is available
112+
if command -v send_alert >/dev/null 2>&1; then
113+
send_alert "${COMPONENT}" "WARNING" "planet_check_duration" "Planet Notes check took too long: ${duration}s (threshold: ${planet_duration_threshold}s)" || true
111114
fi
112115
fi
113116

114117
return 0
115118
else
116119
log_error "${COMPONENT}: Planet Notes check failed (exit_code: ${exit_code}, duration: ${duration}s)"
117-
record_metric "${COMPONENT}" "planet_check_status" "0" "component=ingestion,check=processCheckPlanetNotes"
118-
record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes"
119-
# Only send alert if send_alert function is available and not mocked
120-
if command -v send_alert >/dev/null 2>&1 && [[ "${TEST_MODE:-false}" != "true" ]]; then
121-
send_alert "ERROR" "${COMPONENT}" "Planet Notes check failed: exit_code=${exit_code}"
120+
# Record metrics (don't fail if record_metric is not available or fails)
121+
if command -v record_metric >/dev/null 2>&1; then
122+
record_metric "${COMPONENT}" "planet_check_status" "0" "component=ingestion,check=processCheckPlanetNotes" || true
123+
record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes" || true
124+
fi
125+
# Only send alert if send_alert function is available
126+
if command -v send_alert >/dev/null 2>&1; then
127+
send_alert "${COMPONENT}" "ERROR" "planet_check_failed" "Planet Notes check failed: exit_code=${exit_code}" || true
122128
fi
123129
return 1
124130
fi

bin/monitor/monitorAPI.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ check_api_availability() {
127127
alert_message="API returned HTTP ${http_code} (URL: ${api_url})"
128128
fi
129129

130-
send_alert "${COMPONENT}" "WARNING" "api_unavailable" "${alert_message}"
130+
if command -v send_alert >/dev/null 2>&1; then
131+
send_alert "${COMPONENT}" "WARNING" "api_unavailable" "${alert_message}" || true
132+
fi
131133
return 1
132134
fi
133135
else

bin/monitor/monitorAnalytics.sh

Lines changed: 102 additions & 44 deletions
Large diffs are not rendered by default.

bin/monitor/monitorData.sh

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,13 @@ check_backup_freshness() {
112112
local backup_count=0
113113
local backup_files=()
114114

115-
if [[ ! -d "${backup_dir}" ]]; then
116-
log_warning "${COMPONENT}: Backup directory does not exist: ${backup_dir}"
117-
send_alert "${COMPONENT}" "WARNING" "backup_directory_missing" "Backup directory does not exist: ${backup_dir}"
118-
return 1
115+
if [[ ! -d "${backup_dir}" ]]; then
116+
log_warning "${COMPONENT}: Backup directory does not exist: ${backup_dir}"
117+
if command -v send_alert >/dev/null 2>&1; then
118+
send_alert "${COMPONENT}" "WARNING" "backup_directory_missing" "Backup directory does not exist: ${backup_dir}" || true
119119
fi
120+
return 1
121+
fi
120122

121123
# Find backup files (common patterns: *.sql, *.sql.gz, *.dump, *.tar.gz, *.backup)
122124
while IFS= read -r -d '' file; do
@@ -125,11 +127,13 @@ check_backup_freshness() {
125127

126128
backup_count=${#backup_files[@]}
127129

128-
if [[ ${backup_count} -eq 0 ]]; then
129-
log_warning "${COMPONENT}: No backup files found in ${backup_dir}"
130-
send_alert "${COMPONENT}" "WARNING" "no_backups_found" "No backup files found in backup directory: ${backup_dir}"
131-
return 1
130+
if [[ ${backup_count} -eq 0 ]]; then
131+
log_warning "${COMPONENT}: No backup files found in ${backup_dir}"
132+
if command -v send_alert >/dev/null 2>&1; then
133+
send_alert "${COMPONENT}" "WARNING" "no_backups_found" "No backup files found in backup directory: ${backup_dir}" || true
132134
fi
135+
return 1
136+
fi
133137

134138
# Calculate ages of backups
135139
local oldest_backup_time=${current_time}
@@ -160,12 +164,14 @@ check_backup_freshness() {
160164

161165
log_info "${COMPONENT}: Backup freshness - Count: ${backup_count}, Newest: ${newest_backup_age}s, Oldest: ${oldest_backup_age}s (threshold: ${threshold}s)"
162166

163-
# Alert if newest backup is too old
164-
if [[ ${newest_backup_age} -gt ${threshold} ]]; then
165-
log_warning "${COMPONENT}: Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s)"
166-
send_alert "${COMPONENT}" "WARNING" "backup_freshness_exceeded" "Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s, directory: ${backup_dir})"
167-
return 1
167+
# Alert if newest backup is too old
168+
if [[ ${newest_backup_age} -gt ${threshold} ]]; then
169+
log_warning "${COMPONENT}: Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s)"
170+
if command -v send_alert >/dev/null 2>&1; then
171+
send_alert "${COMPONENT}" "WARNING" "backup_freshness_exceeded" "Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s, directory: ${backup_dir})" || true
168172
fi
173+
return 1
174+
fi
169175

170176
return 0
171177
}
@@ -243,12 +249,14 @@ check_repository_sync_status() {
243249

244250
log_info "${COMPONENT}: Repository sync status - Status: ${sync_status}, Behind: ${behind_count}, Ahead: ${ahead_count}"
245251

246-
# Alert if repository is behind
247-
if [[ "${sync_status}" == "behind" ]] && [[ ${behind_count} -gt 0 ]]; then
248-
log_warning "${COMPONENT}: Repository is ${behind_count} commits behind remote"
249-
send_alert "${COMPONENT}" "WARNING" "repo_sync_behind" "Repository is ${behind_count} commits behind remote (repo: ${repo_path})"
250-
return 1
252+
# Alert if repository is behind
253+
if [[ "${sync_status}" == "behind" ]] && [[ ${behind_count} -gt 0 ]]; then
254+
log_warning "${COMPONENT}: Repository is ${behind_count} commits behind remote"
255+
if command -v send_alert >/dev/null 2>&1; then
256+
send_alert "${COMPONENT}" "WARNING" "repo_sync_behind" "Repository is ${behind_count} commits behind remote (repo: ${repo_path})" || true
251257
fi
258+
return 1
259+
fi
252260

253261
return 0
254262
}
@@ -410,19 +418,23 @@ check_storage_availability() {
410418
fi
411419

412420
log_warning "${COMPONENT}: Disk usage (${disk_usage}%) exceeds threshold (${disk_usage_threshold}%)"
413-
send_alert "${COMPONENT}" "${alert_level}" "storage_disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_usage_threshold}%, path: ${storage_path})"
421+
if command -v send_alert >/dev/null 2>&1; then
422+
send_alert "${COMPONENT}" "${alert_level}" "storage_disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_usage_threshold}%, path: ${storage_path})" || true
423+
fi
414424

415425
if [[ "${alert_level}" == "CRITICAL" ]]; then
416426
return 1
417427
fi
418428
fi
419429

420-
# Check if storage path is writable
421-
if [[ ! -w "${storage_path}" ]]; then
422-
log_warning "${COMPONENT}: Storage path is not writable: ${storage_path}"
423-
send_alert "${COMPONENT}" "CRITICAL" "storage_not_writable" "Storage path is not writable: ${storage_path}"
424-
return 1
430+
# Check if storage path is writable
431+
if [[ ! -w "${storage_path}" ]]; then
432+
log_warning "${COMPONENT}: Storage path is not writable: ${storage_path}"
433+
if command -v send_alert >/dev/null 2>&1; then
434+
send_alert "${COMPONENT}" "CRITICAL" "storage_not_writable" "Storage path is not writable: ${storage_path}" || true
425435
fi
436+
return 1
437+
fi
426438

427439
return 0
428440
}

bin/monitor/monitorInfrastructure.sh

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,9 @@ check_server_resources() {
298298
alert_level="CRITICAL"
299299
fi
300300
log_warning "${COMPONENT}: CPU usage (${cpu_usage}%) exceeds threshold (${cpu_threshold}%)"
301-
send_alert "${COMPONENT}" "${alert_level}" "cpu_usage_high" "CPU usage (${cpu_usage}%) exceeds threshold (${cpu_threshold}%)"
301+
if command -v send_alert >/dev/null 2>&1; then
302+
send_alert "${COMPONENT}" "${alert_level}" "cpu_usage_high" "CPU usage (${cpu_usage}%) exceeds threshold (${cpu_threshold}%)" || true
303+
fi
302304
overall_result=1
303305
fi
304306

@@ -309,7 +311,9 @@ check_server_resources() {
309311
alert_level="CRITICAL"
310312
fi
311313
log_warning "${COMPONENT}: Memory usage (${memory_usage}%) exceeds threshold (${memory_threshold}%)"
312-
send_alert "${COMPONENT}" "${alert_level}" "memory_usage_high" "Memory usage (${memory_usage}%) exceeds threshold (${memory_threshold}%)"
314+
if command -v send_alert >/dev/null 2>&1; then
315+
send_alert "${COMPONENT}" "${alert_level}" "memory_usage_high" "Memory usage (${memory_usage}%) exceeds threshold (${memory_threshold}%)" || true
316+
fi
313317
overall_result=1
314318
fi
315319

@@ -320,7 +324,9 @@ check_server_resources() {
320324
alert_level="CRITICAL"
321325
fi
322326
log_warning "${COMPONENT}: Disk usage (${disk_usage}%) exceeds threshold (${disk_threshold}%)"
323-
send_alert "${COMPONENT}" "${alert_level}" "disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_threshold}%)"
327+
if command -v send_alert >/dev/null 2>&1; then
328+
send_alert "${COMPONENT}" "${alert_level}" "disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_threshold}%)" || true
329+
fi
324330
overall_result=1
325331
fi
326332

@@ -409,7 +415,9 @@ check_advanced_system_metrics() {
409415
if [[ "${comparison_result}" == "1" ]]; then
410416
local load_multiplier="${INFRASTRUCTURE_LOAD_THRESHOLD_MULTIPLIER:-2}"
411417
log_warning "${COMPONENT}: Load average (${load_1min}) exceeds threshold (${load_threshold} = ${load_multiplier}x ${cpu_count} CPUs)"
412-
send_alert "${COMPONENT}" "WARNING" "system_load_high" "Load average (${load_1min}) exceeds threshold (${load_threshold})"
418+
if command -v send_alert >/dev/null 2>&1; then
419+
send_alert "${COMPONENT}" "WARNING" "system_load_high" "Load average (${load_1min}) exceeds threshold (${load_threshold})" || true
420+
fi
413421
fi
414422
fi
415423

@@ -431,7 +439,9 @@ check_advanced_system_metrics() {
431439

432440
if [[ ${swap_usage_int} -gt ${swap_threshold} ]]; then
433441
log_warning "${COMPONENT}: Swap usage (${swap_usage}%) exceeds threshold (${swap_threshold}%)"
434-
send_alert "${COMPONENT}" "WARNING" "system_swap_high" "Swap usage (${swap_usage}%) exceeds threshold (${swap_threshold}%)"
442+
if command -v send_alert >/dev/null 2>&1; then
443+
send_alert "${COMPONENT}" "WARNING" "system_swap_high" "Swap usage (${swap_usage}%) exceeds threshold (${swap_threshold}%)" || true
444+
fi
435445
fi
436446
fi
437447

@@ -511,7 +521,9 @@ check_network_connectivity() {
511521
# Alert if connectivity failures
512522
if [[ ${connectivity_failures} -gt 0 ]]; then
513523
log_warning "${COMPONENT}: Network connectivity check found ${connectivity_failures} failure(s)"
514-
send_alert "${COMPONENT}" "WARNING" "network_connectivity_failure" "Network connectivity check found ${connectivity_failures} failure(s) out of ${total_checks} hosts checked"
524+
if command -v send_alert >/dev/null 2>&1; then
525+
send_alert "${COMPONENT}" "WARNING" "network_connectivity_failure" "Network connectivity check found ${connectivity_failures} failure(s) out of ${total_checks} hosts checked" || true
526+
fi
515527
return 1
516528
fi
517529

@@ -529,12 +541,14 @@ check_database_server_health() {
529541
return 0
530542
fi
531543

532-
# Check database connection
533-
if ! check_database_connection; then
534-
log_error "${COMPONENT}: Database connection failed"
535-
send_alert "${COMPONENT}" "CRITICAL" "database_connection_failed" "Database server connection failed"
536-
return 1
544+
# Check database connection
545+
if ! check_database_connection; then
546+
log_error "${COMPONENT}: Database connection failed"
547+
if command -v send_alert >/dev/null 2>&1; then
548+
send_alert "${COMPONENT}" "CRITICAL" "database_connection_failed" "Database server connection failed" || true
537549
fi
550+
return 1
551+
fi
538552

539553
# Get database server status
540554
local db_version=""
@@ -587,7 +601,9 @@ check_database_server_health() {
587601
local connection_usage_percent=$(((active_connections * 100) / max_connections))
588602
if [[ ${connection_usage_percent} -gt 80 ]]; then
589603
log_warning "${COMPONENT}: Database connection usage (${connection_usage_percent}%) is high"
590-
send_alert "${COMPONENT}" "WARNING" "database_connections_high" "Database connection usage (${connection_usage_percent}%, ${active_connections}/${max_connections}) is high"
604+
if command -v send_alert >/dev/null 2>&1; then
605+
send_alert "${COMPONENT}" "WARNING" "database_connections_high" "Database connection usage (${connection_usage_percent}%, ${active_connections}/${max_connections}) is high" || true
606+
fi
591607
return 1
592608
fi
593609
fi
@@ -661,7 +677,9 @@ check_service_dependencies() {
661677
# Alert if service failures
662678
if [[ ${service_failures} -gt 0 ]]; then
663679
log_warning "${COMPONENT}: Service dependencies check found ${service_failures} failure(s)"
664-
send_alert "${COMPONENT}" "WARNING" "service_dependency_failure" "Service dependencies check found ${service_failures} failure(s) out of ${total_services} services checked"
680+
if command -v send_alert >/dev/null 2>&1; then
681+
send_alert "${COMPONENT}" "WARNING" "service_dependency_failure" "Service dependencies check found ${service_failures} failure(s) out of ${total_services} services checked" || true
682+
fi
665683
return 1
666684
fi
667685

0 commit comments

Comments
 (0)