OSM-Notes
diff --git a/bin/monitor/checkPlanetNotes.sh b/bin/monitor/checkPlanetNotes.sh
Lines changed: 16 additions & 10 deletions
diff --git a/bin/monitor/monitorAPI.sh b/bin/monitor/monitorAPI.sh
Lines changed: 3 additions & 1 deletion
diff --git a/bin/monitor/monitorAnalytics.sh b/bin/monitor/monitorAnalytics.sh
Lines changed: 102 additions & 44 deletions
diff --git a/bin/monitor/monitorData.sh b/bin/monitor/monitorData.sh
Lines changed: 36 additions & 24 deletions
diff --git a/bin/monitor/monitorInfrastructure.sh b/bin/monitor/monitorInfrastructure.sh
Lines changed: 31 additions & 13 deletions
@@ -98,27 +98,33 @@ run_planet_check() {
  # Check exit code
  if [[ ${exit_code} -eq 0 ]]; then
   log_info "${COMPONENT}: Planet Notes check passed (duration: ${duration}s)"
-  record_metric "${COMPONENT}" "planet_check_status" "1" "component=ingestion,check=processCheckPlanetNotes"
-  record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes"
+  # Record metrics (don't fail if record_metric is not available or fails)
+  if command -v record_metric >/dev/null 2>&1; then
+   record_metric "${COMPONENT}" "planet_check_status" "1" "component=ingestion,check=processCheckPlanetNotes" || true
+   record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes" || true
+  fi
 
   # Check planet check duration threshold
   local planet_duration_threshold="${INGESTION_PLANET_CHECK_DURATION_THRESHOLD:-600}"
   if [[ ${duration} -gt ${planet_duration_threshold} ]]; then
    log_warning "${COMPONENT}: Planet check duration (${duration}s) exceeds threshold (${planet_duration_threshold}s)"
-   # Only send alert if send_alert function is available and not mocked
-   if command -v send_alert >/dev/null 2>&1 && [[ "${TEST_MODE:-false}" != "true" ]]; then
-    send_alert "WARNING" "${COMPONENT}" "Planet Notes check took too long: ${duration}s (threshold: ${planet_duration_threshold}s)"
+   # Only send alert if send_alert function is available
+   if command -v send_alert >/dev/null 2>&1; then
+    send_alert "${COMPONENT}" "WARNING" "planet_check_duration" "Planet Notes check took too long: ${duration}s (threshold: ${planet_duration_threshold}s)" || true
    fi
   fi
 
   return 0
  else
   log_error "${COMPONENT}: Planet Notes check failed (exit_code: ${exit_code}, duration: ${duration}s)"
-  record_metric "${COMPONENT}" "planet_check_status" "0" "component=ingestion,check=processCheckPlanetNotes"
-  record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes"
-  # Only send alert if send_alert function is available and not mocked
-  if command -v send_alert >/dev/null 2>&1 && [[ "${TEST_MODE:-false}" != "true" ]]; then
-   send_alert "ERROR" "${COMPONENT}" "Planet Notes check failed: exit_code=${exit_code}"
+  # Record metrics (don't fail if record_metric is not available or fails)
+  if command -v record_metric >/dev/null 2>&1; then
+   record_metric "${COMPONENT}" "planet_check_status" "0" "component=ingestion,check=processCheckPlanetNotes" || true
+   record_metric "${COMPONENT}" "planet_check_duration" "${duration}" "component=ingestion,check=processCheckPlanetNotes" || true
+  fi
+  # Only send alert if send_alert function is available
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "ERROR" "planet_check_failed" "Planet Notes check failed: exit_code=${exit_code}" || true
   fi
   return 1
  fi
 
@@ -127,7 +127,9 @@ check_api_availability() {
     alert_message="API returned HTTP ${http_code} (URL: ${api_url})"
    fi
 
-   send_alert "${COMPONENT}" "WARNING" "api_unavailable" "${alert_message}"
+   if command -v send_alert >/dev/null 2>&1; then
+    send_alert "${COMPONENT}" "WARNING" "api_unavailable" "${alert_message}" || true
+   fi
    return 1
   fi
  else
 
@@ -112,11 +112,13 @@ check_backup_freshness() {
  local backup_count=0
  local backup_files=()
 
- if [[ ! -d "${backup_dir}" ]]; then
-  log_warning "${COMPONENT}: Backup directory does not exist: ${backup_dir}"
-  send_alert "${COMPONENT}" "WARNING" "backup_directory_missing" "Backup directory does not exist: ${backup_dir}"
-  return 1
+if [[ ! -d "${backup_dir}" ]]; then
+ log_warning "${COMPONENT}: Backup directory does not exist: ${backup_dir}"
+ if command -v send_alert >/dev/null 2>&1; then
+  send_alert "${COMPONENT}" "WARNING" "backup_directory_missing" "Backup directory does not exist: ${backup_dir}" || true
  fi
+ return 1
+fi
 
  # Find backup files (common patterns: *.sql, *.sql.gz, *.dump, *.tar.gz, *.backup)
  while IFS= read -r -d '' file; do
@@ -125,11 +127,13 @@ check_backup_freshness() {
 
  backup_count=${#backup_files[@]}
 
- if [[ ${backup_count} -eq 0 ]]; then
-  log_warning "${COMPONENT}: No backup files found in ${backup_dir}"
-  send_alert "${COMPONENT}" "WARNING" "no_backups_found" "No backup files found in backup directory: ${backup_dir}"
-  return 1
+if [[ ${backup_count} -eq 0 ]]; then
+ log_warning "${COMPONENT}: No backup files found in ${backup_dir}"
+ if command -v send_alert >/dev/null 2>&1; then
+  send_alert "${COMPONENT}" "WARNING" "no_backups_found" "No backup files found in backup directory: ${backup_dir}" || true
  fi
+ return 1
+fi
 
  # Calculate ages of backups
  local oldest_backup_time=${current_time}
@@ -160,12 +164,14 @@ check_backup_freshness() {
 
  log_info "${COMPONENT}: Backup freshness - Count: ${backup_count}, Newest: ${newest_backup_age}s, Oldest: ${oldest_backup_age}s (threshold: ${threshold}s)"
 
- # Alert if newest backup is too old
- if [[ ${newest_backup_age} -gt ${threshold} ]]; then
-  log_warning "${COMPONENT}: Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s)"
-  send_alert "${COMPONENT}" "WARNING" "backup_freshness_exceeded" "Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s, directory: ${backup_dir})"
-  return 1
+# Alert if newest backup is too old
+if [[ ${newest_backup_age} -gt ${threshold} ]]; then
+ log_warning "${COMPONENT}: Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s)"
+ if command -v send_alert >/dev/null 2>&1; then
+  send_alert "${COMPONENT}" "WARNING" "backup_freshness_exceeded" "Newest backup is ${newest_backup_age}s old (threshold: ${threshold}s, directory: ${backup_dir})" || true
  fi
+ return 1
+fi
 
  return 0
 }
@@ -243,12 +249,14 @@ check_repository_sync_status() {
 
  log_info "${COMPONENT}: Repository sync status - Status: ${sync_status}, Behind: ${behind_count}, Ahead: ${ahead_count}"
 
- # Alert if repository is behind
- if [[ "${sync_status}" == "behind" ]] && [[ ${behind_count} -gt 0 ]]; then
-  log_warning "${COMPONENT}: Repository is ${behind_count} commits behind remote"
-  send_alert "${COMPONENT}" "WARNING" "repo_sync_behind" "Repository is ${behind_count} commits behind remote (repo: ${repo_path})"
-  return 1
+# Alert if repository is behind
+if [[ "${sync_status}" == "behind" ]] && [[ ${behind_count} -gt 0 ]]; then
+ log_warning "${COMPONENT}: Repository is ${behind_count} commits behind remote"
+ if command -v send_alert >/dev/null 2>&1; then
+  send_alert "${COMPONENT}" "WARNING" "repo_sync_behind" "Repository is ${behind_count} commits behind remote (repo: ${repo_path})" || true
  fi
+ return 1
+fi
 
  return 0
 }
@@ -410,19 +418,23 @@ check_storage_availability() {
   fi
 
   log_warning "${COMPONENT}: Disk usage (${disk_usage}%) exceeds threshold (${disk_usage_threshold}%)"
-  send_alert "${COMPONENT}" "${alert_level}" "storage_disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_usage_threshold}%, path: ${storage_path})"
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "${alert_level}" "storage_disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_usage_threshold}%, path: ${storage_path})" || true
+  fi
 
   if [[ "${alert_level}" == "CRITICAL" ]]; then
    return 1
   fi
  fi
 
- # Check if storage path is writable
- if [[ ! -w "${storage_path}" ]]; then
-  log_warning "${COMPONENT}: Storage path is not writable: ${storage_path}"
-  send_alert "${COMPONENT}" "CRITICAL" "storage_not_writable" "Storage path is not writable: ${storage_path}"
-  return 1
+# Check if storage path is writable
+if [[ ! -w "${storage_path}" ]]; then
+ log_warning "${COMPONENT}: Storage path is not writable: ${storage_path}"
+ if command -v send_alert >/dev/null 2>&1; then
+  send_alert "${COMPONENT}" "CRITICAL" "storage_not_writable" "Storage path is not writable: ${storage_path}" || true
  fi
+ return 1
+fi
 
  return 0
 }
 
@@ -298,7 +298,9 @@ check_server_resources() {
    alert_level="CRITICAL"
   fi
   log_warning "${COMPONENT}: CPU usage (${cpu_usage}%) exceeds threshold (${cpu_threshold}%)"
-  send_alert "${COMPONENT}" "${alert_level}" "cpu_usage_high" "CPU usage (${cpu_usage}%) exceeds threshold (${cpu_threshold}%)"
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "${alert_level}" "cpu_usage_high" "CPU usage (${cpu_usage}%) exceeds threshold (${cpu_threshold}%)" || true
+  fi
   overall_result=1
  fi
 
@@ -309,7 +311,9 @@ check_server_resources() {
    alert_level="CRITICAL"
   fi
   log_warning "${COMPONENT}: Memory usage (${memory_usage}%) exceeds threshold (${memory_threshold}%)"
-  send_alert "${COMPONENT}" "${alert_level}" "memory_usage_high" "Memory usage (${memory_usage}%) exceeds threshold (${memory_threshold}%)"
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "${alert_level}" "memory_usage_high" "Memory usage (${memory_usage}%) exceeds threshold (${memory_threshold}%)" || true
+  fi
   overall_result=1
  fi
 
@@ -320,7 +324,9 @@ check_server_resources() {
    alert_level="CRITICAL"
   fi
   log_warning "${COMPONENT}: Disk usage (${disk_usage}%) exceeds threshold (${disk_threshold}%)"
-  send_alert "${COMPONENT}" "${alert_level}" "disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_threshold}%)"
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "${alert_level}" "disk_usage_high" "Disk usage (${disk_usage}%) exceeds threshold (${disk_threshold}%)" || true
+  fi
   overall_result=1
  fi
 
@@ -409,7 +415,9 @@ check_advanced_system_metrics() {
   if [[ "${comparison_result}" == "1" ]]; then
    local load_multiplier="${INFRASTRUCTURE_LOAD_THRESHOLD_MULTIPLIER:-2}"
    log_warning "${COMPONENT}: Load average (${load_1min}) exceeds threshold (${load_threshold} = ${load_multiplier}x ${cpu_count} CPUs)"
-   send_alert "${COMPONENT}" "WARNING" "system_load_high" "Load average (${load_1min}) exceeds threshold (${load_threshold})"
+   if command -v send_alert >/dev/null 2>&1; then
+    send_alert "${COMPONENT}" "WARNING" "system_load_high" "Load average (${load_1min}) exceeds threshold (${load_threshold})" || true
+   fi
   fi
  fi
 
@@ -431,7 +439,9 @@ check_advanced_system_metrics() {
 
   if [[ ${swap_usage_int} -gt ${swap_threshold} ]]; then
    log_warning "${COMPONENT}: Swap usage (${swap_usage}%) exceeds threshold (${swap_threshold}%)"
-   send_alert "${COMPONENT}" "WARNING" "system_swap_high" "Swap usage (${swap_usage}%) exceeds threshold (${swap_threshold}%)"
+   if command -v send_alert >/dev/null 2>&1; then
+    send_alert "${COMPONENT}" "WARNING" "system_swap_high" "Swap usage (${swap_usage}%) exceeds threshold (${swap_threshold}%)" || true
+   fi
   fi
  fi
 
@@ -511,7 +521,9 @@ check_network_connectivity() {
  # Alert if connectivity failures
  if [[ ${connectivity_failures} -gt 0 ]]; then
   log_warning "${COMPONENT}: Network connectivity check found ${connectivity_failures} failure(s)"
-  send_alert "${COMPONENT}" "WARNING" "network_connectivity_failure" "Network connectivity check found ${connectivity_failures} failure(s) out of ${total_checks} hosts checked"
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "WARNING" "network_connectivity_failure" "Network connectivity check found ${connectivity_failures} failure(s) out of ${total_checks} hosts checked" || true
+  fi
   return 1
  fi
 
@@ -529,12 +541,14 @@ check_database_server_health() {
   return 0
  fi
 
- # Check database connection
- if ! check_database_connection; then
-  log_error "${COMPONENT}: Database connection failed"
-  send_alert "${COMPONENT}" "CRITICAL" "database_connection_failed" "Database server connection failed"
-  return 1
+# Check database connection
+if ! check_database_connection; then
+ log_error "${COMPONENT}: Database connection failed"
+ if command -v send_alert >/dev/null 2>&1; then
+  send_alert "${COMPONENT}" "CRITICAL" "database_connection_failed" "Database server connection failed" || true
  fi
+ return 1
+fi
 
  # Get database server status
  local db_version=""
@@ -587,7 +601,9 @@ check_database_server_health() {
   local connection_usage_percent=$(((active_connections * 100) / max_connections))
   if [[ ${connection_usage_percent} -gt 80 ]]; then
    log_warning "${COMPONENT}: Database connection usage (${connection_usage_percent}%) is high"
-   send_alert "${COMPONENT}" "WARNING" "database_connections_high" "Database connection usage (${connection_usage_percent}%, ${active_connections}/${max_connections}) is high"
+   if command -v send_alert >/dev/null 2>&1; then
+    send_alert "${COMPONENT}" "WARNING" "database_connections_high" "Database connection usage (${connection_usage_percent}%, ${active_connections}/${max_connections}) is high" || true
+   fi
    return 1
   fi
  fi
@@ -661,7 +677,9 @@ check_service_dependencies() {
  # Alert if service failures
  if [[ ${service_failures} -gt 0 ]]; then
   log_warning "${COMPONENT}: Service dependencies check found ${service_failures} failure(s)"
-  send_alert "${COMPONENT}" "WARNING" "service_dependency_failure" "Service dependencies check found ${service_failures} failure(s) out of ${total_services} services checked"
+  if command -v send_alert >/dev/null 2>&1; then
+   send_alert "${COMPONENT}" "WARNING" "service_dependency_failure" "Service dependencies check found ${service_failures} failure(s) out of ${total_services} services checked" || true
+  fi
   return 1
  fi