I’ve enhanced the script with several important improvements:
Error Handling:
- Added proper error checking for VM existence
- Implemented retry logic with configurable maximum attempts
- Added cooldown period between restart attempts
Logging:
- Added comprehensive logging with timestamps
- Logs are stored in /boot/logs/vm_watchdog.log
- Both console and file logging for better monitoring
Process Management:
- Added lock file mechanism to prevent multiple instances
- Proper cleanup on script exit
- Trap handler for clean script termination
Configuration:
- Moved all configurable variables to the top
- Added comments for easy customization
- Made important variables readonly to prevent accidental modification
Code Structure:
- Modular design with separate functions for different tasks
- Better variable naming and consistency
- Added comprehensive comments for maintainability
To use this script:
- Install User Scripts plugin from Community Apps
- Navigate to Settings > User Utilities > User Scripts
- Create a new script and paste this code
- Modify the configuration variables at the top as needed:
  - VM_NAME: Name of your VM
  - MAX_RESTART_ATTEMPTS: Maximum number of restart attempts
  - RESTART_COOLDOWN: Time to wait between restart attempts (in seconds)
  - LOG_FILE: Location of the log file
- Set up the schedule using cron notation (e.g., */3 * * * * for every 3 minutes)
The script will now:
- Check if the VM exists before attempting any operations
- Try to restart the VM multiple times if it fails
- Wait between attempts to avoid rapid cycling
- Log all actions with timestamps
- Prevent multiple instances from running simultaneously
- Clean up after itself properly
You can monitor the script’s activity by checking the log file at /boot/logs/vm_watchdog.log.
#!/bin/bash
# ===========================================
# Unraid VM Watchdog Script
# This script monitors and automatically restarts specified VMs if they crash
# ===========================================
# Configuration
# Edit the values below to match your setup. Every setting is declared
# read-only so the rest of the script cannot overwrite it by accident.

# Name of the VM to monitor
declare -r VM_NAME="Windows 11"

# Maximum number of restart attempts
declare -r MAX_RESTART_ATTEMPTS=3

# Cooldown period between restarts (in seconds)
declare -r RESTART_COOLDOWN=300

# Log file location
# NOTE(review): on Unraid /boot is presumably the USB flash device; confirm
# that writing a log entry on every scheduled run is acceptable there.
declare -r LOG_FILE="/boot/logs/vm_watchdog.log"

# Lock file to prevent multiple instances
declare -r LOCK_FILE="/tmp/vm_watchdog.lock"
# Record a message, prefixed with the current local time, in the log file
# and on the console.
# Globals:   LOG_FILE (read) - file the entry is appended to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" appended to LOG_FILE and
#            written to stdout
log_message() {
  local text="$1"
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Build the entry once so the file and the console get the same timestamp.
  local entry="[$stamp] $text"

  printf '%s\n' "$entry" >> "$LOG_FILE"
  printf '%s\n' "$entry"
}
# Acquire the single-instance lock, or stop if a live instance holds it.
# Globals:   LOCK_FILE (read) - path of the PID lock file
# Outputs:   logs via log_message when another instance is running
# Returns:   0 once LOCK_FILE contains this script's PID;
#            exits the script with status 1 if a live process owns the lock
check_lock() {
  local holder=""

  # With noclobber the redirection fails when the file already exists, so the
  # lock is created in one atomic step instead of a separate test-then-write.
  # The option is set in a subshell so it does not leak into the script.
  if ( set -o noclobber; printf '%s\n' "$$" > "$LOCK_FILE" ) 2> /dev/null; then
    return 0
  fi

  # The lock already exists: read the PID recorded in it (first line only).
  if [[ -f "$LOCK_FILE" ]]; then
    { read -r holder < "$LOCK_FILE"; } 2> /dev/null || true
  fi

  # Only a numeric PID that still names a live process counts as a holder.
  # `kill -0` delivers no signal; it only reports whether the process exists
  # and may be signalled by this user.
  if [[ "$holder" =~ ^[0-9]+$ ]] && kill -0 "$holder" 2> /dev/null; then
    log_message "Another instance is already running (PID: $holder)"
    exit 1
  fi

  # Stale lock (owner gone, or contents unusable): take it over.
  rm -f "$LOCK_FILE"
  printf '%s\n' "$$" > "$LOCK_FILE"
}
# Release the instance lock, but only if this process owns it, then log
# that the run has finished.
# Globals:   LOCK_FILE (read) - path of the PID lock file
# Outputs:   logs "Script execution completed" via log_message
cleanup() {
  local owner=""

  if [[ -f "$LOCK_FILE" ]]; then
    { read -r owner < "$LOCK_FILE"; } 2> /dev/null || true
  fi

  # This handler also runs when check_lock exits because another instance is
  # active. Removing the file unconditionally would delete that instance's
  # lock and let further runs start alongside it, so only remove a lock that
  # records our own PID.
  if [[ "$owner" == "$$" ]]; then
    rm -f "$LOCK_FILE"
  fi

  log_message "Script execution completed"
}
# Set up trap for cleanup so it runs on every exit path of the script
trap cleanup EXIT
# Print the current state of the monitored VM.
# Globals:   VM_NAME (read) - libvirt domain name
# Outputs:   the state reported by `virsh domstate` as a single line on
#            stdout (for example "running", "shut off" or "paused")
# Returns:   0 on success, otherwise the exit status of virsh
get_vm_state() {
  local state

  # Query this one domain directly instead of scraping the `virsh list` table:
  # grepping the table also matches other VMs whose names contain VM_NAME, and
  # taking the last column truncates the two-word state "shut off" to "off".
  state=$(virsh domstate "$VM_NAME") || return

  # The command substitution above dropped virsh's trailing blank line.
  printf '%s\n' "$state"
}
# Verify that a VM named exactly VM_NAME is defined; abort otherwise.
# Globals:   VM_NAME (read) - libvirt domain name
# Outputs:   logs an error via log_message when the VM is not found
# Returns:   0 if the VM exists; exits the script with status 1 if not
check_vm_exists() {
  # --name prints one bare domain name per line. grep -F -x compares each
  # whole line literally, so "Windows 11" does not match "Windows 11 Backup"
  # and regex metacharacters in the name are not interpreted.
  if ! virsh list --all --name | grep -Fxq -- "$VM_NAME"; then
    log_message "ERROR: VM '$VM_NAME' not found"
    exit 1
  fi
}
# Make one attempt to start the monitored VM and log the outcome.
# Globals:   VM_NAME, MAX_RESTART_ATTEMPTS (read)
# Arguments: $1 - number of the current attempt (used in the log message)
# Outputs:   progress and result messages via log_message
# Returns:   0 if `virsh start` succeeded, 1 otherwise
restart_vm() {
  local try_number=$1
  local outcome=0

  log_message "Attempting to start VM '$VM_NAME' (Attempt $try_number of $MAX_RESTART_ATTEMPTS)"

  # Any failure from virsh is normalised to status 1 for the caller.
  virsh start "$VM_NAME" || outcome=1

  case "$outcome" in
    0) log_message "Successfully started VM '$VM_NAME'" ;;
    *) log_message "Failed to start VM '$VM_NAME'" ;;
  esac

  return "$outcome"
}
# Main script execution: make sure the monitored VM is running, starting it
# with a bounded number of retries if it is not.
# Globals:   LOG_FILE, VM_NAME, MAX_RESTART_ATTEMPTS, RESTART_COOLDOWN (read)
# Outputs:   status messages via log_message
# Returns:   0 (plain return) when the VM is already running;
#            exits 0 as soon as a start attempt succeeds;
#            exits 1 when no attempt succeeded, including when
#            MAX_RESTART_ATTEMPTS allows no attempts at all
main() {
  local state
  local attempt

  # Create log directory if it doesn't exist
  mkdir -p "$(dirname "$LOG_FILE")"

  # Take the single-instance lock, then verify the VM exists. Both helpers
  # terminate the script themselves on failure.
  check_lock
  check_vm_exists

  state=$(get_vm_state)
  if [[ "$state" == "running" ]]; then
    log_message "VM '$VM_NAME' is running normally"
    return 0
  fi

  log_message "VM '$VM_NAME' is not running (Current state: $state)"

  # Attempt to start the VM, pausing between tries but not after the last one.
  for (( attempt = 1; attempt <= MAX_RESTART_ATTEMPTS; attempt++ )); do
    if restart_vm "$attempt"; then
      exit 0
    fi
    if (( attempt < MAX_RESTART_ATTEMPTS )); then
      log_message "Waiting $RESTART_COOLDOWN seconds before next attempt..."
      sleep "$RESTART_COOLDOWN"
    fi
  done

  # Reaching this point means the VM is still down, whether every attempt
  # failed or the loop never ran, so report failure unconditionally.
  log_message "ERROR: Failed to start VM after $MAX_RESTART_ATTEMPTS attempts"
  exit 1
}
# Entry point: run main, handing over any command-line arguments
main "$@"