📊 System Monitoring Scripts

Các scripts trong phần này giúp bạn giám sát hệ thống Linux một cách tự động, theo dõi hiệu năng và nhận cảnh báo khi có vấn đề.

🖥️ 1. Script Giám Sát Tài Nguyên Hệ Thống

Monitor CPU, RAM, Disk Usage

#!/bin/bash
# system_monitor.sh - Giám sát tài nguyên hệ thống

# Configuration
LOG_FILE="/var/log/system_monitor.log"
# Alert recipient. Placeholder address -- replace with a real mailbox.
ALERT_EMAIL="admin@example.com"
CPU_THRESHOLD=80    # alert when CPU usage exceeds 80%
RAM_THRESHOLD=85    # alert when RAM usage exceeds 85%
DISK_THRESHOLD=90   # alert when disk usage exceeds 90%
LOAD_THRESHOLD=5.0  # alert when the 1-minute load average exceeds 5.0

# Print a timestamped message on stdout and append the same line to $LOG_FILE.
# Arguments: $1 - message text
log_message() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s - %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

# Email an alert to $ALERT_EMAIL and log that it was sent.
# Arguments:
#   $1 - subject (prefixed with "[ALERT] ")
#   $2 - message body; the two-character sequence "\n" is treated as a line
#        break, because the check_* functions build their bodies that way and
#        plain `echo` would otherwise mail them as one long line.
send_alert() {
    local subject="$1"
    local literal='\n'
    local newline=$'\n'
    # Quoting "$literal" makes the pattern match a real backslash followed by n.
    local message=${2//"$literal"/$newline}
    printf '%s\n' "$message" | mail -s "[ALERT] $subject" "$ALERT_EMAIL"
    log_message "ALERT SENT: $subject"
}

# Read the user-space CPU percentage from `top`, log it, and send an alert
# when it exceeds CPU_THRESHOLD.
# Outputs: the integer percentage alone on stdout, so `$(check_cpu)` yields a
#          clean number; log and alert chatter is redirected to stderr.
# Returns: 1 (printing nothing on stdout) when top's output cannot be parsed.
check_cpu() {
    local cpu_usage
    # Locate the number in front of "us" instead of trusting a field position:
    # the value can be fused to the label ("Cpu(s):100.0 us") or carry a suffix
    # ("1.2%us,"), both of which break a plain `awk '{print $2}'`.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ && match($0, /[0-9]+([.,][0-9]+)?[ %]*us/) {
            value = substr($0, RSTART, RLENGTH)
            sub(/[^0-9].*$/, "", value)   # keep the integer part only
            print value
            exit
        }')

    if ! [[ "$cpu_usage" =~ ^[0-9]+$ ]]; then
        log_message "CPU Usage: unavailable (could not parse top output)" >&2
        return 1
    fi

    log_message "CPU Usage: ${cpu_usage}%" >&2

    if [ "$cpu_usage" -gt "$CPU_THRESHOLD" ]; then
        local message="CPU usage is high: ${cpu_usage}%\nThreshold: ${CPU_THRESHOLD}%\nServer: $(hostname)\nTime: $(date)"
        send_alert "High CPU Usage" "$message" >&2
    fi

    echo "$cpu_usage"
}

# Compute RAM usage (used * 100 / total from the "Mem:" row of `free`), log
# it, and send an alert when it exceeds RAM_THRESHOLD.
# Outputs: the integer percentage alone on stdout, so `$(check_memory)` yields
#          a clean number; log and alert chatter is redirected to stderr.
# Returns: 1 (printing nothing on stdout) when `free` cannot be parsed.
check_memory() {
    local total_mem used_mem mem_usage
    # LC_ALL=C keeps the row label as "Mem:" regardless of the system locale.
    read -r total_mem used_mem < <(LC_ALL=C free | awk '/^Mem:/ {print $2, $3; exit}')

    # Guard against empty/non-numeric fields and a zero divisor.
    if ! [[ "$total_mem" =~ ^[0-9]+$ && "$used_mem" =~ ^[0-9]+$ ]] || [ "$total_mem" -eq 0 ]; then
        log_message "Memory Usage: unavailable (could not parse free output)" >&2
        return 1
    fi

    mem_usage=$((used_mem * 100 / total_mem))
    log_message "Memory Usage: ${mem_usage}% (${used_mem}/${total_mem})" >&2

    if [ "$mem_usage" -gt "$RAM_THRESHOLD" ]; then
        # `free` reports KiB by default; convert to GiB with awk so the alert
        # does not depend on bc being installed.
        local used_gb total_gb
        used_gb=$(awk -v kb="$used_mem" 'BEGIN { printf "%.2f", kb / 1024 / 1024 }')
        total_gb=$(awk -v kb="$total_mem" 'BEGIN { printf "%.2f", kb / 1024 / 1024 }')
        local message="Memory usage is high: ${mem_usage}%\nUsed: ${used_gb} GB\nTotal: ${total_gb} GB\nThreshold: ${RAM_THRESHOLD}%\nServer: $(hostname)\nTime: $(date)"
        send_alert "High Memory Usage" "$message" >&2
    fi

    echo "$mem_usage"
}

# Log the usage of every /dev/* filesystem reported by df and send an alert
# for each one above DISK_THRESHOLD.
check_disk() {
    log_message "=== DISK USAGE ==="

    local filesystem size used avail percent mountpoint usage message
    # df -P guarantees one line per filesystem; read -r keeps backslashes in
    # mount points intact. Process substitution avoids a pipeline subshell.
    while read -r filesystem size used avail percent mountpoint; do
        usage=${percent%\%}
        # Skip rows whose Use% is not a plain integer (df prints "-" for some
        # pseudo devices), which would make the numeric test below fail.
        [[ "$usage" =~ ^[0-9]+$ ]] || continue

        log_message "$mountpoint: ${usage}% used ($used/$size)"

        if [ "$usage" -gt "$DISK_THRESHOLD" ]; then
            message="Disk usage is high on $mountpoint: ${usage}%\nUsed: $used\nTotal: $size\nAvailable: $avail\nThreshold: ${DISK_THRESHOLD}%\nServer: $(hostname)\nTime: $(date)"
            send_alert "High Disk Usage - $mountpoint" "$message"
        fi
    done < <(df -hP | grep -E '^/dev/')
}

# Log the 1-minute load average and send an alert (including the top CPU
# consumers) when it exceeds LOAD_THRESHOLD.
# Returns: 1 when the load average cannot be parsed from `uptime`.
check_load() {
    local load_avg
    # LC_ALL=C forces "." as the decimal separator, so stripping the first
    # comma removes the field separator and not part of the number.
    load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
    log_message "Load Average (1min): $load_avg"

    if ! [[ "$load_avg" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
        return 1
    fi

    # Floating-point comparison done in awk, so the check does not silently
    # stop working on hosts where bc is not installed.
    if awk -v load="$load_avg" -v limit="$LOAD_THRESHOLD" 'BEGIN { exit !(load + 0 > limit + 0) }'; then
        local message="Load average is high: $load_avg\nThreshold: $LOAD_THRESHOLD\nServer: $(hostname)\nTime: $(date)\n\nTop processes:\n$(ps aux --sort=-%cpu | head -10)"
        send_alert "High Load Average" "$message"
    fi
}

# Log the ps header plus the five heaviest processes, first ranked by CPU and
# then by memory, echoing them to stdout and appending them to $LOG_FILE.
check_top_processes() {
    local entry title sort_key
    for entry in "CPU:-%cpu" "MEMORY:-%mem"; do
        title=${entry%%:*}
        sort_key=${entry#*:}
        log_message "=== TOP ${title} PROCESSES ==="
        ps aux --sort="$sort_key" | head -n 6 | tee -a "$LOG_FILE"
    done
}

# Log socket counts taken from `netstat -an` and send an alert when more than
# 1000 connections are ESTABLISHED.
# "Total" is the raw line count of the netstat output (header lines included).
check_network() {
    local snapshot connections established listening
    # Take one snapshot so the three counts describe the same moment and
    # netstat is executed once instead of three times.
    snapshot=$(netstat -an)
    connections=$(printf '%s' "$snapshot" | grep -c '')
    established=$(printf '%s' "$snapshot" | grep -c ESTABLISHED)
    listening=$(printf '%s' "$snapshot" | grep -c LISTEN)

    log_message "Network Connections: Total=$connections, Established=$established, Listening=$listening"

    # Alert on an unusually high number of established connections
    if [ "$established" -gt 1000 ]; then
        local message="High number of established connections: $established\nServer: $(hostname)\nTime: $(date)"
        send_alert "High Network Connections" "$message"
    fi
}

# Produce the full monitoring report: header with host, uptime, CPU and memory
# figures, followed by the disk, load, process and network checks.
generate_report() {
    local cpu memory
    # check_cpu/check_memory may emit log lines on stdout ahead of the value
    # they return, so keep only the last line of the captured output.
    cpu=$(check_cpu | tail -n 1)
    memory=$(check_memory | tail -n 1)

    log_message "=== SYSTEM MONITORING REPORT ==="
    log_message "Server: $(hostname)"
    log_message "Uptime: $(uptime | awk -F'up ' '{print $2}' | awk -F',' '{print $1}')"
    log_message "CPU: ${cpu}%"
    log_message "Memory: ${memory}%"

    check_disk
    check_load
    check_top_processes
    check_network

    log_message "=== END REPORT ==="
}

# Main execution: bracket a single report run with start/finish log markers.
log_message "Starting system monitoring"
generate_report
log_message "System monitoring completed"

🔍 2. Script Health Check Server

Comprehensive Server Health Check

#!/bin/bash
# server_health_check.sh - Kiểm tra sức khỏe server toàn diện

# Configuration
LOG_FILE="/var/log/health_check.log"
# Alert recipient. Placeholder address -- replace with a real mailbox.
ALERT_EMAIL="admin@example.com"
SERVICES=("nginx" "mysql" "postgresql" "redis" "ssh")
PORTS=("80" "443" "22" "3306" "5432")
WEBSITES=("http://localhost" "https://example.com")

# ANSI color escape sequences used for terminal output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Print a timestamped status line on stdout and append it to $LOG_FILE.
# Arguments:
#   $1 - status: OK, WARNING or ERROR; any other value is shown as [INFO]
#   $2 - message text
# The terminal copy carries the color for the status tag; the log file gets
# the same line without ANSI escape sequences so it stays readable/greppable.
log_status() {
    local status="$1"
    local message="$2"
    local timestamp color label
    timestamp="$(date '+%Y-%m-%d %H:%M:%S')"

    case "$status" in
        "OK")      color="$GREEN";  label="OK" ;;
        "WARNING") color="$YELLOW"; label="WARNING" ;;
        "ERROR")   color="$RED";    label="ERROR" ;;
        *)         color="";        label="INFO" ;;
    esac

    printf '%s - [%s] %s\n' "$timestamp" "$label" "$message" >> "$LOG_FILE"

    if [ -n "$color" ]; then
        # %b expands the \033 escapes stored in the color variables only; the
        # message itself is printed verbatim.
        printf '%s - %b[%s]%b %s\n' "$timestamp" "$color" "$label" "$NC" "$message"
    else
        printf '%s - [%s] %s\n' "$timestamp" "$label" "$message"
    fi
}

# Report whether each entry of SERVICES is running; email an alert for every
# one that is down.
# A service counts as running when systemd reports its unit active or, failing
# that, when a process with exactly that name exists. The unit check matters
# because daemon process names often differ from the service name (for
# example sshd vs. ssh), which makes a pgrep-only test report false outages.
check_services() {
    log_status "INFO" "=== CHECKING SERVICES ==="

    local service running
    for service in "${SERVICES[@]}"; do
        running=false
        if command -v systemctl >/dev/null 2>&1 && systemctl is-active --quiet "$service"; then
            running=true
        elif pgrep -x "$service" > /dev/null; then
            running=true
        fi

        if [ "$running" = true ]; then
            log_status "OK" "Service $service is running"
        else
            log_status "ERROR" "Service $service is not running"
            echo "Service $service is down on $(hostname)" | mail -s "[ALERT] Service Down" "$ALERT_EMAIL"
        fi
    done
}

# Report whether something is listening on each port listed in PORTS.
check_ports() {
    log_status "INFO" "=== CHECKING PORTS ==="

    local port listeners
    # Collect the listening sockets once instead of once per port. Fall back
    # to ss when netstat (net-tools) is not installed; both print the local
    # address as "<addr>:<port> ", which is what the pattern below relies on.
    if command -v netstat >/dev/null 2>&1; then
        listeners=$(netstat -tuln)
    else
        listeners=$(ss -tuln)
    fi

    for port in "${PORTS[@]}"; do
        if grep -q ":$port " <<< "$listeners"; then
            log_status "OK" "Port $port is open"
        else
            log_status "WARNING" "Port $port is not listening"
        fi
    done
}

# Request every URL in WEBSITES (10 s timeout) and report its HTTP status;
# email an alert for each site that does not answer successfully.
# Any 2xx or 3xx status counts as "responding": curl is run without -L, so a
# site that merely redirects (e.g. http -> https) would otherwise be reported
# as down. curl prints 000 when no HTTP response was received at all.
check_websites() {
    log_status "INFO" "=== CHECKING WEBSITES ==="

    local website response
    for website in "${WEBSITES[@]}"; do
        response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$website")

        if [[ "$response" =~ ^[23][0-9][0-9]$ ]]; then
            log_status "OK" "Website $website is responding (HTTP $response)"
        else
            log_status "ERROR" "Website $website is not responding (HTTP $response)"
            echo "Website $website is down (HTTP $response) on $(hostname)" | mail -s "[ALERT] Website Down" "$ALERT_EMAIL"
        fi
    done
}

# Look for kernel-reported errors in dmesg and for /dev-backed filesystems
# that are mounted read-only.
check_filesystem() {
    log_status "INFO" "=== CHECKING FILESYSTEM ==="

    local kernel_errors readonly_mounts

    # Last five dmesg lines mentioning error/fail/corrupt (captured once and
    # reused for both the test and the log output).
    kernel_errors=$(dmesg | grep -i "error\|fail\|corrupt" | tail -5)
    if [ -n "$kernel_errors" ]; then
        log_status "WARNING" "Found filesystem errors in dmesg"
        printf '%s\n' "$kernel_errors" | tee -a "$LOG_FILE"
    else
        log_status "OK" "No filesystem errors found"
    fi

    # Read-only detection: match "ro" only as a complete mount option, so
    # "errors=remount-ro," is not mistaken for a read-only mount, and skip
    # image filesystems (squashfs, iso9660) that are read-only by design.
    readonly_mounts=$(mount | grep -E '^/dev/' | grep -Ev ' type (squashfs|iso9660) ' | grep -E '[(,]ro[,)]')
    if [ -n "$readonly_mounts" ]; then
        log_status "ERROR" "Found read-only filesystems"
        printf '%s\n' "$readonly_mounts" | tee -a "$LOG_FILE"
    else
        log_status "OK" "All filesystems are writable"
    fi
}

# Count journal lines of priority err or worse from the last hour and warn
# (showing the last ten) when there are more than 10.
check_system_logs() {
    log_status "INFO" "=== CHECKING SYSTEM LOGS ==="

    local recent_errors error_count
    # -q drops journalctl's informational banners (such as "-- No entries --")
    # so they are not counted as errors; --no-pager keeps the call
    # non-interactive. The output is captured once and reused below.
    recent_errors=$(journalctl --since "1 hour ago" --priority=err --no-pager -q)
    # grep -c '' counts lines and yields 0 for empty input.
    error_count=$(printf '%s' "$recent_errors" | grep -c '')

    if [ "$error_count" -gt 10 ]; then
        log_status "WARNING" "Found $error_count errors in system logs (last 1 hour)"
        printf '%s\n' "$recent_errors" | tail -n 10 | tee -a "$LOG_FILE"
    else
        log_status "OK" "System logs look normal ($error_count errors in last hour)"
    fi
}

# Scan the journal for authentication activity:
#   - warn when more than 20 "Failed password" lines appear in the last hour
#     (the last ten are shown and appended to $LOG_FILE);
#   - warn when any "session opened for user root" line appears in the last
#     24 hours.
check_security() {
    log_status "INFO" "=== CHECKING SECURITY ==="

    local failed_entries failed_logins root_logins

    # Failed logins: read the journal once and reuse the matches for both the
    # count and the excerpt. -q/--no-pager keep banners and the pager out.
    failed_entries=$(journalctl --since "1 hour ago" --no-pager -q | grep "Failed password")
    failed_logins=$(printf '%s' "$failed_entries" | grep -c '')

    if [ "$failed_logins" -gt 20 ]; then
        log_status "WARNING" "High number of failed login attempts: $failed_logins (last 1 hour)"
        printf '%s\n' "$failed_entries" | tail -n 10 | tee -a "$LOG_FILE"
    else
        log_status "OK" "Failed login attempts: $failed_logins (last 1 hour)"
    fi

    # Root sessions opened during the last day
    root_logins=$(journalctl --since "1 day ago" --no-pager -q | grep -c "session opened for user root")
    if [ "$root_logins" -gt 0 ]; then
        log_status "WARNING" "Root login detected: $root_logins times (last 24 hours)"
    else
        log_status "OK" "No root login in last 24 hours"
    fi
}

# Log a short overview of the host: name, current date, uptime, load averages,
# memory used/total and root filesystem usage.
generate_summary() {
    local value

    log_status "INFO" "=== HEALTH CHECK SUMMARY ==="

    value=$(hostname)
    log_status "INFO" "Server: $value"

    value=$(date)
    log_status "INFO" "Date: $value"

    value=$(uptime -p)
    log_status "INFO" "Uptime: $value"

    # Everything after "load average:" (leading blank included)
    value=$(uptime | awk -F'load average:' '{print $2}')
    log_status "INFO" "Load: $value"

    # used/total taken from the line(s) of `free -h` containing "Mem"
    value=$(free -h | awk '/Mem/ {print $3"/"$2}')
    log_status "INFO" "Memory: $value"

    # used/size and Use% from the last line of `df -h /`
    value=$(df -h / | tail -1 | awk '{print $3"/"$2" ("$5" used)"}')
    log_status "INFO" "Disk: $value"
}

# Main execution: log a start marker, print the host summary, run every check
# in sequence, then log a completion marker.
log_status "INFO" "Starting server health check"
generate_summary
check_services
check_ports
check_websites
check_filesystem
check_system_logs
check_security
log_status "INFO" "Health check completed"

📈 3. Script Performance Monitoring

Continuous Performance Monitoring

#!/bin/bash
# performance_monitor.sh - Giám sát hiệu năng liên tục

# Configuration
MONITOR_INTERVAL=60  # seconds between two collection rounds
LOG_DIR="/var/log/performance"
DATA_RETENTION_DAYS=30
# Alert recipient. Placeholder address -- replace with a real mailbox.
ALERT_EMAIL="admin@example.com"

# Make sure the log directory exists
mkdir -p "$LOG_DIR"

# Per-day log files; the date suffix is evaluated once, when the script starts
CPU_LOG="$LOG_DIR/cpu_$(date +%Y%m%d).log"
MEM_LOG="$LOG_DIR/memory_$(date +%Y%m%d).log"
DISK_LOG="$LOG_DIR/disk_$(date +%Y%m%d).log"
NET_LOG="$LOG_DIR/network_$(date +%Y%m%d).log"
PROC_LOG="$LOG_DIR/processes_$(date +%Y%m%d).log"

# Append one CSV row to $CPU_LOG:
#   timestamp,user-CPU-percent,load1,load5,load15
collect_cpu_metrics() {
    local timestamp cpu_usage load_avg
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # LC_ALL=C keeps "." as the decimal separator so values cannot introduce
    # extra commas into the CSV. The number is located by the "us" label that
    # follows it rather than by field position, which breaks when the value is
    # fused to the "Cpu(s):" label or carries a "%us," suffix.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*us/) {
            value = substr($0, RSTART, RLENGTH)
            sub(/[ %]*us$/, "", value)
            print value
            exit
        }')
    load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | tr -d ' ')

    echo "$timestamp,$cpu_usage,$load_avg" >> "$CPU_LOG"
}

# Append one CSV row to $MEM_LOG built from the "Mem" row of `free -m`:
#   timestamp,total,used,free,<6th column>,used-percent
collect_memory_metrics() {
    local now mem_row total_mb used_mb free_mb col6_mb pct
    now=$(date '+%Y-%m-%d %H:%M:%S')
    mem_row=$(free -m | grep Mem)

    # Split the row into positional parameters; the expansion is deliberately
    # unquoted so it is tokenised on whitespace.
    # shellcheck disable=SC2086
    set -- $mem_row
    total_mb=$2
    used_mb=$3
    free_mb=$4
    col6_mb=$6

    pct=$((used_mb * 100 / total_mb))

    echo "$now,$total_mb,$used_mb,$free_mb,$col6_mb,$pct" >> "$MEM_LOG"
}

# Append disk metrics as CSV rows:
#   $DISK_LOG     timestamp,mountpoint,size,used,avail,use-percent
#                 (one row per /dev/* filesystem reported by df)
#   $DISK_LOG.io  timestamp,device,r/s,w/s,rkB/s,wkB/s,%util
#                 (one row per device, only when iostat is installed)
collect_disk_metrics() {
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Disk usage of the main mount points. df -P keeps each filesystem on a
    # single line; read -r leaves backslashes in mount points untouched.
    local filesystem size used avail percent mountpoint
    while read -r filesystem size used avail percent mountpoint; do
        echo "$timestamp,$mountpoint,$size,$used,$avail,${percent%\%}" >> "$DISK_LOG"
    done < <(df -hP | grep -E '^/dev/')

    # Disk I/O statistics (if iostat is available)
    if command -v iostat >/dev/null 2>&1; then
        # The column order of `iostat -x` differs between sysstat releases, so
        # columns are looked up by the names in the "Device" header row. Only
        # rows of that table are emitted, which also keeps the "avg-cpu:"
        # header out of the log. A column missing from the header is written
        # as an empty field.
        iostat -x 1 1 | awk -v ts="$timestamp" '
            function field(name) { return (name in col) ? $(col[name]) : "" }
            /^Device/ {
                for (i = 1; i <= NF; i++) col[$i] = i
                in_table = 1
                next
            }
            NF == 0 { in_table = 0; next }
            in_table {
                print ts "," $1 "," field("r/s") "," field("w/s") "," field("rkB/s") "," field("wkB/s") "," field("%util")
            }
        ' >> "${DISK_LOG}.io"
    fi
}

# Append network metrics to $NET_LOG as CSV rows:
#   timestamp,interface,rx_bytes,tx_bytes        (one per interface)
#   timestamp,connections,total,established,time_wait
collect_network_metrics() {
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Network interface statistics. Only lines containing ":" are interface
    # rows, which keeps the two header lines of /proc/net/dev out of the log.
    # The colon is replaced by a blank before splitting so a counter fused to
    # the interface name ("eth0:12345") is still parsed; after that, field 2
    # is received bytes and field 10 is transmitted bytes.
    awk -v ts="$timestamp" '
        index($0, ":") {
            sub(/:/, " ")
            print ts "," $1 "," $2 "," $10
        }' /proc/net/dev >> "$NET_LOG"

    # Connection counts, all taken from a single netstat snapshot
    local snapshot total_conn established time_wait
    snapshot=$(netstat -an)
    total_conn=$(printf '%s' "$snapshot" | grep -c '')
    established=$(printf '%s' "$snapshot" | grep -c ESTABLISHED)
    time_wait=$(printf '%s' "$snapshot" | grep -c TIME_WAIT)

    echo "$timestamp,connections,$total_conn,$established,$time_wait" >> "$NET_LOG"
}

# Append process metrics to $PROC_LOG as CSV rows:
#   timestamp,cpu,user,pid,%cpu,%mem,command      (five busiest by CPU)
#   timestamp,memory,user,pid,%cpu,%mem,command   (five largest by memory)
#   timestamp,summary,total,running,zombie
collect_process_metrics() {
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    local user pid cpu mem vsz rss tty stat start time command

    # Top CPU processes: lines 2-6 of the sorted listing (line 1 is the header)
    while read -r user pid cpu mem vsz rss tty stat start time command; do
        echo "$timestamp,cpu,$user,$pid,$cpu,$mem,$command" >> "$PROC_LOG"
    done < <(ps aux --sort=-%cpu | sed -n '2,6p')

    # Top memory processes
    while read -r user pid cpu mem vsz rss tty stat start time command; do
        echo "$timestamp,memory,$user,$pid,$cpu,$mem,$command" >> "$PROC_LOG"
    done < <(ps aux --sort=-%mem | sed -n '2,6p')

    # Process counts from the state column alone. The state code is the first
    # letter of STAT and is usually followed by flags ("R+", "Rl", "Z+"), so
    # searching the full listing for " R " / " Z " misses most processes.
    # "stat=" suppresses the header, so total counts processes only.
    local states total_processes running_processes zombie_processes
    states=$(ps -eo stat=)
    total_processes=$(printf '%s' "$states" | grep -c '')
    running_processes=$(printf '%s' "$states" | grep -c '^ *R')
    zombie_processes=$(printf '%s' "$states" | grep -c '^ *Z')

    echo "$timestamp,summary,$total_processes,$running_processes,$zombie_processes" >> "$PROC_LOG"
}

# Delete monitoring output in $LOG_DIR that was last modified more than
# $DATA_RETENTION_DAYS days ago. Besides the *.log files this covers the
# iostat logs (*.log.io) and the daily reports (daily_report_*.txt), which
# this script also writes to $LOG_DIR and which would otherwise pile up.
cleanup_old_logs() {
    find "$LOG_DIR" -type f \
        \( -name "*.log" -o -name "*.log.io" -o -name "daily_report_*.txt" \) \
        -mtime +"$DATA_RETENTION_DAYS" -delete
    echo "$(date '+%Y-%m-%d %H:%M:%S') - Cleaned up logs older than $DATA_RETENTION_DAYS days"
}

# Write $LOG_DIR/daily_report_<YYYYMMDD>.txt with average/maximum CPU and
# memory usage (from $CPU_LOG and $MEM_LOG), current disk usage and the last
# ten logged top-CPU process rows (from $PROC_LOG).
# Averages are printed as "N/A" when a log file exists but has no rows, rather
# than letting awk abort on a division by zero.
generate_daily_report() {
    local report_file="$LOG_DIR/daily_report_$(date +%Y%m%d).txt"

    {
        echo "=== DAILY PERFORMANCE REPORT ==="
        echo "Date: $(date)"
        echo "Server: $(hostname)"
        echo ""

        echo "=== CPU STATISTICS ==="
        if [ -f "$CPU_LOG" ]; then
            # Column 2 of the CPU log is the usage percentage
            echo "Average CPU Usage: $(awk -F',' '{sum+=$2; count++} END {if (count) printf "%.2f%%", sum/count; else printf "N/A"}' "$CPU_LOG")"
            echo "Max CPU Usage: $(awk -F',' '{if($2>max) max=$2} END {printf "%.2f%%", max}' "$CPU_LOG")"
        fi
        echo ""

        echo "=== MEMORY STATISTICS ==="
        if [ -f "$MEM_LOG" ]; then
            # Column 6 of the memory log is the usage percentage
            echo "Average Memory Usage: $(awk -F',' '{sum+=$6; count++} END {if (count) printf "%.2f%%", sum/count; else printf "N/A"}' "$MEM_LOG")"
            echo "Max Memory Usage: $(awk -F',' '{if($6>max) max=$6} END {printf "%.2f%%", max}' "$MEM_LOG")"
        fi
        echo ""

        echo "=== DISK USAGE ==="
        df -h | grep -E '^/dev/'
        echo ""

        echo "=== TOP PROCESSES BY CPU ==="
        if [ -f "$PROC_LOG" ]; then
            grep ",cpu," "$PROC_LOG" | tail -10
        fi

    } > "$report_file"

    echo "Daily report generated: $report_file"
}

# Main monitoring loop: collect all metrics every $MONITOR_INTERVAL seconds
# until interrupted (INT/TERM print a message and exit 0).
#
# Day rollover: the per-day log paths (CPU_LOG, MEM_LOG, DISK_LOG, NET_LOG,
# PROC_LOG) are computed when the script starts, so a long-running loop has to
# re-point them itself. When the calendar day changes, the loop first runs
# cleanup_old_logs and generate_daily_report while the variables still name
# the finished day's files, then switches them to the new day. Comparing dates
# (instead of testing for the exact minute 00:00) means the rollover cannot be
# missed when an iteration takes longer than expected.
start_monitoring() {
    echo "Starting performance monitoring (interval: ${MONITOR_INTERVAL}s)"
    echo "Logs directory: $LOG_DIR"
    echo "Press Ctrl+C to stop"

    # Exit cleanly on Ctrl+C or termination
    trap 'echo "Stopping performance monitoring..."; exit 0' INT TERM

    local current_day today
    current_day=$(date +%Y%m%d)

    while true; do
        today=$(date +%Y%m%d)
        if [ "$today" != "$current_day" ]; then
            cleanup_old_logs
            generate_daily_report

            current_day=$today
            CPU_LOG="$LOG_DIR/cpu_${today}.log"
            MEM_LOG="$LOG_DIR/memory_${today}.log"
            DISK_LOG="$LOG_DIR/disk_${today}.log"
            NET_LOG="$LOG_DIR/network_${today}.log"
            PROC_LOG="$LOG_DIR/processes_${today}.log"
        fi

        collect_cpu_metrics
        collect_memory_metrics
        collect_disk_metrics
        collect_network_metrics
        collect_process_metrics

        sleep "$MONITOR_INTERVAL"
    done
}

# Redraw a live dashboard every two seconds until interrupted: time, uptime,
# CPU and load, memory, disk usage of the first five /dev/* filesystems, and
# the ps header plus the five busiest processes.
show_realtime_stats() {
    local cpu_pct load_text

    while :; do
        clear

        printf '%s\n' "=== REAL-TIME SYSTEM STATS ==="
        printf 'Time: %s\n' "$(date)"
        printf 'Uptime: %s\n' "$(uptime -p)"
        printf '\n'

        printf '%s\n' "=== CPU & LOAD ==="
        cpu_pct=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
        printf 'CPU Usage: %s%%\n' "$cpu_pct"
        load_text=$(uptime | awk -F'load average:' '{print $2}')
        printf 'Load Average: %s\n' "$load_text"
        printf '\n'

        printf '%s\n' "=== MEMORY ==="
        free -h
        printf '\n'

        printf '%s\n' "=== DISK USAGE ==="
        df -h | grep -E '^/dev/' | head -n 5
        printf '\n'

        printf '%s\n' "=== TOP PROCESSES ==="
        ps aux --sort=-%cpu | head -n 6
        printf '\n'

        printf '%s\n' "Press Ctrl+C to exit"
        sleep 2
    done
}

# Command dispatch: the first argument selects the mode and defaults to
# "monitor" when omitted; any unrecognised value prints usage and exits 1.
case "${1:-monitor}" in
    "monitor")
        start_monitoring
        ;;
    "realtime")
        show_realtime_stats
        ;;
    "report")
        generate_daily_report
        ;;
    "cleanup")
        cleanup_old_logs
        ;;
    *)
        echo "Usage: $0 {monitor|realtime|report|cleanup}"
        echo "  monitor  - Start continuous monitoring (default)"
        echo "  realtime - Show real-time stats"
        echo "  report   - Generate daily report"
        echo "  cleanup  - Clean up old logs"
        exit 1
        ;;
esac

⚙️ 4. Cấu Hình và Tự Động Hóa

Cron Jobs cho Monitoring

# Mở crontab editor
crontab -e

# Thêm các dòng sau:

# System monitoring mỗi 5 phút
*/5 * * * * /path/to/system_monitor.sh

# Health check mỗi 15 phút
*/15 * * * * /path/to/server_health_check.sh

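# Performance monitoring (chạy liên tục) - chỉ dùng dòng này nếu KHÔNG cài systemd service ở phần dưới, tránh chạy trùng hai tiến trình monitor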
@reboot /path/to/performance_monitor.sh monitor &

# Daily report lúc 6:00 AM
0 6 * * * /path/to/performance_monitor.sh report

# Weekly cleanup vào Chủ nhật lúc 2:00 AM
0 2 * * 0 /path/to/performance_monitor.sh cleanup

Systemd Service cho Monitoring

# /etc/systemd/system/performance-monitor.service
[Unit]
Description=Performance Monitoring Service
After=network.target

[Service]
Type=simple
User=monitor
Group=monitor
ExecStart=/opt/scripts/performance_monitor.sh monitor
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target

# Kích hoạt service
sudo systemctl enable performance-monitor.service
sudo systemctl start performance-monitor.service
sudo systemctl status performance-monitor.service

📧 5. Cấu Hình Email Alerts

Cài đặt Mail Server

# Ubuntu/Debian
sudo apt-get install mailutils postfix

# CentOS/RHEL
sudo yum install mailx postfix

# Cấu hình postfix cho relay qua Gmail
sudo nano /etc/postfix/main.cf

Email Configuration

# /etc/postfix/main.cf
relayhost = [smtp.gmail.com]:587
smtp_use_tls = yes
smtp_sasl_auth_enable = yes
smtp_sasl_password_maps = hash:/etc/postfix/sasl_passwd
smtp_sasl_security_options = noanonymous
smtp_tls_CAfile = /etc/ssl/certs/ca-certificates.crt

# /etc/postfix/sasl_passwd
[smtp.gmail.com]:587 your-email@gmail.com:your-app-password

# Bảo mật file password
sudo postmap /etc/postfix/sasl_passwd
sudo chmod 600 /etc/postfix/sasl_passwd*
sudo systemctl restart postfix

🔧 6. Troubleshooting

Các lỗi thường gặp:

Permission denied:

sudo chmod +x /path/to/script.sh
sudo chown monitor:monitor /var/log/performance/

Mail command not found:

# Ubuntu/Debian
sudo apt-get install mailutils
# CentOS/RHEL
sudo yum install mailx

High CPU usage từ monitoring scripts:

# Tăng interval giữa các lần check
MONITOR_INTERVAL=300  # 5 phút thay vì 1 phút

Log files quá lớn:

# Thêm log rotation
sudo nano /etc/logrotate.d/monitoring

Log Rotation Configuration

# /etc/logrotate.d/monitoring
/var/log/system_monitor.log
/var/log/health_check.log
/var/log/performance/*.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    create 644 monitor monitor
    postrotate
        systemctl reload rsyslog > /dev/null 2>&1 || true
    endscript
}

Lưu ý quan trọng:

Test tất cả scripts trên môi trường dev trước
Cấu hình email alerts phù hợp với hạ tầng
Monitor performance của chính các monitoring scripts
Thiết lập log rotation để tránh disk full
Backup cấu hình monitoring thường xuyên

🖥️ 1. Script Giám Sát Tài Nguyên Hệ Thống​

Monitor CPU, RAM, Disk Usage​

🔍 2. Script Health Check Server​

Comprehensive Server Health Check​

📈 3. Script Performance Monitoring​

Continuous Performance Monitoring​

⚙️ 4. Cấu Hình và Tự Động Hóa​

Cron Jobs cho Monitoring​

Systemd Service cho Monitoring​

📧 5. Cấu Hình Email Alerts​

Cài đặt Mail Server​

Email Configuration​

🔧 6. Troubleshooting​

Các lỗi thường gặp:​

Log Rotation Configuration​