Chuyển tới nội dung chính

📊 System Monitoring Scripts

Các scripts trong phần này giúp bạn giám sát hệ thống Linux một cách tự động, theo dõi hiệu năng và nhận cảnh báo khi có vấn đề.

🖥️ 1. Script Giám Sát Tài Nguyên Hệ Thống

Monitor CPU, RAM, Disk Usage

#!/bin/bash
# system_monitor.sh - Monitor system resources (CPU, RAM, disk, load, network).
#
# Every setting below can be overridden from the environment, for example:
#   CPU_THRESHOLD=90 ALERT_EMAIL=ops@example.com ./system_monitor.sh

# Configuration
LOG_FILE="${LOG_FILE:-/var/log/system_monitor.log}"
# Placeholder recipient: replace with a real mailbox before enabling alerts.
ALERT_EMAIL="${ALERT_EMAIL:-admin@example.com}"
CPU_THRESHOLD="${CPU_THRESHOLD:-80}"    # Alert when CPU > 80%
RAM_THRESHOLD="${RAM_THRESHOLD:-85}"    # Alert when RAM > 85%
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"  # Alert when disk > 90%
LOAD_THRESHOLD="${LOAD_THRESHOLD:-5.0}" # Alert when 1-minute load average > 5.0

# Print a timestamped message on stdout and append the same line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log_message() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

# Send an alert e-mail and record the outcome in the log.
# Globals:   ALERT_EMAIL (read)
# Arguments: $1 - subject (prefixed with "[ALERT] ")
#            $2 - message body; literal "\n" sequences are line separators
# Returns:   0 when mail accepted the message, 1 otherwise
send_alert() {
  local subject="$1"
  local message="$2"
  local newline=$'\n'

  # Callers build the body with literal "\n" separators inside double quotes.
  # A plain echo never expands those, so convert them to real line breaks
  # here (only that sequence; other backslashes are left untouched).
  local body="${message//\\n/$newline}"

  if printf '%s\n' "$body" | mail -s "[ALERT] $subject" "$ALERT_EMAIL"; then
    log_message "ALERT SENT: $subject"
  else
    # Do not claim the alert was sent when mail is missing or failed.
    log_message "ALERT FAILED: $subject (mail exited with an error)"
    return 1
  fi
}

# Measure CPU usage, log it, and alert when it exceeds CPU_THRESHOLD.
#
# Usage is computed as 100 minus the idle percentage reported by top, so
# system, iowait and steal time count as well as user time. The idle value
# is located by its "id" label rather than by field position, because top
# fuses the label and the first number when the value has three digits
# ("%Cpu(s):100.0 us"), which shifts every positional field.
#
# Globals:   CPU_THRESHOLD (read)
# Outputs:   the integer usage percentage on stdout; log lines go to stderr
#            so that $(check_cpu) captures only the number
# Returns:   1 when top's output could not be parsed
check_cpu() {
  local cpu_usage
  # LC_ALL=C keeps the decimal separator a dot regardless of system locale.
  cpu_usage=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ && match($0, /[0-9.]+[ %]*id/) {
      idle = substr($0, RSTART, RLENGTH)
      sub(/[ %]*id$/, "", idle)
      printf "%d\n", (100 - idle) + 0.5   # round to the nearest integer
      exit
    }')

  if [[ ! "$cpu_usage" =~ ^[0-9]+$ ]]; then
    log_message "WARNING: could not parse CPU usage from top output" >&2
    return 1
  fi

  log_message "CPU Usage: ${cpu_usage}%" >&2

  if (( cpu_usage > CPU_THRESHOLD )); then
    local message="CPU usage is high: ${cpu_usage}%\nThreshold: ${CPU_THRESHOLD}%\nServer: $(hostname)\nTime: $(date)"
    send_alert "High CPU Usage" "$message" >&2
  fi

  echo "$cpu_usage"
}

# Measure RAM usage, log it, and alert when it exceeds RAM_THRESHOLD.
#
# Usage is the "used" column of free divided by "total" (integer percent).
#
# Globals:   RAM_THRESHOLD (read)
# Outputs:   the integer usage percentage on stdout; log lines go to stderr
#            so that $(check_memory) captures only the number
# Returns:   1 when free's output could not be parsed
check_memory() {
  local total_mem used_mem mem_usage

  # LC_ALL=C keeps the "Mem:" row label untranslated.
  read -r total_mem used_mem < <(LC_ALL=C free | awk '/^Mem:/ { print $2, $3; exit }')

  # Guard against unparseable output and division by zero.
  if [[ ! "$total_mem" =~ ^[0-9]+$ || ! "$used_mem" =~ ^[0-9]+$ ]] || (( total_mem == 0 )); then
    log_message "WARNING: could not read memory totals from free" >&2
    return 1
  fi

  mem_usage=$(( used_mem * 100 / total_mem ))
  log_message "Memory Usage: ${mem_usage}% (${used_mem}/${total_mem})" >&2

  if (( mem_usage > RAM_THRESHOLD )); then
    # free reports KiB; awk does the GiB conversion so bc is not required.
    local used_gb total_gb
    used_gb=$(awk -v kib="$used_mem" 'BEGIN { printf "%.2f", kib / 1048576 }')
    total_gb=$(awk -v kib="$total_mem" 'BEGIN { printf "%.2f", kib / 1048576 }')
    local message="Memory usage is high: ${mem_usage}%\nUsed: ${used_gb} GB\nTotal: ${total_gb} GB\nThreshold: ${RAM_THRESHOLD}%\nServer: $(hostname)\nTime: $(date)"
    send_alert "High Memory Usage" "$message" >&2
  fi

  echo "$mem_usage"
}

# Log usage of every device-backed filesystem and alert above DISK_THRESHOLD.
#
# Globals: DISK_THRESHOLD (read)
# Notes:
#   - df -P forces one line per filesystem (long device names are not wrapped).
#   - /dev/loop* entries are skipped: loop-mounted read-only images (snap
#     packages, for example) always report 100% and would alert on every run.
#   - Rows whose Use% is not a number (df prints "-" for some filesystems)
#     are skipped instead of breaking the numeric comparison.
check_disk() {
  log_message "=== DISK USAGE ==="

  local filesystem size used avail percent mountpoint usage message
  # Process substitution keeps the loop in the current shell.
  while read -r filesystem size used avail percent mountpoint; do
    [[ "$filesystem" == /dev/loop* ]] && continue
    usage=${percent%\%}
    [[ "$usage" =~ ^[0-9]+$ ]] || continue

    log_message "$mountpoint: ${usage}% used ($used/$size)"

    if (( usage > DISK_THRESHOLD )); then
      message="Disk usage is high on $mountpoint: ${usage}%\nUsed: $used\nTotal: $size\nAvailable: $avail\nThreshold: ${DISK_THRESHOLD}%\nServer: $(hostname)\nTime: $(date)"
      send_alert "High Disk Usage - $mountpoint" "$message"
    fi
  done < <(df -hP | grep -E '^/dev/')
}

# Log the 1-minute load average and alert when it exceeds LOAD_THRESHOLD.
#
# Globals: LOAD_THRESHOLD (read; may be fractional)
# Returns: 1 when the load average could not be parsed
check_load() {
  local load_avg
  # LC_ALL=C guarantees a dot as the decimal separator; with a comma-decimal
  # locale the three values cannot be told apart from their separators.
  load_avg=$(LC_ALL=C uptime \
    | awk -F'load averages?: *' '{ split($2, parts, /[, ]+/); print parts[1] }')

  if [[ ! "$load_avg" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    log_message "WARNING: could not parse load average from uptime"
    return 1
  fi

  log_message "Load Average (1min): $load_avg"

  # Floating-point comparison done in awk (already a dependency) instead of
  # bc: when bc is missing, the previous test silently evaluated to false.
  if awk -v load="$load_avg" -v limit="$LOAD_THRESHOLD" 'BEGIN { exit !(load + 0 > limit + 0) }'; then
    local message="Load average is high: $load_avg\nThreshold: $LOAD_THRESHOLD\nServer: $(hostname)\nTime: $(date)\n\nTop processes:\n$(ps aux --sort=-%cpu | head -10)"
    send_alert "High Load Average" "$message"
  fi
}

# Log the header row plus the five heaviest processes, first ranked by CPU
# and then by memory. The ps listing is echoed to stdout and appended to
# $LOG_FILE.
# Globals: LOG_FILE (read)
check_top_processes() {
  local spec sort_key title
  for spec in "cpu:CPU" "mem:MEMORY"; do
    sort_key=${spec%%:*}
    title=${spec#*:}
    log_message "=== TOP ${title} PROCESSES ==="
    ps aux --sort="-%${sort_key}" | head -n 6 | tee -a "$LOG_FILE"
  done
}

# Log connection counts and alert when more than 1000 are ESTABLISHED.
#
# All three counters come from a single netstat snapshot, so they are
# consistent with each other (three separate runs can each see a different
# connection table) and netstat is executed once instead of three times.
# "Total" counts every output line, including netstat's header lines.
check_network() {
  local connections established listening

  read -r connections established listening < <(
    netstat -an | awk '
      { total++ }
      /ESTABLISHED/ { est++ }
      /LISTEN/ { lis++ }
      END { print total + 0, est + 0, lis + 0 }'
  )

  log_message "Network Connections: Total=$connections, Established=$established, Listening=$listening"

  # Alert when there are too many connections
  if (( established > 1000 )); then
    local message="High number of established connections: $established\nServer: $(hostname)\nTime: $(date)"
    send_alert "High Network Connections" "$message"
  fi
}

# Run every check and write a summary report through log_message.
generate_report() {
  local cpu memory

  # check_cpu / check_memory may print log lines on stdout before the final
  # percentage; keep only the last line so the report shows the bare number
  # instead of embedding those log lines in "CPU: ...%".
  cpu=$(check_cpu | tail -n 1)
  memory=$(check_memory | tail -n 1)

  log_message "=== SYSTEM MONITORING REPORT ==="
  log_message "Server: $(hostname)"
  log_message "Uptime: $(uptime | awk -F'up ' '{print $2}' | awk -F',' '{print $1}')"
  log_message "CPU: ${cpu}%"
  log_message "Memory: ${memory}%"

  check_disk
  check_load
  check_top_processes
  check_network

  log_message "=== END REPORT ==="
}

# Entry point: bracket the full report with start/end markers in the log.
main() {
  log_message "Starting system monitoring"
  generate_report
  log_message "System monitoring completed"
}

main "$@"

🔍 2. Script Health Check Server

Comprehensive Server Health Check

#!/bin/bash
# server_health_check.sh - Comprehensive server health check.

# Configuration (LOG_FILE and ALERT_EMAIL can be overridden from the environment)
LOG_FILE="${LOG_FILE:-/var/log/health_check.log}"
# Placeholder recipient: replace with a real mailbox before enabling alerts.
ALERT_EMAIL="${ALERT_EMAIL:-admin@example.com}"
SERVICES=("nginx" "mysql" "postgresql" "redis" "ssh")
PORTS=("80" "443" "22" "3306" "5432")
WEBSITES=("http://localhost" "https://example.com")

# ANSI colour sequences for status labels
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Print a status line and append it to $LOG_FILE.
#
# The log file always receives plain text. Colour escapes are added only to
# stdout, and only when stdout is a terminal, so the log (and cron mail or
# piped output) stays free of ANSI sequences. The message is printed
# verbatim: backslashes in it are not interpreted.
#
# Globals:   LOG_FILE, RED, GREEN, YELLOW, NC (read)
# Arguments: $1 - status: OK, WARNING or ERROR; anything else is shown as INFO
#            $2 - message text
log_status() {
  local status="$1"
  local message="$2"
  local timestamp label color=""
  timestamp="$(date '+%Y-%m-%d %H:%M:%S')"

  case "$status" in
    "OK")      label="OK";      color="$GREEN" ;;
    "WARNING") label="WARNING"; color="$YELLOW" ;;
    "ERROR")   label="ERROR";   color="$RED" ;;
    *)         label="INFO" ;;
  esac

  printf '%s - [%s] %s\n' "$timestamp" "$label" "$message" >> "$LOG_FILE"

  if [[ -n "$color" && -t 1 ]]; then
    # %b expands the \033 escapes stored in the colour variables only.
    printf '%s - %b[%s]%b %s\n' "$timestamp" "$color" "$label" "$NC" "$message"
  else
    printf '%s - [%s] %s\n' "$timestamp" "$label" "$message"
  fi
}

# Succeed when the named service is running.
# systemd is asked first because unit names usually differ from process
# names (mysql -> mysqld, postgresql -> postgres, redis -> redis-server,
# ssh -> sshd), so an exact process-name match on the service name reports
# running services as down. pgrep remains as a fallback for hosts without
# systemd or for daemons that are not managed as units.
_service_is_running() {
  local name="$1"
  if command -v systemctl >/dev/null 2>&1 \
    && systemctl is-active --quiet "$name" 2>/dev/null; then
    return 0
  fi
  pgrep -x "$name" >/dev/null 2>&1
}

# Check every entry of SERVICES; log the result and e-mail an alert for each
# service that is not running.
# Globals: SERVICES, ALERT_EMAIL (read)
check_services() {
  log_status "INFO" "=== CHECKING SERVICES ==="

  local service
  for service in "${SERVICES[@]}"; do
    if _service_is_running "$service"; then
      log_status "OK" "Service $service is running"
    else
      log_status "ERROR" "Service $service is not running"
      echo "Service $service is down on $(hostname)" | mail -s "[ALERT] Service Down" "$ALERT_EMAIL"
    fi
  done
}

# Report whether each port in PORTS has a listening TCP/UDP socket.
#
# The socket table is captured once and reused for every port. netstat is
# used when installed; otherwise ss (iproute2) is used, since net-tools is
# not present on every system. Both print the local address as
# "<addr>:<port>" followed by whitespace, which is what the match relies on.
#
# Globals: PORTS (read)
# Returns: 1 when neither netstat nor ss is available
check_ports() {
  log_status "INFO" "=== CHECKING PORTS ==="

  local listing port
  if command -v netstat >/dev/null 2>&1; then
    listing=$(netstat -tuln)
  elif command -v ss >/dev/null 2>&1; then
    listing=$(ss -tuln)
  else
    log_status "ERROR" "Neither netstat nor ss is available; cannot check ports"
    return 1
  fi

  for port in "${PORTS[@]}"; do
    # The trailing space stops port 80 from matching 8080.
    if grep -qF -- ":${port} " <<<"$listing"; then
      log_status "OK" "Port $port is open"
    else
      log_status "WARNING" "Port $port is not listening"
    fi
  done
}

# Probe every URL in WEBSITES and alert when one does not answer properly.
#
# Any 2xx or 3xx status counts as "responding": a redirect (for example
# http -> https) means the server is up, and treating it as an outage
# produces false alerts. Everything else, including curl's "000" when no
# HTTP response was received, is reported as an error and e-mailed.
#
# Globals: WEBSITES, ALERT_EMAIL (read)
check_websites() {
  log_status "INFO" "=== CHECKING WEBSITES ==="

  local website response
  for website in "${WEBSITES[@]}"; do
    response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$website")

    case "$response" in
      2[0-9][0-9] | 3[0-9][0-9])
        log_status "OK" "Website $website is responding (HTTP $response)"
        ;;
      *)
        log_status "ERROR" "Website $website is not responding (HTTP $response)"
        echo "Website $website is down (HTTP $response) on $(hostname)" | mail -s "[ALERT] Website Down" "$ALERT_EMAIL"
        ;;
    esac
  done
}

# Look for kernel-reported problems and for read-only filesystems.
# Globals: LOG_FILE (read)
check_filesystem() {
  log_status "INFO" "=== CHECKING FILESYSTEM ==="

  # Kernel ring buffer: the last five lines mentioning error/fail/corrupt.
  # Captured once so the reported lines are the ones that triggered the
  # warning.
  local kernel_issues
  kernel_issues=$(dmesg | grep -iE 'error|fail|corrupt' | tail -n 5)
  if [[ -n "$kernel_issues" ]]; then
    log_status "WARNING" "Found filesystem errors in dmesg"
    printf '%s\n' "$kernel_issues" | tee -a "$LOG_FILE"
  else
    log_status "OK" "No filesystem errors found"
  fi

  # Read-only check. Only device-backed mounts are considered, and the "ro"
  # flag must be the first mount option: a bare search for "ro," also
  # matches "errors=remount-ro," on healthy read-write filesystems and
  # pseudo-filesystems that are mounted read-only by design. Image formats
  # that are inherently read-only (squashfs, iso9660, udf) are ignored.
  local readonly_mounts
  readonly_mounts=$(mount | awk '
    /^\/dev\// && !/ type (squashfs|iso9660|udf) / && / \(ro[,)]/')
  if [[ -n "$readonly_mounts" ]]; then
    log_status "ERROR" "Found read-only filesystems"
    printf '%s\n' "$readonly_mounts" | tee -a "$LOG_FILE"
  else
    log_status "OK" "All filesystems are writable"
  fi
}

# Warn when the journal holds more than 10 error-level lines from the last
# hour.
#
# The journal is queried once and the result reused for both the count and
# the excerpt. -q suppresses journalctl's informational lines (such as
# "-- No entries --"), which would otherwise be counted as errors;
# --no-pager keeps it non-interactive.
#
# Globals: LOG_FILE (read)
check_system_logs() {
  log_status "INFO" "=== CHECKING SYSTEM LOGS ==="

  local entries error_count=0
  entries=$(journalctl --no-pager -q --since "1 hour ago" --priority=err)
  if [[ -n "$entries" ]]; then
    error_count=$(grep -c '' <<<"$entries")
  fi

  if (( error_count > 10 )); then
    log_status "WARNING" "Found $error_count errors in system logs (last 1 hour)"
    tail -n 10 <<<"$entries" | tee -a "$LOG_FILE"
  else
    log_status "OK" "System logs look normal ($error_count errors in last hour)"
  fi
}

# Report failed password attempts (last hour) and root sessions (last day).
#
# The failed-login lines are collected once and reused for both the count
# and the excerpt, instead of scanning the journal twice.
#
# Globals: LOG_FILE (read)
check_security() {
  log_status "INFO" "=== CHECKING SECURITY ==="

  # Failed login attempts
  local failed_entries failed_logins=0
  failed_entries=$(journalctl --no-pager --since "1 hour ago" | grep "Failed password")
  if [[ -n "$failed_entries" ]]; then
    failed_logins=$(grep -c '' <<<"$failed_entries")
  fi

  if (( failed_logins > 20 )); then
    log_status "WARNING" "High number of failed login attempts: $failed_logins (last 1 hour)"
    tail -n 10 <<<"$failed_entries" | tee -a "$LOG_FILE"
  else
    log_status "OK" "Failed login attempts: $failed_logins (last 1 hour)"
  fi

  # Root logins. grep -c prints 0 itself when nothing matches.
  local root_logins
  root_logins=$(journalctl --no-pager --since "1 day ago" | grep -c "session opened for user root")
  if (( root_logins > 0 )); then
    log_status "WARNING" "Root login detected: $root_logins times (last 24 hours)"
  else
    log_status "OK" "No root login in last 24 hours"
  fi
}

# Log a one-screen overview: host, date, uptime, load, memory and the usage
# of the root filesystem.
generate_summary() {
  local load_part mem_part disk_part

  log_status "INFO" "=== HEALTH CHECK SUMMARY ==="
  log_status "INFO" "Server: $(hostname)"
  log_status "INFO" "Date: $(date)"
  log_status "INFO" "Uptime: $(uptime -p)"

  load_part=$(uptime | awk -F'load average:' '{print $2}')
  log_status "INFO" "Load: ${load_part}"

  mem_part=$(free -h | awk '/Mem/ {print $3"/"$2}')
  log_status "INFO" "Memory: ${mem_part}"

  disk_part=$(df -h / | tail -n 1 | awk '{print $3"/"$2" ("$5" used)"}')
  log_status "INFO" "Disk: ${disk_part}"
}

# Entry point: print the summary, then run every check in a fixed order.
main() {
  local step

  log_status "INFO" "Starting server health check"
  for step in \
    generate_summary \
    check_services \
    check_ports \
    check_websites \
    check_filesystem \
    check_system_logs \
    check_security; do
    "$step"
  done
  log_status "INFO" "Health check completed"
}

main "$@"

📈 3. Script Performance Monitoring

Continuous Performance Monitoring

#!/bin/bash
# performance_monitor.sh - Continuous performance monitoring.

# Configuration (each value can be overridden from the environment)
MONITOR_INTERVAL="${MONITOR_INTERVAL:-60}" # Seconds between samples
LOG_DIR="${LOG_DIR:-/var/log/performance}"
DATA_RETENTION_DAYS="${DATA_RETENTION_DAYS:-30}"
# Placeholder recipient: replace with a real mailbox before enabling alerts.
ALERT_EMAIL="${ALERT_EMAIL:-admin@example.com}"

# Create the log directory
mkdir -p "$LOG_DIR"

# Per-day log files. The date is read once so that all five names agree even
# when the script starts right at midnight.
LOG_DATE="$(date +%Y%m%d)"
CPU_LOG="$LOG_DIR/cpu_${LOG_DATE}.log"
MEM_LOG="$LOG_DIR/memory_${LOG_DATE}.log"
DISK_LOG="$LOG_DIR/disk_${LOG_DATE}.log"
NET_LOG="$LOG_DIR/network_${LOG_DATE}.log"
PROC_LOG="$LOG_DIR/processes_${LOG_DATE}.log"

# Append one CSV row to $CPU_LOG: timestamp,cpu_percent,load1,load5,load15
#
# cpu_percent is 100 minus the idle percentage reported by top (one decimal),
# i.e. all non-idle time. The idle value is located by its "id" label rather
# than by field position, because top fuses the label and the first number
# when that number has three digits ("%Cpu(s):100.0 us"). LC_ALL=C keeps
# decimal separators as dots; decimal commas would corrupt the CSV.
#
# Globals: CPU_LOG (read)
collect_cpu_metrics() {
  local timestamp cpu_usage load_avg
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  cpu_usage=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ && match($0, /[0-9.]+[ %]*id/) {
      idle = substr($0, RSTART, RLENGTH)
      sub(/[ %]*id$/, "", idle)
      printf "%.1f\n", 100 - idle
      exit
    }')
  load_avg=$(LC_ALL=C uptime | awk -F'load averages?: *' '{print $2}' | tr -d ' ')

  echo "$timestamp,$cpu_usage,$load_avg" >> "$CPU_LOG"
}

# Append one CSV row to $MEM_LOG:
#   timestamp,total,used,free,<6th column of free's Mem row>,usage_percent
# Values come from the Mem row of `free -m`; usage_percent is the integer
# used*100/total.
# Globals: MEM_LOG (read)
collect_memory_metrics() {
  local now row_label total_mb used_mb free_mb skipped col6 rest pct
  now=$(date '+%Y-%m-%d %H:%M:%S')

  # Split the Mem row into its columns in a single read.
  read -r row_label total_mb used_mb free_mb skipped col6 rest < <(free -m | grep Mem)
  pct=$(( used_mb * 100 / total_mb ))

  printf '%s\n' "${now},${total_mb},${used_mb},${free_mb},${col6},${pct}" >> "$MEM_LOG"
}

# Append disk usage rows to $DISK_LOG and, when iostat is installed, I/O
# rows to ${DISK_LOG}.io.
#
#   $DISK_LOG       timestamp,mountpoint,size,used,avail,use_percent
#   ${DISK_LOG}.io  timestamp,device,r/s,w/s,rkB/s,wkB/s,%util
#
# Globals: DISK_LOG (read)
collect_disk_metrics() {
  local timestamp filesystem size used avail percent mountpoint
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Disk usage of device-backed mount points. -P keeps one line per
  # filesystem even when the device name is long.
  while read -r filesystem size used avail percent mountpoint; do
    echo "$timestamp,$mountpoint,$size,$used,$avail,${percent%\%}" >> "$DISK_LOG"
  done < <(df -hP | grep -E '^/dev/')

  # Disk I/O statistics (only if iostat is available)
  if command -v iostat >/dev/null 2>&1; then
    # Columns are looked up by name from the "Device" header row because
    # their order differs between sysstat releases; fixed positions pick the
    # wrong values on current versions. -d limits output to the device
    # report, so the avg-cpu section is never mistaken for a device. Rows
    # are skipped entirely if an expected column is missing.
    LC_ALL=C iostat -dx 1 1 | awk -v ts="$timestamp" '
      $1 ~ /^Device/ {
        for (i = 1; i <= NF; i++) col[$i] = i
        ok = ("r/s" in col) && ("w/s" in col) && ("rkB/s" in col) \
          && ("wkB/s" in col) && ("%util" in col)
        next
      }
      ok && NF >= col["%util"] {
        printf "%s,%s,%s,%s,%s,%s,%s\n", ts, $1, $(col["r/s"]), $(col["w/s"]), \
          $(col["rkB/s"]), $(col["wkB/s"]), $(col["%util"])
      }' >> "${DISK_LOG}.io"
  fi
}

# Append network rows to $NET_LOG:
#   timestamp,<interface>,rx_bytes,tx_bytes          one per interface
#   timestamp,connections,total,established,time_wait
#
# Globals: NET_LOG (read)
collect_network_metrics() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Interface counters. Only lines of the form "<name>: <counters>" are
  # interface rows; the two header lines have no colon and are skipped (the
  # second header, " face |bytes ...", would otherwise be logged as an
  # interface). After the colon, field 1 is received bytes and field 9 is
  # transmitted bytes.
  if [[ -r /proc/net/dev ]]; then
    awk -v ts="$timestamp" '
      {
        line = $0
        sub(/^[ \t]+/, "", line)
        sep = index(line, ":")
        if (sep == 0) next
        split(substr(line, sep + 1), counters, " ")
        printf "%s,%s,%s,%s\n", ts, substr(line, 1, sep - 1), counters[1], counters[9]
      }' /proc/net/dev >> "$NET_LOG"
  fi

  # Connection counts, taken from a single netstat snapshot so the three
  # numbers are consistent with each other. "total" counts every output
  # line, including netstat's header lines.
  local total_conn established time_wait
  read -r total_conn established time_wait < <(
    netstat -an | awk '
      { total++ }
      /ESTABLISHED/ { est++ }
      /TIME_WAIT/ { tw++ }
      END { print total + 0, est + 0, tw + 0 }'
  )

  echo "$timestamp,connections,$total_conn,$established,$time_wait" >> "$NET_LOG"
}

# Append process rows to $PROC_LOG:
#   timestamp,cpu,user,pid,%cpu,%mem,command      top 5 by CPU
#   timestamp,memory,user,pid,%cpu,%mem,command   top 5 by memory
#   timestamp,summary,total,running,zombie
#
# Globals: PROC_LOG (read)
collect_process_metrics() {
  local timestamp sort_key label
  local user pid cpu mem vsz rss tty stat start time command
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  for sort_key in cpu mem; do
    if [[ "$sort_key" == "cpu" ]]; then label="cpu"; else label="memory"; fi

    # Drop the header first, then keep up to five rows; taking the first six
    # lines and then the last five lets the header through whenever fewer
    # than six lines are returned.
    while read -r user pid cpu mem vsz rss tty stat start time command; do
      echo "$timestamp,$label,$user,$pid,$cpu,$mem,$command" >> "$PROC_LOG"
    done < <(ps aux --sort="-%${sort_key}" | tail -n +2 | head -n 5)
  done

  # Process counts by state, classified by the first letter of the STAT
  # column. Real state values carry suffixes (R+, Rs, Z+ ...), so searching
  # the full listing for " R " or " Z " misses most of them and can also
  # match text inside command lines. "stat=" prints the column without a
  # header, so the total no longer includes the header row.
  local total_processes running_processes zombie_processes
  read -r total_processes running_processes zombie_processes < <(
    ps -eo stat= | awk '
      { total++ }
      /^R/ { running++ }
      /^Z/ { zombie++ }
      END { print total + 0, running + 0, zombie + 0 }'
  )

  echo "$timestamp,summary,$total_processes,$running_processes,$zombie_processes" >> "$PROC_LOG"
}

# Delete monitoring files older than DATA_RETENTION_DAYS from LOG_DIR.
#
# Besides *.log this also covers the iostat logs (*.log.io) and the daily
# reports (daily_report_*.txt); a "*.log"-only pattern leaves those behind
# to accumulate indefinitely.
#
# Globals: LOG_DIR, DATA_RETENTION_DAYS (read)
cleanup_old_logs() {
  # ${LOG_DIR:?} aborts instead of letting find run on an empty path.
  find "${LOG_DIR:?LOG_DIR must be set}" -type f \
    \( -name '*.log' -o -name '*.log.io' -o -name 'daily_report_*.txt' \) \
    -mtime +"$DATA_RETENTION_DAYS" -delete
  echo "$(date '+%Y-%m-%d %H:%M:%S') - Cleaned up logs older than $DATA_RETENTION_DAYS days"
}

# Print the average or maximum of one numeric CSV column as "NN.NN%".
# Prints "n/a" when the file has no rows, instead of dividing by zero.
# Arguments: $1 - CSV file, $2 - 1-based column number, $3 - "avg" or "max"
_report_column_stat() {
  awk -F',' -v col="$2" -v mode="$3" '
    {
      value = $col + 0
      sum += value
      rows++
      if (rows == 1 || value > max) max = value
    }
    END {
      if (rows == 0) { printf "n/a"; exit }
      printf "%.2f%%", (mode == "max") ? max : sum / rows
    }' "$1"
}

# Write a plain-text summary of the collected metrics to
# $LOG_DIR/daily_report_<YYYYMMDD>.txt and print the path of that file.
# Globals: LOG_DIR, CPU_LOG, MEM_LOG, PROC_LOG (read)
generate_daily_report() {
  local report_file
  report_file="$LOG_DIR/daily_report_$(date +%Y%m%d).txt"

  {
    echo "=== DAILY PERFORMANCE REPORT ==="
    echo "Date: $(date)"
    echo "Server: $(hostname)"
    echo ""

    echo "=== CPU STATISTICS ==="
    if [ -f "$CPU_LOG" ]; then
      # Column 2 of the CPU log is the usage percentage.
      echo "Average CPU Usage: $(_report_column_stat "$CPU_LOG" 2 avg)"
      echo "Max CPU Usage: $(_report_column_stat "$CPU_LOG" 2 max)"
    fi
    echo ""

    echo "=== MEMORY STATISTICS ==="
    if [ -f "$MEM_LOG" ]; then
      # Column 6 of the memory log is the usage percentage.
      echo "Average Memory Usage: $(_report_column_stat "$MEM_LOG" 6 avg)"
      echo "Max Memory Usage: $(_report_column_stat "$MEM_LOG" 6 max)"
    fi
    echo ""

    echo "=== DISK USAGE ==="
    df -h | grep -E '^/dev/'
    echo ""

    echo "=== TOP PROCESSES BY CPU ==="
    if [ -f "$PROC_LOG" ]; then
      grep ",cpu," "$PROC_LOG" | tail -10
    fi

  } > "$report_file"

  echo "Daily report generated: $report_file"
}

# Point the per-day log variables at the files for the given day.
# Arguments: $1 - day as YYYYMMDD
# Globals:   LOG_DIR (read); CPU_LOG, MEM_LOG, DISK_LOG, NET_LOG, PROC_LOG (written)
_set_log_paths() {
  local day="$1"
  CPU_LOG="$LOG_DIR/cpu_${day}.log"
  MEM_LOG="$LOG_DIR/memory_${day}.log"
  DISK_LOG="$LOG_DIR/disk_${day}.log"
  NET_LOG="$LOG_DIR/network_${day}.log"
  PROC_LOG="$LOG_DIR/processes_${day}.log"
}

# Main monitoring loop: collect every metric, sleep MONITOR_INTERVAL seconds,
# repeat until interrupted.
#
# Daily housekeeping is triggered by a change of calendar day rather than by
# the clock reading exactly 00:00. A sample cycle takes longer than the
# sleep interval, so the 00:00 minute can be skipped entirely, or hit
# several times when the interval is shorter than a minute. On rollover the
# old logs are pruned and the report is generated while the log variables
# still point at the day that just ended; only then do they switch to the
# new day's files. Without that switch a long-running monitor keeps writing
# to the files named after its start date.
#
# Globals: MONITOR_INTERVAL, LOG_DIR (read)
start_monitoring() {
  echo "Starting performance monitoring (interval: ${MONITOR_INTERVAL}s)"
  echo "Logs directory: $LOG_DIR"
  echo "Press Ctrl+C to stop"

  # Exit cleanly on Ctrl+C or SIGTERM
  trap 'echo "Stopping performance monitoring..."; exit 0' INT TERM

  local current_day today
  current_day=$(date +%Y%m%d)
  _set_log_paths "$current_day"

  while true; do
    today=$(date +%Y%m%d)
    if [[ "$today" != "$current_day" ]]; then
      cleanup_old_logs
      generate_daily_report
      _set_log_paths "$today"
      current_day="$today"
    fi

    collect_cpu_metrics
    collect_memory_metrics
    collect_disk_metrics
    collect_network_metrics
    collect_process_metrics

    sleep "$MONITOR_INTERVAL"
  done
}

# Redraw a live dashboard every two seconds until interrupted: time, uptime,
# CPU and load, memory, the first five device-backed filesystems, and the
# ps header plus the five processes using the most CPU.
show_realtime_stats() {
  local cpu_now load_now

  while :; do
    clear
    printf '%s\n' "=== REAL-TIME SYSTEM STATS ==="
    printf 'Time: %s\n' "$(date)"
    printf 'Uptime: %s\n' "$(uptime -p)"
    printf '\n'

    printf '%s\n' "=== CPU & LOAD ==="
    cpu_now=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
    printf 'CPU Usage: %s%%\n' "$cpu_now"
    load_now=$(uptime | awk -F'load average:' '{print $2}')
    printf 'Load Average: %s\n' "$load_now"
    printf '\n'

    printf '%s\n' "=== MEMORY ==="
    free -h
    printf '\n'

    printf '%s\n' "=== DISK USAGE ==="
    df -h | grep -E '^/dev/' | head -n 5
    printf '\n'

    printf '%s\n' "=== TOP PROCESSES ==="
    ps aux --sort=-%cpu | head -n 6
    printf '\n'

    printf '%s\n' "Press Ctrl+C to exit"
    sleep 2
  done
}

# Command dispatch: the first argument selects the mode and defaults to
# "monitor". Any unrecognised mode prints the usage text and exits with
# status 1.
case "${1:-monitor}" in
  monitor)  start_monitoring ;;
  realtime) show_realtime_stats ;;
  report)   generate_daily_report ;;
  cleanup)  cleanup_old_logs ;;
  *)
    cat <<EOF
Usage: $0 {monitor|realtime|report|cleanup}
 monitor - Start continuous monitoring (default)
 realtime - Show real-time stats
 report - Generate daily report
 cleanup - Clean up old logs
EOF
    exit 1
    ;;
esac

⚙️ 4. Cấu Hình và Tự Động Hóa

Cron Jobs cho Monitoring

# Mở crontab editor
crontab -e

# Thêm các dòng sau:

# System monitoring mỗi 5 phút
*/5 * * * * /path/to/system_monitor.sh

# Health check mỗi 15 phút
*/15 * * * * /path/to/server_health_check.sh

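# Performance monitoring (chạy liên tục) - chỉ dùng dòng này nếu KHÔNG dùng systemd service bên dưới, nếu không sẽ có hai tiến trình monitor cùng ghi vào một bộ log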
@reboot /path/to/performance_monitor.sh monitor &

# Daily report lúc 6:00 AM
0 6 * * * /path/to/performance_monitor.sh report

# Weekly cleanup vào Chủ nhật lúc 2:00 AM
0 2 * * 0 /path/to/performance_monitor.sh cleanup

Systemd Service cho Monitoring

# /etc/systemd/system/performance-monitor.service
[Unit]
Description=Performance Monitoring Service
After=network.target

[Service]
Type=simple
User=monitor
Group=monitor
ExecStart=/opt/scripts/performance_monitor.sh monitor
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
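
# Kích hoạt service (các lệnh dưới đây chạy trong terminal, không thuộc nội dung unit file)
sudo systemctl daemon-reload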
sudo systemctl enable performance-monitor.service
sudo systemctl start performance-monitor.service
sudo systemctl status performance-monitor.service

📧 5. Cấu Hình Email Alerts

Cài đặt Mail Server

# Ubuntu/Debian
sudo apt-get install mailutils postfix

# CentOS/RHEL
sudo yum install mailx postfix

# Cấu hình postfix cho relay qua Gmail
sudo nano /etc/postfix/main.cf

Email Configuration

# /etc/postfix/main.cf
relayhost = [smtp.gmail.com]:587
smtp_use_tls = yes
smtp_sasl_auth_enable = yes
smtp_sasl_password_maps = hash:/etc/postfix/sasl_passwd
smtp_sasl_security_options = noanonymous
smtp_tls_CAfile = /etc/ssl/certs/ca-certificates.crt

# /etc/postfix/sasl_passwd
[smtp.gmail.com]:587 your-email@gmail.com:your-app-password

# Bảo mật file password
sudo postmap /etc/postfix/sasl_passwd
sudo chmod 600 /etc/postfix/sasl_passwd*
sudo systemctl restart postfix

🔧 6. Troubleshooting

Các lỗi thường gặp:

  1. Permission denied:

    sudo chmod +x /path/to/script.sh
    sudo chown -R monitor:monitor /var/log/performance/
  2. Mail command not found:

    # Ubuntu/Debian
    sudo apt-get install mailutils
    # CentOS/RHEL
    sudo yum install mailx
  3. High CPU usage từ monitoring scripts:

    # Tăng interval giữa các lần check
    MONITOR_INTERVAL=300 # 5 phút thay vì 1 phút
  4. Log files quá lớn:

    # Thêm log rotation
    sudo nano /etc/logrotate.d/monitoring

Log Rotation Configuration

# /etc/logrotate.d/monitoring
/var/log/system_monitor.log
/var/log/health_check.log
/var/log/performance/*.log {
daily
rotate 30
compress
delaycompress
missingok
notifempty
create 644 monitor monitor
postrotate
systemctl reload rsyslog > /dev/null 2>&1 || true
endscript
}

Lưu ý quan trọng:

  • Test tất cả scripts trên môi trường dev trước
  • Cấu hình email alerts phù hợp với hạ tầng
  • Monitor performance của chính các monitoring scripts
  • Thiết lập log rotation để tránh disk full
  • Backup cấu hình monitoring thường xuyên