# Source: CloudOps/platform/modules/backups.py — 648 lines, 27 KiB, Python
# modules/backups.py
import os
import glob
import subprocess
import json
import hashlib
import tarfile
import re
from config import (
RUNNING_ON_MAIN_SERVER,
MAIN_SERVER_IP, MAIN_SERVER_USER, MAIN_SERVER_KEY, MAIN_SERVER_PORT,
VM_HOST, VM_PORT, VM_KEY, VM_USER,
)
def _run(cmd, timeout=20):
try:
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return r.stdout.strip(), r.stderr.strip()
except Exception as e:
return '', str(e)
def _human_bytes(n):
"""Human-readable byte size for audit UI."""
n = int(n)
if n < 1024:
return f'{n} B'
if n < 1024 ** 2:
return f'{n / 1024:.1f} KB'
if n < 1024 ** 3:
return f'{n / (1024 ** 2):.1f} MB'
if n < 1024 ** 4:
return f'{n / (1024 ** 3):.2f} GB'
return f'{n / (1024 ** 4):.2f} TB'
def _ssh_main(remote_cmd, timeout=20):
    """Run a shell command on the main server.

    Executes locally when this process already runs on the main server,
    otherwise wraps the command in an ssh invocation built from config.
    """
    if RUNNING_ON_MAIN_SERVER:
        return _run(remote_cmd, timeout=timeout)
    ssh_prefix = ' '.join([
        'ssh', '-i', str(MAIN_SERVER_KEY), '-p', str(MAIN_SERVER_PORT),
        '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=10',
        f'{MAIN_SERVER_USER}@{MAIN_SERVER_IP}',
    ])
    return _run(f"{ssh_prefix} '{remote_cmd}'", timeout=timeout)
# ────────────────────────────────────────────────────────────────
# BACKUPS
# ────────────────────────────────────────────────────────────────
def get_local_backups():
    """Return up to 20 newest backup archive basenames from the main server."""
    listing, _ = _ssh_main(
        "ls -t /root/backups/myapps-backup-*.tar.gz 2>/dev/null | head -20"
    )
    if not listing:
        return []
    return [
        os.path.basename(entry.strip())
        for entry in listing.split('\n')
        if entry.strip()
    ]
def get_vm_backups():
    """Return up to 20 newest main-server backup archive names held on the VM."""
    if not RUNNING_ON_MAIN_SERVER:
        # We ARE the VM: just read the local backup directory.
        backup_dir = '/backups/main-server'
        if not os.path.exists(backup_dir):
            return []
        paths = glob.glob(f'{backup_dir}/myapps-backup-*.tar.gz')
        paths.sort(key=os.path.getmtime, reverse=True)
        return [os.path.basename(p) for p in paths[:20]]
    # On the main server: list the VM's copy over SSH.
    names = []
    try:
        remote_ls = (
            f"ssh -i {VM_KEY} -p {VM_PORT} "
            f"-o StrictHostKeyChecking=no -o ConnectTimeout=10 "
            f"{VM_USER}@{VM_HOST} "
            f"'ls -t /backups/main-server/myapps-backup-*.tar.gz 2>/dev/null | head -20'"
        )
        listing, _ = _run(remote_ls, timeout=25)
        for entry in (listing or '').split('\n'):
            entry = entry.strip()
            if entry and '.tar.gz' in entry:
                names.append(os.path.basename(entry))
    except Exception as e:
        print(f"[backups] VM backup fetch error: {e}")
    return names
# ────────────────────────────────────────────────────────────────
# BACKUP HEALTH AUDIT
# ────────────────────────────────────────────────────────────────
def audit_backup(backup_file, source='local'):
    """
    Perform a health and integrity audit on a backup archive.

    Checks:
      0. Filename matches the expected backup naming pattern (security)
      1. File exists
      2. File size sanity
      3. SHA256 checksum (if .sha256 sidecar exists)
      4. tar archive integrity (gzip test only — portable, no conflicting flags)
      5. Expected internal structure
      6. Path traversal / suspicious paths
      7. Suspicious script files at unexpected locations (scripts only, not binaries)
      8. Volume count

    Args:
        backup_file: bare archive filename, e.g. 'myapps-backup-20240101_120000.tar.gz'.
        source: 'local' → /root/backups on the main server,
                anything else → /backups/main-server (the VM copy).

    Returns:
        dict: { ok, score, checks, summary, backup_file, file_size_bytes,
                file_size_display, health_tier, health_label }
    """
    checks = []

    def add(name, status, detail='', more=None):
        # Append one check result; 'more' is an optional list of extra detail lines.
        entry = {'name': name, 'status': status, 'detail': detail}
        if more:
            entry['more'] = more
        checks.append(entry)

    # ── CHECK 0: Filename sanity ─────────────────────────────────────────────
    # backup_file is interpolated into filesystem paths and (below) an scp
    # shell command. Enforce the same strict pattern delete_backup() uses so
    # crafted names cannot traverse paths or inject shell syntax.
    if not re.match(r'^myapps-backup-\d{8}_\d{6}\.tar\.gz$', backup_file):
        return {
            'ok': False, 'score': 0,
            'backup_file': backup_file,
            'file_size_bytes': None,
            'file_size_display': None,
            'health_tier': 'critical',
            'health_label': 'Unhealthy',
            'checks': [{'name': 'Filename', 'status': 'fail',
                        'detail': f'Invalid backup filename: {backup_file}'}],
            'summary': 'Backup filename does not match the expected pattern.',
        }

    # ── Resolve archive path ─────────────────────────────────────────────────
    if source == 'local':
        archive_path = f"/root/backups/{backup_file}"
    else:
        archive_path = f"/backups/main-server/{backup_file}"

    # On VM auditing a "local" (main server) backup → pull to /tmp first
    if not RUNNING_ON_MAIN_SERVER and source == 'local':
        tmp_path = f"/tmp/audit_{backup_file}"
        if not os.path.exists(tmp_path):
            pull_cmd = (
                f"scp -i {MAIN_SERVER_KEY} -P {MAIN_SERVER_PORT} "
                f"-o StrictHostKeyChecking=no -o ConnectTimeout=15 "
                f"{MAIN_SERVER_USER}@{MAIN_SERVER_IP}:/root/backups/{backup_file} "
                f"{tmp_path}"
            )
            out, err = _run(pull_cmd, timeout=120)
            if not os.path.exists(tmp_path):
                return {
                    'ok': False, 'score': 0,
                    'backup_file': backup_file,
                    'file_size_bytes': None,
                    'file_size_display': None,
                    'health_tier': 'critical',
                    'health_label': 'Unhealthy',
                    'checks': [{'name': 'File Access', 'status': 'fail',
                                'detail': f'Could not pull from main server: {err}'}],
                    'summary': 'Cannot access backup file from this host.'
                }
        archive_path = tmp_path

    # ── CHECK 1: File exists ─────────────────────────────────────────────────
    if not os.path.exists(archive_path):
        add('File Exists', 'fail', f'Not found: {archive_path}')
        return {
            'ok': False, 'score': 0, 'checks': checks,
            'backup_file': backup_file,
            'file_size_bytes': None,
            'file_size_display': None,
            'health_tier': 'critical',
            'health_label': 'Unhealthy',
            'summary': 'Backup file does not exist on disk.',
        }
    add('File Exists', 'pass', archive_path)

    # ── CHECK 2: File size ───────────────────────────────────────────────────
    size_bytes = os.path.getsize(archive_path)
    size_mb = size_bytes / (1024 * 1024)
    size_human = _human_bytes(size_bytes)
    size_more = [
        f'Exact size: {size_bytes:,} bytes ({size_human})',
        'We flag archives under 1 MB as corrupt and under ~50 MB as unusually small for a full stack backup.',
    ]
    if size_bytes < 1024 * 1024:
        add('File Size', 'fail',
            f'{size_human} — suspiciously tiny, likely corrupt', more=size_more)
    elif size_mb < 50:
        add('File Size', 'warn',
            f'{size_human} — smaller than expected (typical full backup > 50 MB)', more=size_more)
    else:
        add('File Size', 'pass',
            f'{size_human} — within expected range', more=size_more)

    # ── CHECK 3: SHA256 checksum ─────────────────────────────────────────────
    sha_file = archive_path + '.sha256'
    if os.path.exists(sha_file):
        try:
            with open(sha_file, 'r') as f:
                expected_hash = f.read().split()[0].strip()
            actual_hash = _sha256_file(archive_path)
            if actual_hash == expected_hash:
                add('Checksum (SHA256)', 'pass', f'Hash verified — {actual_hash[:20]}')
            else:
                add('Checksum (SHA256)', 'fail',
                    f'MISMATCH — expected {expected_hash[:20]}… got {actual_hash[:20]}')
        except Exception as e:
            add('Checksum (SHA256)', 'warn', f'Could not verify: {e}')
    else:
        add('Checksum (SHA256)', 'warn',
            'No .sha256 sidecar found — run a new backup to get checksums')

    # ── CHECK 4: Archive integrity ───────────────────────────────────────────
    # gzip --test works everywhere without conflicting tar flags.
    try:
        result = subprocess.run(
            ['gzip', '--test', archive_path],
            capture_output=True, text=True, timeout=120
        )
        if result.returncode == 0:
            add('Archive Integrity', 'pass', 'gzip test passed — archive is not corrupted', more=[
                'Runs gzip --test on the .tar.gz so the compressed stream is readable end-to-end.',
            ])
        else:
            add('Archive Integrity', 'fail',
                f'gzip test failed: {(result.stderr or result.stdout)[:200]}')
    except FileNotFoundError:
        # gzip binary missing — fall back to Python's gzip module.
        try:
            import gzip
            with gzip.open(archive_path, 'rb') as f:
                # Read the first MB to confirm the stream decodes.
                f.read(1024 * 1024)
            add('Archive Integrity', 'pass', 'gzip header valid')
        except Exception as e:
            add('Archive Integrity', 'fail', f'Archive appears corrupt: {e}')
    except subprocess.TimeoutExpired:
        add('Archive Integrity', 'warn', 'Integrity check timed out — file is large, probably OK')
    except Exception as e:
        add('Archive Integrity', 'warn', f'Could not test: {e}')

    # ── Read archive members once (used by checks 5, 6, 7, 8) ────────────────
    # Previously the tarball was opened twice (getnames, then getmembers);
    # one pass yields both the names and the mode bits check 7 needs.
    tar_members = []
    try:
        with tarfile.open(archive_path, 'r:gz') as tf:
            tar_members = tf.getmembers()
    except Exception:
        pass
    members = [m.name for m in tar_members]
    # Single "volume archive" predicate. (Check 5 previously matched
    # '/volumes/' while check 8 matched 'volumes/' — the two counts could
    # disagree; unified here.)
    vol_archives = [m for m in members if 'volumes/' in m and m.endswith('.tar.gz')]

    # ── CHECK 5: Internal structure ──────────────────────────────────────────
    if members:
        has_volumes = any('volumes/' in m for m in members)
        has_info = any('backup-info.txt' in m for m in members)
        has_compose = any('compose-files/' in m for m in members)
        issues = []
        if not has_volumes:
            issues.append('volumes/ missing')
        if not has_info:
            issues.append('backup-info.txt missing')
        if not issues:
            detail = 'volumes/ ✓ backup-info.txt ✓'
            if has_compose:
                detail += ' compose-files/ ✓'
            detail += f' ({len(vol_archives)} volume archives)'
            add('Internal Structure', 'pass', detail)
        else:
            add('Internal Structure', 'fail', ' · '.join(issues))
    else:
        add('Internal Structure', 'warn', 'Could not inspect archive members')

    # ── CHECK 6: Path traversal / suspicious paths ───────────────────────────
    SUSPICIOUS = [
        (r'\.\./', 'path traversal (..)'),
        (r'^/', 'absolute path in archive'),
        (r'/etc/passwd', '/etc/passwd reference'),
        (r'/etc/shadow', '/etc/shadow reference'),
        (r'\.ssh/', '.ssh directory reference'),
        (r'id_rsa(?!\.pub)', 'private SSH key reference'),
        (r'authorized_keys', 'authorized_keys reference'),
    ]
    found_suspicious = []
    for m in members:
        for pat, label in SUSPICIOUS:
            if re.search(pat, m):
                found_suspicious.append(f'{m} ({label})')
                break
    if found_suspicious:
        add('Security Scan', 'fail',
            f'Suspicious entries found: {found_suspicious[:3]}')
    else:
        add('Security Scan', 'pass', 'No path traversal or dangerous entries detected', more=[
            'Member paths are checked for .. segments, absolute roots, and sensitive paths '
            '(e.g. .ssh, /etc/shadow).',
        ])

    # ── CHECK 7: Suspicious scripts (scripts only, not data files) ───────────
    # Only flag text script files (.sh .py .pl .rb …) with execute bits placed
    # outside compose-files/ and known vendor directories. Binary files
    # (.bin, .so, .exe) are deliberately ignored — too many false positives.
    SCRIPT_EXTENSIONS = ('.sh', '.py', '.pl', '.rb', '.bash', '.zsh')
    SAFE_PREFIXES = (
        'compose-files/',
        'volumes/',
        'container-configs/',
        'configs/',
    )
    suspicious_scripts = []
    for member in tar_members:
        if not member.isfile():
            continue
        name = member.name
        # Skip files inside known-safe directories (at any nesting level).
        if any(name.startswith(p) or f'/{p}' in name for p in SAFE_PREFIXES):
            continue
        name_lower = name.lower()
        has_script_ext = any(name_lower.endswith(ext) for ext in SCRIPT_EXTENSIONS)
        has_exec_bit = bool(member.mode & 0o111)
        if has_script_ext and has_exec_bit:
            suspicious_scripts.append(os.path.basename(name))
    if suspicious_scripts:
        add('Executable Scripts', 'warn',
            f'Scripts with execute bit outside expected dirs: {suspicious_scripts[:3]}')
    else:
        add('Executable Scripts', 'pass', 'No unexpected executable scripts found')

    # ── CHECK 8: Volume count ────────────────────────────────────────────────
    v = len(vol_archives)
    if v == 0:
        add('Volume Count', 'fail', 'No volume archives found in backup')
    elif v < 5:
        add('Volume Count', 'warn', f'Only {v} volumes (expected ≥5 for a full backup)')
    else:
        add('Volume Count', 'pass', f'{v} volume archives present')

    # ── Score ────────────────────────────────────────────────────────────────
    # pass=10, warn=5, fail=0 → percentage of the maximum possible points.
    weights = {'pass': 10, 'warn': 5, 'fail': 0}
    total = len(checks) * 10
    earned = sum(weights.get(c['status'], 0) for c in checks)
    score = int((earned / total) * 100) if total > 0 else 0
    has_fails = any(c['status'] == 'fail' for c in checks)
    ok = not has_fails and score >= 60
    if score >= 90:
        summary = 'Backup looks healthy and is safe to restore.'
    elif score >= 70:
        summary = 'Minor warnings — likely safe, but review before restoring.'
    elif score >= 40:
        summary = 'Significant issues detected — restore with caution.'
    else:
        summary = 'Multiple checks failed — do NOT restore without manual inspection.'
    has_warns = any(c['status'] == 'warn' for c in checks)
    # UI badge tier/label; any hard failure pins the tier to critical.
    if has_fails:
        health_tier = 'critical'
        health_label = 'Unhealthy'
    elif score == 100:
        health_tier = 'excellent'
        health_label = '100% healthy'
    elif score >= 90:
        health_tier = 'good'
        health_label = 'Healthy' if not has_warns else 'Healthy (with notes)'
    elif score >= 70:
        health_tier = 'fair'
        health_label = 'Mostly healthy'
    elif score >= 40:
        health_tier = 'poor'
        health_label = 'At risk'
    else:
        health_tier = 'critical'
        health_label = 'Unhealthy'
    return {
        'ok': ok,
        'score': score,
        'checks': checks,
        'summary': summary,
        'backup_file': backup_file,
        'file_size_bytes': size_bytes,
        'file_size_display': size_human,
        'health_tier': health_tier,
        'health_label': health_label,
    }
def _sha256_file(path):
h = hashlib.sha256()
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(65536), b''):
h.update(chunk)
return h.hexdigest()
# ────────────────────────────────────────────────────────────────
# DELETE BACKUP
# ────────────────────────────────────────────────────────────────
def delete_backup(backup_file, source='local'):
    """Delete one backup archive (and its .sha256 sidecar) from a host.

    Args:
        backup_file: must match the strict myapps-backup timestamp pattern.
        source: 'local' (main server) or 'vm' (VM copy).

    Returns:
        (success: bool, message: str)
    """
    if re.match(r'^myapps-backup-\d{8}_\d{6}\.tar\.gz$', backup_file) is None:
        return False, f'Invalid backup filename: {backup_file}'

    def _remove_pair(path):
        # Remove the archive and, if present, its checksum sidecar.
        if not os.path.exists(path):
            return False, f'File not found: {path}'
        os.remove(path)
        sidecar = path + '.sha256'
        if os.path.exists(sidecar):
            os.remove(sidecar)
        return True, None

    if source == 'local':
        if RUNNING_ON_MAIN_SERVER:
            ok, problem = _remove_pair(f"/root/backups/{backup_file}")
            if not ok:
                return False, problem
        else:
            out, err = _ssh_main(
                f"rm -f /root/backups/{backup_file} /root/backups/{backup_file}.sha256"
            )
            if err and 'No such file' not in err:
                return False, f'Remote delete error: {err}'
        return True, f'Deleted {backup_file} from main server'

    if source == 'vm':
        if not RUNNING_ON_MAIN_SERVER:
            ok, problem = _remove_pair(f"/backups/main-server/{backup_file}")
            if not ok:
                return False, problem
        else:
            out, err = _run(
                f"ssh -i {VM_KEY} -p {VM_PORT} "
                f"-o StrictHostKeyChecking=no -o ConnectTimeout=10 "
                f"{VM_USER}@{VM_HOST} "
                f"'rm -f /backups/main-server/{backup_file} "
                f"/backups/main-server/{backup_file}.sha256'",
                timeout=30,
            )
            if err and 'No such file' not in err:
                return False, f'VM delete error: {err}'
        return True, f'Deleted {backup_file} from VM'

    return False, 'Unknown source'
# ────────────────────────────────────────────────────────────────
# BACKUP STATUS LOG
# ────────────────────────────────────────────────────────────────
def get_backup_log_entries(limit=20):
    """Read the last `limit` lines of the backup status log, newest first.

    Each log line is pipe-delimited: timestamp|status|name|message.
    Missing fields come back as empty strings.
    """
    raw, _ = _ssh_main(
        f"tail -n {limit} /root/backups/backup-status.log 2>/dev/null || echo ''"
    )
    if not raw:
        return []
    fields = ('timestamp', 'status', 'name', 'message')
    parsed = []
    for row in raw.strip().split('\n'):
        if not row.strip():
            continue
        cols = [c.strip() for c in row.split('|')]
        cols += [''] * (len(fields) - len(cols))  # pad short rows
        parsed.append(dict(zip(fields, cols)))
    parsed.reverse()  # newest entry first
    return parsed
def get_backup_script_path():
    """Return the backup script's path on the main server, or None if absent."""
    for candidate in ('/root/backup-myapps.sh',):
        exists, _ = _ssh_main(f"[ -f {candidate} ] && echo yes || echo no")
        if exists.strip() == 'yes':
            return candidate
    return None
# ────────────────────────────────────────────────────────────────
# CONTAINERS
# ────────────────────────────────────────────────────────────────
def _parse_containers(raw, owner='root'):
containers = []
if raw:
for line in raw.split('\n'):
if '|' not in line:
continue
parts = line.split('|')
containers.append({
'name': parts[0].strip(),
'status': parts[1].strip() if len(parts) > 1 else '',
'image': parts[2].strip() if len(parts) > 2 else '',
'ports': parts[3].strip() if len(parts) > 3 else '',
'owner': owner,
})
return containers
def get_containers():
    """List app-stack containers (frappe/nextcloud/mautic/n8n/odoo) on the main server."""
    cmd = (
        "docker ps -a --format '{{.Names}}|{{.Status}}|{{.Image}}|{{.Ports}}' 2>/dev/null | "
        "grep -E 'frappe|nextcloud|mautic|n8n|odoo'"
    )
    out, _ = _ssh_main(cmd)
    return _parse_containers(out)
def get_all_root_containers():
    """List every container visible to root's docker daemon on the main server."""
    out, _ = _ssh_main(
        "docker ps -a --format '{{.Names}}|{{.Status}}|{{.Image}}|{{.Ports}}' 2>/dev/null"
    )
    return _parse_containers(out)
def get_rootless_user_containers_remote():
    """Discover rootless Docker sockets on the main server and list each user's containers.

    Scans /run/user/*/docker.sock, resolves each UID to a username, then
    queries that user's daemon. The owning username is attached to each entry.
    """
    socks_out, _ = _ssh_main("ls /run/user/*/docker.sock 2>/dev/null")
    if not socks_out:
        return []
    found = []
    for sock in (s.strip() for s in socks_out.split('\n')):
        if not sock:
            continue
        try:
            uid = sock.split('/run/user/')[1].split('/')[0]
        except (IndexError, ValueError):
            continue
        whoami, _ = _ssh_main(f"getent passwd {uid} | cut -d: -f1")
        owner = whoami.strip() or f"uid{uid}"
        listing, _ = _ssh_main(
            f"DOCKER_HOST=unix://{sock} "
            f"docker ps -a --format '{{{{.Names}}}}|{{{{.Status}}}}|{{{{.Image}}}}|{{{{.Ports}}}}' 2>/dev/null"
        )
        found.extend(_parse_containers(listing, owner=owner))
    return found
# ────────────────────────────────────────────────────────────────
# CONTAINER ACTIONS
# ────────────────────────────────────────────────────────────────
def container_action(container_name, action):
    """Start, stop, or restart a container on the main server.

    Args:
        container_name: untrusted container name (validated below).
        action: one of 'start', 'stop', 'restart'.

    Returns:
        (accepted: bool, output: str). `accepted` is False only when
        validation fails; docker's own errors come back in `output`.
    """
    if action not in ('start', 'stop', 'restart'):
        return False, "Invalid action"
    # container_name is interpolated into a shell command. The previous
    # strip-characters blocklist ("; | \"") was bypassable via $(…),
    # backticks, && or spaces — whitelist Docker's container-name alphabet
    # instead and reject anything else outright.
    if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9_.-]*$', container_name):
        return False, "Invalid container name"
    stdout, stderr = _ssh_main(f"docker {action} {container_name} 2>&1", timeout=30)
    return True, (stdout + stderr).strip()
def get_container_status(container_name):
    """Return a coarse status ('running' / 'stopped' / 'unknown') for one container.

    The raw docker state string is included under 'raw' for callers that
    want the exact daemon-reported value.
    """
    safe_name = container_name.replace('"', '').replace(';', '').replace('|', '')
    out, _ = _ssh_main(
        f"docker inspect --format='{{{{.State.Status}}}}' {safe_name} 2>/dev/null"
    )
    raw = out.strip().lower()
    coarse_map = {
        'running': 'running', 'restarting': 'running',
        'exited': 'stopped', 'stopped': 'stopped',
        'dead': 'stopped', 'paused': 'stopped',
    }
    return {
        'name': container_name,
        'status': coarse_map.get(raw, 'unknown'),
        'raw': raw,
    }
# ────────────────────────────────────────────────────────────────
# STATS
# ────────────────────────────────────────────────────────────────
def get_container_stats_remote():
    """Snapshot `docker stats` for root's daemon, keyed by container name."""
    out, _ = _ssh_main(
        "docker stats --no-stream --format "
        "'{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.MemPerc}}|{{.NetIO}}|{{.BlockIO}}' 2>/dev/null",
        timeout=35
    )
    snapshot = {}
    for row in (out or '').split('\n'):
        cols = row.split('|')
        if len(cols) < 6:
            continue  # blank line or malformed row
        snapshot[cols[0].strip()] = {
            'cpu': cols[1].strip(),
            'mem': cols[2].strip(),
            'mem_pct': cols[3].strip(),
            'net': cols[4].strip(),
            'block': cols[5].strip(),
        }
    return snapshot
def get_all_stats():
    """Merge container stats from root's daemon and every rootless docker socket.

    Rootless entries overwrite root entries on a name collision (unchanged
    behavior). The pipe-delimited parsing was previously duplicated inline
    from get_container_stats_remote(); it now lives in one local helper.
    """
    def _parse_stats(raw):
        # One docker-stats row: name|cpu|mem|mem%|net|block
        parsed = {}
        for line in raw.split('\n'):
            parts = line.split('|')
            if len(parts) < 6:
                continue
            parsed[parts[0].strip()] = {
                'cpu': parts[1].strip(),
                'mem': parts[2].strip(),
                'mem_pct': parts[3].strip(),
                'net': parts[4].strip(),
                'block': parts[5].strip(),
            }
        return parsed

    all_stats = get_container_stats_remote()
    socks_out, _ = _ssh_main("ls /run/user/*/docker.sock 2>/dev/null")
    if not socks_out:
        return all_stats
    for sock in socks_out.split('\n'):
        sock = sock.strip()
        if not sock:
            continue
        stdout, _ = _ssh_main(
            f"DOCKER_HOST=unix://{sock} "
            f"docker stats --no-stream --format "
            f"'{{{{.Name}}}}|{{{{.CPUPerc}}}}|{{{{.MemUsage}}}}|{{{{.MemPerc}}}}|{{{{.NetIO}}}}|{{{{.BlockIO}}}}' 2>/dev/null",
            timeout=35
        )
        if stdout:
            all_stats.update(_parse_stats(stdout))
    return all_stats
# ────────────────────────────────────────────────────────────────
# SYSTEM INFO
# ────────────────────────────────────────────────────────────────
def get_system_info():
    """Collect host metrics from the main server for the dashboard.

    Returns a dict of display-ready strings: CPU %, memory use, disk use,
    load averages, uptime, docker version, and this process's hostname.
    """
    # (key, remote command, fallback when the probe returns nothing)
    remote_probes = (
        ('cpu_pct', "top -bn1 | grep 'Cpu(s)' | awk '{print $2+$4}'", '0'),
        ('memory', "free -m | awk 'NR==2{printf \"%s/%sMB\", $3, $2}'", 'N/A'),
        ('mem_pct', "free | awk 'NR==2{printf \"%.0f\", $3/$2*100}'", '0'),
        ('disk', "df -h / | awk 'NR==2{printf \"%s/%s\", $3, $2}'", 'N/A'),
        ('disk_pct', "df / | awk 'NR==2{print $5}' | tr -d '%'", '0'),
        ('load', "cat /proc/loadavg | awk '{print $1, $2, $3}'", 'N/A'),
        ('uptime', "uptime -p", 'N/A'),
        ('docker_v', "docker --version | cut -d' ' -f3 | tr -d ','", 'N/A'),
    )
    info = {}
    for key, cmd, fallback in remote_probes:
        out, _ = _ssh_main(cmd)
        info[key] = out or fallback
    # hostname is taken from THIS host, not the main server (original behavior).
    local_host, _ = _run("hostname -f 2>/dev/null || hostname")
    info['hostname'] = local_host or 'main server'
    return info