# Source: CloudOps/platform/modules/backups.py — 648 lines, 27 KiB, Python
# modules/backups.py
import os
import glob
import subprocess
import json
import hashlib
import tarfile
import re
from config import (
RUNNING_ON_MAIN_SERVER,
MAIN_SERVER_IP, MAIN_SERVER_USER, MAIN_SERVER_KEY, MAIN_SERVER_PORT,
VM_HOST, VM_PORT, VM_KEY, VM_USER,
)
def _run(cmd, timeout=20):
try:
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return r.stdout.strip(), r.stderr.strip()
except Exception as e:
return '', str(e)
def _human_bytes(n):
"""Human-readable byte size for audit UI."""
n = int(n)
if n < 1024:
return f'{n} B'
if n < 1024 ** 2:
return f'{n / 1024:.1f} KB'
if n < 1024 ** 3:
return f'{n / (1024 ** 2):.1f} MB'
if n < 1024 ** 4:
return f'{n / (1024 ** 3):.2f} GB'
return f'{n / (1024 ** 4):.2f} TB'
def _ssh_main(remote_cmd, timeout=20):
    """Run a shell command on the main server.

    Executes locally when this process already runs on the main server,
    otherwise wraps the command in an ssh invocation built from config.
    """
    if RUNNING_ON_MAIN_SERVER:
        return _run(remote_cmd, timeout=timeout)
    ssh_prefix = ' '.join([
        'ssh', '-i', str(MAIN_SERVER_KEY), '-p', str(MAIN_SERVER_PORT),
        '-o', 'StrictHostKeyChecking=no', '-o', 'ConnectTimeout=10',
        f'{MAIN_SERVER_USER}@{MAIN_SERVER_IP}',
    ])
    return _run(f"{ssh_prefix} '{remote_cmd}'", timeout=timeout)
# ────────────────────────────────────────────────────────────────
# BACKUPS
# ────────────────────────────────────────────────────────────────
def get_local_backups():
    """Return up to 20 newest backup archive basenames from the main server."""
    listing, _ = _ssh_main(
        "ls -t /root/backups/myapps-backup-*.tar.gz 2>/dev/null | head -20"
    )
    if not listing:
        return []
    return [
        os.path.basename(entry.strip())
        for entry in listing.split('\n')
        if entry.strip()
    ]
def get_vm_backups():
    """Return up to 20 newest main-server backup archive names held on the VM."""
    if not RUNNING_ON_MAIN_SERVER:
        # We ARE the VM: just read the local backup directory.
        backup_dir = '/backups/main-server'
        if not os.path.exists(backup_dir):
            return []
        paths = glob.glob(f'{backup_dir}/myapps-backup-*.tar.gz')
        paths.sort(key=os.path.getmtime, reverse=True)
        return [os.path.basename(p) for p in paths[:20]]
    # On the main server: list the VM's copy over SSH.
    names = []
    try:
        remote_ls = (
            f"ssh -i {VM_KEY} -p {VM_PORT} "
            f"-o StrictHostKeyChecking=no -o ConnectTimeout=10 "
            f"{VM_USER}@{VM_HOST} "
            f"'ls -t /backups/main-server/myapps-backup-*.tar.gz 2>/dev/null | head -20'"
        )
        listing, _ = _run(remote_ls, timeout=25)
        for entry in (listing or '').split('\n'):
            entry = entry.strip()
            if entry and '.tar.gz' in entry:
                names.append(os.path.basename(entry))
    except Exception as e:
        print(f"[backups] VM backup fetch error: {e}")
    return names
# ────────────────────────────────────────────────────────────────
# BACKUP HEALTH AUDIT
# ────────────────────────────────────────────────────────────────
def audit_backup(backup_file, source='local'):
    """
    Perform a health and integrity audit on a backup archive.

    Checks:
      0. Filename matches the expected backup naming pattern (security)
      1. File exists
      2. File size sanity
      3. SHA256 checksum (if .sha256 sidecar exists)
      4. tar archive integrity (gzip test only — portable, no conflicting flags)
      5. Expected internal structure
      6. Path traversal / suspicious paths
      7. Suspicious script files at unexpected locations (scripts only, not binaries)
      8. Volume count

    Args:
        backup_file: bare archive filename, e.g. 'myapps-backup-20240101_120000.tar.gz'.
        source: 'local' → /root/backups on the main server,
                anything else → /backups/main-server (the VM copy).

    Returns:
        dict: { ok, score, checks, summary, backup_file, file_size_bytes,
                file_size_display, health_tier, health_label }
    """
    checks = []

    def add(name, status, detail='', more=None):
        # Append one check result; 'more' is an optional list of extra detail lines.
        entry = {'name': name, 'status': status, 'detail': detail}
        if more:
            entry['more'] = more
        checks.append(entry)

    # ── CHECK 0: Filename sanity ─────────────────────────────────────────────
    # backup_file is interpolated into filesystem paths and (below) an scp
    # shell command. Enforce the same strict pattern delete_backup() uses so
    # crafted names cannot traverse paths or inject shell syntax.
    if not re.match(r'^myapps-backup-\d{8}_\d{6}\.tar\.gz$', backup_file):
        return {
            'ok': False, 'score': 0,
            'backup_file': backup_file,
            'file_size_bytes': None,
            'file_size_display': None,
            'health_tier': 'critical',
            'health_label': 'Unhealthy',
            'checks': [{'name': 'Filename', 'status': 'fail',
                        'detail': f'Invalid backup filename: {backup_file}'}],
            'summary': 'Backup filename does not match the expected pattern.',
        }

    # ── Resolve archive path ─────────────────────────────────────────────────
    if source == 'local':
        archive_path = f"/root/backups/{backup_file}"
    else:
        archive_path = f"/backups/main-server/{backup_file}"

    # On VM auditing a "local" (main server) backup → pull to /tmp first
    if not RUNNING_ON_MAIN_SERVER and source == 'local':
        tmp_path = f"/tmp/audit_{backup_file}"
        if not os.path.exists(tmp_path):
            pull_cmd = (
                f"scp -i {MAIN_SERVER_KEY} -P {MAIN_SERVER_PORT} "
                f"-o StrictHostKeyChecking=no -o ConnectTimeout=15 "
                f"{MAIN_SERVER_USER}@{MAIN_SERVER_IP}:/root/backups/{backup_file} "
                f"{tmp_path}"
            )
            out, err = _run(pull_cmd, timeout=120)
            if not os.path.exists(tmp_path):
                return {
                    'ok': False, 'score': 0,
                    'backup_file': backup_file,
                    'file_size_bytes': None,
                    'file_size_display': None,
                    'health_tier': 'critical',
                    'health_label': 'Unhealthy',
                    'checks': [{'name': 'File Access', 'status': 'fail',
                                'detail': f'Could not pull from main server: {err}'}],
                    'summary': 'Cannot access backup file from this host.'
                }
        archive_path = tmp_path

    # ── CHECK 1: File exists ─────────────────────────────────────────────────
    if not os.path.exists(archive_path):
        add('File Exists', 'fail', f'Not found: {archive_path}')
        return {
            'ok': False, 'score': 0, 'checks': checks,
            'backup_file': backup_file,
            'file_size_bytes': None,
            'file_size_display': None,
            'health_tier': 'critical',
            'health_label': 'Unhealthy',
            'summary': 'Backup file does not exist on disk.',
        }
    add('File Exists', 'pass', archive_path)

    # ── CHECK 2: File size ───────────────────────────────────────────────────
    size_bytes = os.path.getsize(archive_path)
    size_mb = size_bytes / (1024 * 1024)
    size_human = _human_bytes(size_bytes)
    size_more = [
        f'Exact size: {size_bytes:,} bytes ({size_human})',
        'We flag archives under 1 MB as corrupt and under ~50 MB as unusually small for a full stack backup.',
    ]
    if size_bytes < 1024 * 1024:
        add('File Size', 'fail',
            f'{size_human} — suspiciously tiny, likely corrupt', more=size_more)
    elif size_mb < 50:
        add('File Size', 'warn',
            f'{size_human} — smaller than expected (typical full backup > 50 MB)', more=size_more)
    else:
        add('File Size', 'pass',
            f'{size_human} — within expected range', more=size_more)

    # ── CHECK 3: SHA256 checksum ─────────────────────────────────────────────
    sha_file = archive_path + '.sha256'
    if os.path.exists(sha_file):
        try:
            with open(sha_file, 'r') as f:
                expected_hash = f.read().split()[0].strip()
            actual_hash = _sha256_file(archive_path)
            if actual_hash == expected_hash:
                add('Checksum (SHA256)', 'pass', f'Hash verified — {actual_hash[:20]}')
            else:
                add('Checksum (SHA256)', 'fail',
                    f'MISMATCH — expected {expected_hash[:20]}… got {actual_hash[:20]}')
        except Exception as e:
            add('Checksum (SHA256)', 'warn', f'Could not verify: {e}')
    else:
        add('Checksum (SHA256)', 'warn',
            'No .sha256 sidecar found — run a new backup to get checksums')

    # ── CHECK 4: Archive integrity ───────────────────────────────────────────
    # gzip --test works everywhere without conflicting tar flags.
    try:
        result = subprocess.run(
            ['gzip', '--test', archive_path],
            capture_output=True, text=True, timeout=120
        )
        if result.returncode == 0:
            add('Archive Integrity', 'pass', 'gzip test passed — archive is not corrupted', more=[
                'Runs gzip --test on the .tar.gz so the compressed stream is readable end-to-end.',
            ])
        else:
            add('Archive Integrity', 'fail',
                f'gzip test failed: {(result.stderr or result.stdout)[:200]}')
    except FileNotFoundError:
        # gzip binary missing — fall back to Python's gzip module.
        try:
            import gzip
            with gzip.open(archive_path, 'rb') as f:
                # Read the first MB to confirm the stream decodes.
                f.read(1024 * 1024)
            add('Archive Integrity', 'pass', 'gzip header valid')
        except Exception as e:
            add('Archive Integrity', 'fail', f'Archive appears corrupt: {e}')
    except subprocess.TimeoutExpired:
        add('Archive Integrity', 'warn', 'Integrity check timed out — file is large, probably OK')
    except Exception as e:
        add('Archive Integrity', 'warn', f'Could not test: {e}')

    # ── Read archive members once (used by checks 5, 6, 7, 8) ────────────────
    # Previously the tarball was opened twice (getnames, then getmembers);
    # one pass yields both the names and the mode bits check 7 needs.
    tar_members = []
    try:
        with tarfile.open(archive_path, 'r:gz') as tf:
            tar_members = tf.getmembers()
    except Exception:
        pass
    members = [m.name for m in tar_members]
    # Single "volume archive" predicate. (Check 5 previously matched
    # '/volumes/' while check 8 matched 'volumes/' — the two counts could
    # disagree; unified here.)
    vol_archives = [m for m in members if 'volumes/' in m and m.endswith('.tar.gz')]

    # ── CHECK 5: Internal structure ──────────────────────────────────────────
    if members:
        has_volumes = any('volumes/' in m for m in members)
        has_info = any('backup-info.txt' in m for m in members)
        has_compose = any('compose-files/' in m for m in members)
        issues = []
        if not has_volumes:
            issues.append('volumes/ missing')
        if not has_info:
            issues.append('backup-info.txt missing')
        if not issues:
            detail = 'volumes/ ✓ backup-info.txt ✓'
            if has_compose:
                detail += ' compose-files/ ✓'
            detail += f' ({len(vol_archives)} volume archives)'
            add('Internal Structure', 'pass', detail)
        else:
            add('Internal Structure', 'fail', ' · '.join(issues))
    else:
        add('Internal Structure', 'warn', 'Could not inspect archive members')

    # ── CHECK 6: Path traversal / suspicious paths ───────────────────────────
    SUSPICIOUS = [
        (r'\.\./', 'path traversal (..)'),
        (r'^/', 'absolute path in archive'),
        (r'/etc/passwd', '/etc/passwd reference'),
        (r'/etc/shadow', '/etc/shadow reference'),
        (r'\.ssh/', '.ssh directory reference'),
        (r'id_rsa(?!\.pub)', 'private SSH key reference'),
        (r'authorized_keys', 'authorized_keys reference'),
    ]
    found_suspicious = []
    for m in members:
        for pat, label in SUSPICIOUS:
            if re.search(pat, m):
                found_suspicious.append(f'{m} ({label})')
                break
    if found_suspicious:
        add('Security Scan', 'fail',
            f'Suspicious entries found: {found_suspicious[:3]}')
    else:
        add('Security Scan', 'pass', 'No path traversal or dangerous entries detected', more=[
            'Member paths are checked for .. segments, absolute roots, and sensitive paths '
            '(e.g. .ssh, /etc/shadow).',
        ])

    # ── CHECK 7: Suspicious scripts (scripts only, not data files) ───────────
    # Only flag text script files (.sh .py .pl .rb …) with execute bits placed
    # outside compose-files/ and known vendor directories. Binary files
    # (.bin, .so, .exe) are deliberately ignored — too many false positives.
    SCRIPT_EXTENSIONS = ('.sh', '.py', '.pl', '.rb', '.bash', '.zsh')
    SAFE_PREFIXES = (
        'compose-files/',
        'volumes/',
        'container-configs/',
        'configs/',
    )
    suspicious_scripts = []
    for member in tar_members:
        if not member.isfile():
            continue
        name = member.name
        # Skip files inside known-safe directories (at any nesting level).
        if any(name.startswith(p) or f'/{p}' in name for p in SAFE_PREFIXES):
            continue
        name_lower = name.lower()
        has_script_ext = any(name_lower.endswith(ext) for ext in SCRIPT_EXTENSIONS)
        has_exec_bit = bool(member.mode & 0o111)
        if has_script_ext and has_exec_bit:
            suspicious_scripts.append(os.path.basename(name))
    if suspicious_scripts:
        add('Executable Scripts', 'warn',
            f'Scripts with execute bit outside expected dirs: {suspicious_scripts[:3]}')
    else:
        add('Executable Scripts', 'pass', 'No unexpected executable scripts found')

    # ── CHECK 8: Volume count ────────────────────────────────────────────────
    v = len(vol_archives)
    if v == 0:
        add('Volume Count', 'fail', 'No volume archives found in backup')
    elif v < 5:
        add('Volume Count', 'warn', f'Only {v} volumes (expected ≥5 for a full backup)')
    else:
        add('Volume Count', 'pass', f'{v} volume archives present')

    # ── Score ────────────────────────────────────────────────────────────────
    # pass=10, warn=5, fail=0 → percentage of the maximum possible points.
    weights = {'pass': 10, 'warn': 5, 'fail': 0}
    total = len(checks) * 10
    earned = sum(weights.get(c['status'], 0) for c in checks)
    score = int((earned / total) * 100) if total > 0 else 0
    has_fails = any(c['status'] == 'fail' for c in checks)
    ok = not has_fails and score >= 60
    if score >= 90:
        summary = 'Backup looks healthy and is safe to restore.'
    elif score >= 70:
        summary = 'Minor warnings — likely safe, but review before restoring.'
    elif score >= 40:
        summary = 'Significant issues detected — restore with caution.'
    else:
        summary = 'Multiple checks failed — do NOT restore without manual inspection.'
    has_warns = any(c['status'] == 'warn' for c in checks)
    # UI badge tier/label; any hard failure pins the tier to critical.
    if has_fails:
        health_tier = 'critical'
        health_label = 'Unhealthy'
    elif score == 100:
        health_tier = 'excellent'
        health_label = '100% healthy'
    elif score >= 90:
        health_tier = 'good'
        health_label = 'Healthy' if not has_warns else 'Healthy (with notes)'
    elif score >= 70:
        health_tier = 'fair'
        health_label = 'Mostly healthy'
    elif score >= 40:
        health_tier = 'poor'
        health_label = 'At risk'
    else:
        health_tier = 'critical'
        health_label = 'Unhealthy'
    return {
        'ok': ok,
        'score': score,
        'checks': checks,
        'summary': summary,
        'backup_file': backup_file,
        'file_size_bytes': size_bytes,
        'file_size_display': size_human,
        'health_tier': health_tier,
        'health_label': health_label,
    }
def _sha256_file(path):
h = hashlib.sha256()
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(65536), b''):
h.update(chunk)
return h.hexdigest()
# ────────────────────────────────────────────────────────────────
# DELETE BACKUP
# ────────────────────────────────────────────────────────────────
def delete_backup(backup_file, source='local'):
    """Delete one backup archive (and its .sha256 sidecar) from a host.

    Args:
        backup_file: must match the strict myapps-backup timestamp pattern.
        source: 'local' (main server) or 'vm' (VM copy).

    Returns:
        (success: bool, message: str)
    """
    if re.match(r'^myapps-backup-\d{8}_\d{6}\.tar\.gz$', backup_file) is None:
        return False, f'Invalid backup filename: {backup_file}'

    def _remove_pair(path):
        # Remove the archive and, if present, its checksum sidecar.
        if not os.path.exists(path):
            return False, f'File not found: {path}'
        os.remove(path)
        sidecar = path + '.sha256'
        if os.path.exists(sidecar):
            os.remove(sidecar)
        return True, None

    if source == 'local':
        if RUNNING_ON_MAIN_SERVER:
            ok, problem = _remove_pair(f"/root/backups/{backup_file}")
            if not ok:
                return False, problem
        else:
            out, err = _ssh_main(
                f"rm -f /root/backups/{backup_file} /root/backups/{backup_file}.sha256"
            )
            if err and 'No such file' not in err:
                return False, f'Remote delete error: {err}'
        return True, f'Deleted {backup_file} from main server'

    if source == 'vm':
        if not RUNNING_ON_MAIN_SERVER:
            ok, problem = _remove_pair(f"/backups/main-server/{backup_file}")
            if not ok:
                return False, problem
        else:
            out, err = _run(
                f"ssh -i {VM_KEY} -p {VM_PORT} "
                f"-o StrictHostKeyChecking=no -o ConnectTimeout=10 "
                f"{VM_USER}@{VM_HOST} "
                f"'rm -f /backups/main-server/{backup_file} "
                f"/backups/main-server/{backup_file}.sha256'",
                timeout=30,
            )
            if err and 'No such file' not in err:
                return False, f'VM delete error: {err}'
        return True, f'Deleted {backup_file} from VM'

    return False, 'Unknown source'
# ────────────────────────────────────────────────────────────────
# BACKUP STATUS LOG
# ────────────────────────────────────────────────────────────────
def get_backup_log_entries(limit=20):
    """Read the last `limit` lines of the backup status log, newest first.

    Each log line is pipe-delimited: timestamp|status|name|message.
    Missing fields come back as empty strings.
    """
    raw, _ = _ssh_main(
        f"tail -n {limit} /root/backups/backup-status.log 2>/dev/null || echo ''"
    )
    if not raw:
        return []
    fields = ('timestamp', 'status', 'name', 'message')
    parsed = []
    for row in raw.strip().split('\n'):
        if not row.strip():
            continue
        cols = [c.strip() for c in row.split('|')]
        cols += [''] * (len(fields) - len(cols))  # pad short rows
        parsed.append(dict(zip(fields, cols)))
    parsed.reverse()  # newest entry first
    return parsed
def get_backup_script_path():
    """Return the backup script's path on the main server, or None if absent."""
    for candidate in ('/root/backup-myapps.sh',):
        exists, _ = _ssh_main(f"[ -f {candidate} ] && echo yes || echo no")
        if exists.strip() == 'yes':
            return candidate
    return None
# ────────────────────────────────────────────────────────────────
# CONTAINERS
# ────────────────────────────────────────────────────────────────
def _parse_containers(raw, owner='root'):
containers = []
if raw:
for line in raw.split('\n'):
if '|' not in line:
continue
parts = line.split('|')
containers.append({
'name': parts[0].strip(),
'status': parts[1].strip() if len(parts) > 1 else '',
'image': parts[2].strip() if len(parts) > 2 else '',
'ports': parts[3].strip() if len(parts) > 3 else '',
'owner': owner,
})
return containers
def get_containers():
    """List app-stack containers (frappe/nextcloud/mautic/n8n/odoo) on the main server."""
    cmd = (
        "docker ps -a --format '{{.Names}}|{{.Status}}|{{.Image}}|{{.Ports}}' 2>/dev/null | "
        "grep -E 'frappe|nextcloud|mautic|n8n|odoo'"
    )
    out, _ = _ssh_main(cmd)
    return _parse_containers(out)
def get_all_root_containers():
    """List every container visible to root's docker daemon on the main server."""
    out, _ = _ssh_main(
        "docker ps -a --format '{{.Names}}|{{.Status}}|{{.Image}}|{{.Ports}}' 2>/dev/null"
    )
    return _parse_containers(out)
def get_rootless_user_containers_remote():
    """Discover rootless Docker sockets on the main server and list each user's containers.

    Scans /run/user/*/docker.sock, resolves each UID to a username, then
    queries that user's daemon. The owning username is attached to each entry.
    """
    socks_out, _ = _ssh_main("ls /run/user/*/docker.sock 2>/dev/null")
    if not socks_out:
        return []
    found = []
    for sock in (s.strip() for s in socks_out.split('\n')):
        if not sock:
            continue
        try:
            uid = sock.split('/run/user/')[1].split('/')[0]
        except (IndexError, ValueError):
            continue
        whoami, _ = _ssh_main(f"getent passwd {uid} | cut -d: -f1")
        owner = whoami.strip() or f"uid{uid}"
        listing, _ = _ssh_main(
            f"DOCKER_HOST=unix://{sock} "
            f"docker ps -a --format '{{{{.Names}}}}|{{{{.Status}}}}|{{{{.Image}}}}|{{{{.Ports}}}}' 2>/dev/null"
        )
        found.extend(_parse_containers(listing, owner=owner))
    return found
# ────────────────────────────────────────────────────────────────
# CONTAINER ACTIONS
# ────────────────────────────────────────────────────────────────
def container_action(container_name, action):
    """Start, stop, or restart a container on the main server.

    Args:
        container_name: untrusted container name (validated below).
        action: one of 'start', 'stop', 'restart'.

    Returns:
        (accepted: bool, output: str). `accepted` is False only when
        validation fails; docker's own errors come back in `output`.
    """
    if action not in ('start', 'stop', 'restart'):
        return False, "Invalid action"
    # container_name is interpolated into a shell command. The previous
    # strip-characters blocklist ("; | \"") was bypassable via $(…),
    # backticks, && or spaces — whitelist Docker's container-name alphabet
    # instead and reject anything else outright.
    if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9_.-]*$', container_name):
        return False, "Invalid container name"
    stdout, stderr = _ssh_main(f"docker {action} {container_name} 2>&1", timeout=30)
    return True, (stdout + stderr).strip()
def get_container_status(container_name):
    """Return a coarse status ('running' / 'stopped' / 'unknown') for one container.

    The raw docker state string is included under 'raw' for callers that
    want the exact daemon-reported value.
    """
    safe_name = container_name.replace('"', '').replace(';', '').replace('|', '')
    out, _ = _ssh_main(
        f"docker inspect --format='{{{{.State.Status}}}}' {safe_name} 2>/dev/null"
    )
    raw = out.strip().lower()
    coarse_map = {
        'running': 'running', 'restarting': 'running',
        'exited': 'stopped', 'stopped': 'stopped',
        'dead': 'stopped', 'paused': 'stopped',
    }
    return {
        'name': container_name,
        'status': coarse_map.get(raw, 'unknown'),
        'raw': raw,
    }
# ────────────────────────────────────────────────────────────────
# STATS
# ────────────────────────────────────────────────────────────────
def get_container_stats_remote():
    """Snapshot `docker stats` for root's daemon, keyed by container name."""
    out, _ = _ssh_main(
        "docker stats --no-stream --format "
        "'{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.MemPerc}}|{{.NetIO}}|{{.BlockIO}}' 2>/dev/null",
        timeout=35
    )
    snapshot = {}
    for row in (out or '').split('\n'):
        cols = row.split('|')
        if len(cols) < 6:
            continue  # blank line or malformed row
        snapshot[cols[0].strip()] = {
            'cpu': cols[1].strip(),
            'mem': cols[2].strip(),
            'mem_pct': cols[3].strip(),
            'net': cols[4].strip(),
            'block': cols[5].strip(),
        }
    return snapshot
def get_all_stats():
    """Merge container stats from root's daemon and every rootless docker socket.

    Rootless entries overwrite root entries on a name collision (unchanged
    behavior). The pipe-delimited parsing was previously duplicated inline
    from get_container_stats_remote(); it now lives in one local helper.
    """
    def _parse_stats(raw):
        # One docker-stats row: name|cpu|mem|mem%|net|block
        parsed = {}
        for line in raw.split('\n'):
            parts = line.split('|')
            if len(parts) < 6:
                continue
            parsed[parts[0].strip()] = {
                'cpu': parts[1].strip(),
                'mem': parts[2].strip(),
                'mem_pct': parts[3].strip(),
                'net': parts[4].strip(),
                'block': parts[5].strip(),
            }
        return parsed

    all_stats = get_container_stats_remote()
    socks_out, _ = _ssh_main("ls /run/user/*/docker.sock 2>/dev/null")
    if not socks_out:
        return all_stats
    for sock in socks_out.split('\n'):
        sock = sock.strip()
        if not sock:
            continue
        stdout, _ = _ssh_main(
            f"DOCKER_HOST=unix://{sock} "
            f"docker stats --no-stream --format "
            f"'{{{{.Name}}}}|{{{{.CPUPerc}}}}|{{{{.MemUsage}}}}|{{{{.MemPerc}}}}|{{{{.NetIO}}}}|{{{{.BlockIO}}}}' 2>/dev/null",
            timeout=35
        )
        if stdout:
            all_stats.update(_parse_stats(stdout))
    return all_stats
# ────────────────────────────────────────────────────────────────
# SYSTEM INFO
# ────────────────────────────────────────────────────────────────
def get_system_info():
    """Collect host metrics from the main server for the dashboard.

    Returns a dict of display-ready strings: CPU %, memory use, disk use,
    load averages, uptime, docker version, and this process's hostname.
    """
    # (key, remote command, fallback when the probe returns nothing)
    remote_probes = (
        ('cpu_pct', "top -bn1 | grep 'Cpu(s)' | awk '{print $2+$4}'", '0'),
        ('memory', "free -m | awk 'NR==2{printf \"%s/%sMB\", $3, $2}'", 'N/A'),
        ('mem_pct', "free | awk 'NR==2{printf \"%.0f\", $3/$2*100}'", '0'),
        ('disk', "df -h / | awk 'NR==2{printf \"%s/%s\", $3, $2}'", 'N/A'),
        ('disk_pct', "df / | awk 'NR==2{print $5}' | tr -d '%'", '0'),
        ('load', "cat /proc/loadavg | awk '{print $1, $2, $3}'", 'N/A'),
        ('uptime', "uptime -p", 'N/A'),
        ('docker_v', "docker --version | cut -d' ' -f3 | tr -d ','", 'N/A'),
    )
    info = {}
    for key, cmd, fallback in remote_probes:
        out, _ = _ssh_main(cmd)
        info[key] = out or fallback
    # hostname is taken from THIS host, not the main server (original behavior).
    local_host, _ = _run("hostname -f 2>/dev/null || hostname")
    info['hostname'] = local_host or 'main server'
    return info