<?php
require_once __DIR__ . '/../config/database.php';
require_once __DIR__ . '/helper.php';

// Listing page that fetch_article_list() downloads to discover article links.
const SOURCE_URL = 'https://onefootball.com/en/competition/premier-league-9';

/**
 * Collect the article URLs linked from the listing page.
 *
 * Downloads SOURCE_URL through http_get() and extracts the href of every
 * <a> tag whose target starts with "/en/news/". Each path is
 * HTML-entity-decoded, prefixed with the site origin and de-duplicated,
 * keeping first-seen order.
 *
 * @return array List of absolute article URLs (strings); empty when the
 *               page could not be read or contains no matching links.
 */
function fetch_article_list() {
    $html = http_get(SOURCE_URL);
    if (!is_string($html) || $html === '') {
        return [];
    }

    // Only the opening tag is inspected. Requiring the anchor body and the
    // closing </a> on one line dropped every link whose inner markup contained
    // a newline, although only the href is needed here.
    // - (?=\s) keeps "<a" from matching tags such as <article> or <abbr>.
    // - \shref= avoids attributes that merely end in "href" (e.g. data-href).
    // - (?|...) is a branch-reset group: the path is always capture group 1,
    //   whether the attribute uses double or single quotes.
    $pattern = '#<a(?=\s)[^>]*?\shref=(?|"(/en/news/[^"]+)"|\'(/en/news/[^\']+)\')#i';

    $articles = [];
    if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $m) {
            $url = 'https://onefootball.com' . html_entity_decode($m[1]);
            // Keyed by URL so repeated links collapse to a single entry.
            $articles[$url] = $url;
        }
    }
    return array_values($articles);
}

/**
 * Download one article page and extract its title, body and lead image.
 *
 * - Title: text of the first <h1>, tags stripped and trimmed.
 * - Content: inner HTML of the first <article>, or of the first
 *   <div data-testid="article"> when no <article> exists. Only
 *   <p>, <strong>, <em>, <ul>, <ol>, <li> and <br> tags are kept and every
 *   run of whitespace is collapsed to a single space.
 * - Image: the og:image meta value, HTML-entity-decoded; null when absent.
 *
 * @param string $url Absolute URL of the article page.
 * @return array Associative array with keys 'title' (string),
 *               'content' (string) and 'img_url' (string|null).
 * @throws Exception When no title or no content could be extracted.
 */
function fetch_article_detail($url) {
    $html = http_get($url);
    if (!is_string($html)) {
        // A failed download is treated like an empty page, so it ends in the
        // same "could not extract" exception below.
        $html = '';
    }

    $title = null;
    if (preg_match('#<h1[^>]*>(.*?)</h1>#is', $html, $m)) {
        $title = trim(strip_tags($m[1]));
    }

    $contentHtml = '';
    if (preg_match('#<article[^>]*>(.*?)</article>#is', $html, $m)) {
        $contentHtml = $m[1];
    } elseif (preg_match('#<div[^>]+data-testid=\"article\"[^>]*>(.*?)</div>#is', $html, $m)) {
        // NOTE: the lazy match stops at the first closing </div>, so a
        // container with nested <div>s yields only its leading part.
        $contentHtml = $m[1];
    }

    $content = strip_tags($contentHtml, '<p><strong><em><ul><ol><li><br>');
    // preg_replace() returns null on a PCRE error; the cast keeps trim() on a string.
    $content = trim((string) preg_replace('#\s+#', ' ', $content));

    // Scan each <meta> tag on its own so the og:image value is found no matter
    // which of the property/content attributes comes first or how it is quoted.
    $img = null;
    if (preg_match_all('#<meta\s[^>]*>#i', $html, $metaTags)) {
        foreach ($metaTags[0] as $tag) {
            if (preg_match('#\sproperty=(["\'])og:image\1#i', $tag)
                && preg_match('#\scontent=(?|"([^"]+)"|\'([^\']+)\')#i', $tag, $mm)) {
                // Attribute values are HTML-encoded ("&amp;" inside query strings).
                $img = html_entity_decode($mm[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
                break;
            }
        }
    }

    // Explicit emptiness checks: a plain truthiness test would also reject
    // the literal string "0".
    if ($title === null || $title === '' || $content === '') {
        throw new Exception('Gagal mengambil detail artikel dari: ' . $url);
    }

    return [
        'title' => $title,
        'content' => $content,
        'img_url' => $img,
    ];
}

/**
 * Store an article unless a row with the same source URL already exists.
 *
 * Looks the URL up in the `articles` table through the global $pdo handle.
 * When no row is found, a new one is inserted with published_at = NOW() and
 * status 'draft'. A one-line progress message is echoed in both cases.
 *
 * @param string      $url     Source URL, used as the duplicate key.
 * @param string      $title   Article title.
 * @param string      $content Article body.
 * @param string|null $imgUrl  Lead image URL, if any.
 * @return bool True when a new row was inserted, false when it was skipped.
 */
function save_article_if_new($url, $title, $content, $imgUrl) {
    global $pdo;

    $lookup = $pdo->prepare("SELECT id FROM articles WHERE source_url = ?");
    $lookup->execute([$url]);

    if (!$lookup->fetch()) {
        // Unknown URL: store it as a draft stamped with the current time.
        $insert = $pdo->prepare("INSERT INTO articles (source_url, title, content, img_url, published_at, status) VALUES (?, ?, ?, ?, NOW(), 'draft')");
        $insert->execute([$url, $title, $content, $imgUrl]);

        echo "Simpan artikel baru: {$title}\n";
        return true;
    }

    echo "Skip (sudah ada): {$url}\n";
    return false;
}

/**
 * Scrape the listing page and store up to $max articles not yet saved.
 *
 * URLs already present in `articles.source_url` are skipped before their
 * page is downloaded, so repeated runs do not re-fetch stored articles.
 * A failure on one article is reported and does not stop the batch.
 * Progress is echoed line by line, followed by a final total.
 *
 * @param int $max Maximum number of newly saved articles for this run.
 * @return void
 */
function process_scrape_batch($max = 5) {
    global $pdo;

    $list = fetch_article_list();
    $count = 0;

    foreach ($list as $url) {
        if ($count >= $max) break;

        try {
            // Cheap duplicate check first: avoids one HTTP request per
            // already-stored article on every run.
            $known = $pdo->prepare("SELECT id FROM articles WHERE source_url = ?");
            $known->execute([$url]);
            if ($known->fetch()) {
                echo "Skip (sudah ada): {$url}\n";
                continue;
            }

            echo "Ambil artikel: {$url}\n";
            $detail = fetch_article_detail($url);
            // Only successful inserts count towards $max.
            if (save_article_if_new($url, $detail['title'], $detail['content'], $detail['img_url'])) {
                $count++;
            }
        } catch (Exception $e) {
            echo "GAGAL ambil artikel {$url}: " . $e->getMessage() . "\n";
        }
    }

    echo "Total artikel baru tersimpan: {$count}\n";
}
