JM Shelf - Scraper

收藏/历史抓取 + URL 发现 + 用户名检测 — JM Shelf 推荐脚本的模块库,通过 @require 被主脚本引用。

此腳本不應該直接安裝,它是一個供其他腳本使用的函式庫。欲使用本函式庫,請在腳本 metadata 寫上: // @require https://update.sleazyfork.org/scripts/581107/1842606/JM%20Shelf%20-%20Scraper.js

您需要先安裝使用者腳本管理器擴展,如 Tampermonkey、Greasemonkey 或 Violentmonkey,之後才能安裝該腳本。

You will need to install an extension such as Tampermonkey to install this script.

您需要先安裝使用者腳本管理器擴充功能,如 Tampermonkey 或 Violentmonkey 後才能安裝該腳本。

您需要先安裝使用者腳本管理器擴充功能,如 Tampermonkey 或 Userscripts 後才能安裝該腳本。

你需要先安裝一款使用者腳本管理器擴展,比如 Tampermonkey,才能安裝此腳本

您需要先安裝使用者腳本管理器擴充功能後才能安裝該腳本。

(我已經安裝了使用者腳本管理器,讓我安裝!)

你需要先安裝一款使用者樣式管理器擴展,比如 Stylus,才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展,比如 Stylus,才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展,比如 Stylus,才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展後才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展後才能安裝此樣式

你需要先安裝一款使用者樣式管理器擴展後才能安裝此樣式

(我已經安裝了使用者樣式管理器,讓我安裝!)

// ==UserScript==
// @name         JM Shelf - Scraper
// @namespace    jmshelf-lib
// @version      1.0.0
// @author       Kesdi
// @description  收藏/历史抓取 + URL 发现 + 用户名检测 — JM Shelf 推荐脚本的模块库,通过 @require 被主脚本引用。
// @license      MIT
// ==/UserScript==
// 
// 此文件是 GreasyFork 库(library),不直接安装。
// 请安装主脚本: JM Shelf 给杂鱼的个性化推荐
//

// ═══ [9] SCRAPER ═══ — 收藏+历史抓取 + URL发现
  // ============================================================
  
  // 用户名检测
  /**
   * Detects the current user's name by scanning the page for profile links.
   *
   * Every anchor whose href contains `/user/` contributes the path segment
   * that follows `/user/`. Candidates are de-duplicated in document order,
   * logged, and the first one is returned.
   *
   * The segment stops at `/`, `?` or `#`, so a link such as
   * `/user/alice?tab=favorite` yields `alice` rather than
   * `alice?tab=favorite` (which would corrupt every URL later built from it).
   *
   * @returns {string} The first candidate username, or '' when none is found.
   */
  function detectUsername() {
    const USER_SEGMENT = /\/user\/([^/?#]+)/;
    const MAX_NAME_LENGTH = 30;

    const candidates = new Set();
    for (const link of document.querySelectorAll('a[href*="/user/"]')) {
      const name = (link.getAttribute('href') || '').match(USER_SEGMENT)?.[1];
      // 'user' is rejected explicitly, as are names of MAX_NAME_LENGTH characters or more.
      if (name && name !== 'user' && name.length < MAX_NAME_LENGTH) {
        candidates.add(name);
      }
    }

    const unique = [...candidates];
    LOG.info(`检测到用户: ${JSON.stringify(unique)}`);
    if (unique.length > 0) return unique[0];
    LOG.warn('未检测到用户名 — 请确认已登录18comic并在主页');
    return '';
  }

  /**
   * Reports whether the page looks like a logged-in session.
   *
   * The check is purely textual: the page counts as logged in unless the
   * body text contains the "login / register" prompt. A missing body is
   * treated as empty text, which therefore also counts as logged in.
   *
   * @returns {boolean} false when the login prompt is present, true otherwise.
   */
  function isLoggedIn() {
    const LOGIN_PROMPT = '會員登錄/註冊';
    const pageText = document.body?.textContent || '';
    return pageText.indexOf(LOGIN_PROMPT) === -1;
  }

  // URL发现
  /**
   * Finds the URL of the user's favorites page.
   *
   * A URL already stored in State is returned as-is. Otherwise a fixed list
   * of candidate paths is requested in order through `fetcher`; the first
   * response that contains an album link and no login modal marker wins, is
   * persisted via State, and is returned.
   *
   * @param {string} username - Name interpolated into the `/user/<name>` candidates.
   * @returns {Promise<string>} The favorites URL, or '' when no candidate qualifies.
   */
  async function discoverFavoritesUrl(username) {
    if (State.getFavoritesUrl()) {
      LOG.info(`已保存收藏URL: ${State.getFavoritesUrl()}`);
      return State.getFavoritesUrl();
    }

    LOG.info(`探测收藏页... 用户=${username}`);

    const userBase = `/user/${username}`;
    const userSuffixes = ['/favorite/albums', '/favorites', '/favorite', '/bookmark', '?tab=favorite', '?tab=album'];
    const candidatePaths = userSuffixes.map((suffix) => userBase + suffix).concat('/bookmark');

    for (const path of candidatePaths) {
      try {
        const url = `https://18comic.vip${path}`;
        const html = await fetcher.enqueue(url, null, 10);
        if (html === null) continue;

        const hasAlbums = html.includes('/album/');
        const hasLogin = html.includes('login-modal');
        LOG.info(`  ${path}: len=${html.length} albums=${hasAlbums} login=${hasLogin}`);
        if (!hasAlbums || hasLogin) continue;

        State.saveFavoritesUrl(url);
        LOG.info(`✅ 收藏页: ${url}`);
        return url;
      } catch (e) {
        // A failing candidate is only logged; the next path is still tried.
        LOG.info(`  ${path}: ${e.message}`);
      }
    }

    LOG.info('HTTP探测未发现收藏页 (预期, 将用构造URL)');
    return '';
  }

  // Favorites scraping: pages are requested with fetch() and parsed with DOMParser (no iframe is used).
  /**
   * Collects the album IDs listed on the user's favorites pages.
   *
   * Steps:
   *  1. Request page 1 of `baseUrl` and read the `#folder_list` element to
   *     discover additional folder URLs (`favorite/albums?folder=...`).
   *  2. For `baseUrl` and every discovered folder, request `page=1` and
   *     `page=2`. An album ID is kept only when it occurs in BOTH responses.
   *     NOTE(review): presumably this filters out album links that are not
   *     favorites (links that change between responses) — confirm against
   *     the site before altering it.
   *  3. If step 2 keeps nothing, fall back to every album link found on
   *     page 1 of `baseUrl`.
   *
   * Album IDs that are not greater than 100 are ignored. Requests are sent
   * with credentials. A failed request is logged and treated as an empty page.
   *
   * @param {string} baseUrl - Favorites URL without a `page` parameter.
   * @param {number} maxPages - Currently unused; kept so existing callers keep working.
   * @param {(info: {message: string, progress: number}) => void} [onProgress]
   *   Called once, after scraping, with a summary message and progress 100.
   * @returns {Promise<Array<{id: string, title: string}>>} One entry per kept
   *   album ID, with an empty title.
   */
  async function scrapeFavorites(baseUrl, maxPages, onProgress) {
    const SITE_ORIGIN = 'https://18comic.vip';
    const MIN_ALBUM_ID = 100;

    const withPage = (url, page) => `${url}${url.includes('?') ? '&' : '?'}page=${page}`;

    // Returns the response body, or '' for a non-OK response or a failed request.
    const fetchHtml = async (url) => {
      try {
        const resp = await fetch(url, { credentials: 'include' });
        return resp.ok ? await resp.text() : '';
      } catch (e) {
        // Best-effort: keep going with an empty page, but leave a trace in the log.
        LOG.warn(`收藏页请求失败 ${url}: ${e.message}`);
        return '';
      }
    };

    const parseHtml = (html) => new DOMParser().parseFromString(html, 'text/html');

    // Returns the album IDs (as strings, first-occurrence order) linked from the given HTML.
    const extractAlbumIds = (html) => {
      const ids = new Set();
      for (const anchor of parseHtml(html).querySelectorAll('a[href*="/album/"]')) {
        const m = (anchor.getAttribute('href') || '').match(/\/album\/(\d+)/);
        if (m && Number.parseInt(m[1], 10) > MIN_ALBUM_ID) ids.add(m[1]);
      }
      return ids;
    };

    const p1Url = withPage(baseUrl, 1);
    const p1Html = await fetchHtml(p1Url);

    const folderUrls = [baseUrl];
    const folderList = parseHtml(p1Html).querySelector('#folder_list');
    if (folderList) {
      for (const anchor of folderList.querySelectorAll('a[href*="favorite/albums?folder="]')) {
        const href = anchor.getAttribute('href');
        if (!href) continue;
        let folderUrl;
        try {
          // Resolving against the origin handles relative and absolute hrefs alike.
          folderUrl = new URL(href.replace(/[?&]page=\d+/, ''), SITE_ORIGIN).href;
        } catch (e) {
          LOG.warn(`忽略无效的收藏夹链接 ${href}: ${e.message}`);
          continue;
        }
        if (!folderUrls.includes(folderUrl)) folderUrls.push(folderUrl);
      }
    }

    const allIds = new Set();
    for (const folderUrl of folderUrls) {
      const firstPass = extractAlbumIds(await fetchHtml(withPage(folderUrl, 1)));
      const secondPass = extractAlbumIds(await fetchHtml(withPage(folderUrl, 2)));
      for (const id of secondPass) {
        if (firstPass.has(id)) allIds.add(id);
      }
    }

    if (allIds.size === 0) {
      for (const id of extractAlbumIds(await fetchHtml(p1Url))) allIds.add(id);
    }

    const albums = [...allIds].map((id) => ({ id, title: '' }));
    if (onProgress) {
      onProgress({ message: `收藏: ${folderUrls.length}个文件夹, ${albums.length}本`, progress: 100 });
    }
    LOG.info(`收藏抓取: ${folderUrls.length}个文件夹, ${albums.length}本`);
    return albums;
  }

  // 全量扫描主流程
  /**
   * Runs the full scan: watch history first, then favorites.
   *
   * Side effects visible in this function:
   *  - When the current page is a favorites/bookmark page, its URL (without
   *    the `page` parameter) is stored via `State.saveFavoritesUrl`. The
   *    watch-history page lives under `/favorite/watchlist` and is excluded,
   *    otherwise the history URL would be remembered as the favorites URL.
   *  - History IDs not yet in the viewed list are appended with the current
   *    timestamp and stored via `State.saveViewedAlbums`.
   *  - Scraped favorites are enriched with details and stored via
   *    `State.saveFavorites`.
   *
   * A failure while fetching history is logged and does not stop the
   * favorites phase.
   *
   * @param {string} username - Name used to build the history and favorites URLs.
   * @param {(info: {phase: string, progress: number, message: string}) => void} [onProgress]
   *   Optional progress listener.
   * @returns {Promise<{favorites: Array<object>, history: Array<{id: *, title: string, tags: Array}>}>}
   */
  async function initialScan(username, onProgress) {
    const HISTORY_LIMIT = 25;
    const MAX_FAVORITE_PAGES = 30;

    const report = (payload) => {
      if (onProgress) onProgress(payload);
    };

    // Drops the `page=N` parameter while keeping the rest of the query string intact,
    // including the case where `page` is the first parameter (`?page=3&folder=5`).
    const stripPageParam = (href) =>
      href.replace(/([?&])page=\d+(&?)/, (_match, lead, trail) => (trail ? lead : ''));

    // Appends history IDs that are missing from the viewed list; each ID is added at most once.
    const mergeHistoryIntoViewed = (entries) => {
      const viewed = State.getViewedAlbums();
      const knownIds = new Set(viewed.map((v) => String(v?.id || v)));
      const now = Date.now();
      let merged = 0;
      for (const { id } of entries) {
        const sid = String(id);
        if (knownIds.has(sid)) continue;
        knownIds.add(sid);
        viewed.push({ id: sid, viewedAt: now });
        merged++;
      }
      if (merged > 0) {
        State.saveViewedAlbums(viewed);
        LOG.info(`历史→浏览合并: +${merged}条`);
      }
    };

    LOG.info('开始扫描...');
    report({ phase: 'discover', progress: 0, message: '探测收藏/历史URL...' });

    const currentPath = location.pathname;
    const isWatchHistoryPage = currentPath.includes('/favorite/watchlist');
    const isFavoritesPage = currentPath.includes('/favorite/') || currentPath.includes('/bookmark');
    if (isFavoritesPage && !isWatchHistoryPage) {
      const curUrl = stripPageParam(location.href);
      State.saveFavoritesUrl(curUrl);
      LOG.info(`已在收藏页: ${curUrl}`);
    }

    const histUrl = `https://18comic.vip/user/${username}/favorite/watchlist`;
    let history = [];
    try {
      report({ phase: 'history', progress: 0, message: '提取历史记录...' });
      const histHtml = await fetcher.enqueue(histUrl, null, 5);
      if (histHtml) {
        const items = Parser.parseListing(histHtml);
        history = items
          .slice(0, HISTORY_LIMIT)
          .map((it) => ({ id: it.id, title: it.title || '', tags: it.tags || [] }));
        report({ phase: 'history', progress: 100, message: `历史: ${history.length} 条 (赛后补标签)` });
        LOG.info(`历史: ${history.length} 条 [${history.map(h=>h.id).join(',')}] (赛后从候选池补标签)`);
        mergeHistoryIntoViewed(history);
      }
    } catch (e) {
      LOG.warn('历史抓取失败', e.message);
    }

    let favUrl = await discoverFavoritesUrl(username);
    if (!favUrl && username) {
      // Probing found nothing: fall back to the conventional favorites path.
      favUrl = `https://18comic.vip/user/${username}/favorite/albums`;
      LOG.info(`构造收藏URL: ${favUrl}`);
    }

    let favorites = [];
    if (favUrl) {
      report({ phase: 'favorites', progress: 0, message: 'iframe提取收藏...' });
      favorites = await scrapeFavorites(favUrl, MAX_FAVORITE_PAGES, (info) => {
        report({ phase: 'favorites', progress: info.progress, message: info.message });
      });
      LOG.info(`收藏: ${favorites.length} 本`);
      if (favorites.length > 0) {
        report({ phase: 'favorites', progress: 90, message: `丰富 ${favorites.length} 收藏标签...` });
        // Detail enrichment is mapped onto the last 10% of the favorites phase.
        favorites = await enrichAlbumsWithDetails(favorites, (i, total) => {
          report({ phase: 'favorites', progress: 90 + Math.round((i / total) * 10), message: `收藏标签 ${i}/${total}` });
        });
        State.saveFavorites(favorites);
        const tagTotal = favorites.reduce((sum, album) => sum + (album.tags || []).length, 0);
        LOG.info(`收藏详情: ${favorites.length}本 | 总标签:${tagTotal} | ID:${favorites.map(a=>a.id).slice(0,10).join(',')}${favorites.length>10?'...':''}`);
      }
    } else {
      LOG.warn('⚠️ 无收藏数据');
    }

    return { favorites, history };
  }

  /**
   * Fetches a paginated listing and returns the items of all pages.
   *
   * The first page is requested at `baseUrl`; later pages append
   * `page=<n>` to it. The page count comes from `Parser.parsePagination`.
   * When that reports at most one page but the first page already holds
   * 15 or more items, up to 10 pages are attempted instead.
   * NOTE(review): presumably this covers listings whose pagination markup
   * cannot be parsed — confirm.
   *
   * Scraping stops early at the first page that parses to zero items. A page
   * whose request throws is logged and skipped.
   *
   * @param {string} baseUrl - Listing URL without a `page` parameter.
   * @param {(page: number, totalPages: number) => void} [onPageProgress]
   *   Optional listener, called after each processed page.
   * @returns {Promise<Array>} Items of all scraped pages, in page order;
   *   empty when the first page cannot be fetched.
   */
  async function scrapeAllPages(baseUrl, onPageProgress) {
    const FULL_PAGE_ITEM_COUNT = 15;
    const FALLBACK_TOTAL_PAGES = 10;

    const firstHtml = await fetcher.enqueue(baseUrl, null, 10);
    if (!firstHtml) return [];

    const firstItems = Parser.parseListing(firstHtml);
    const allItems = [...firstItems];

    let totalPages = Parser.parsePagination(firstHtml).totalPages || 1;
    if (totalPages <= 1 && firstItems.length >= FULL_PAGE_ITEM_COUNT) {
      totalPages = FALLBACK_TOTAL_PAGES;
    }
    // The listener is optional, matching the other progress callbacks in this module.
    onPageProgress?.(1, totalPages);

    const sep = baseUrl.includes('?') ? '&' : '?';
    for (let page = 2; page <= totalPages; page++) {
      try {
        const html = await fetcher.enqueue(`${baseUrl}${sep}page=${page}`, null, 5);
        if (html) {
          const pageItems = Parser.parseListing(html);
          // An empty page means we ran past the end of the listing.
          if (pageItems.length === 0) break;
          allItems.push(...pageItems);
        }
      } catch (e) {
        LOG.warn(`第 ${page} 页抓取失败: ${e.message}`);
      }
      onPageProgress?.(page, totalPages);
    }

    return allItems;
  }

  /**
   * Fills in tags, authors, type tags and title for albums from their detail pages.
   *
   * Albums that already carry at least `CONFIG.TAG_ENRICH_THRESHOLD` tags are
   * passed through without a request. For the others the detail page is
   * requested and parsed; a failed request is logged and the album is kept
   * unchanged. Album objects are updated in place and returned in input order.
   *
   * `onProgress` is called once for every album, including skipped ones, so
   * the final call always reports `(albums.length, albums.length)`.
   *
   * @param {Array<object>} albums - Albums with at least an `id` property.
   * @param {(done: number, total: number) => void} [onProgress] - Optional listener.
   * @returns {Promise<Array<object>>} The same album objects, in input order.
   */
  async function enrichAlbumsWithDetails(albums, onProgress) {
    const enriched = [];
    for (let i = 0; i < albums.length; i++) {
      const album = albums[i];
      const alreadyTagged = album.tags && album.tags.length >= CONFIG.TAG_ENRICH_THRESHOLD;
      if (!alreadyTagged) {
        try {
          const html = await fetcher.enqueue(`https://18comic.vip/album/${album.id}/`, null, 3);
          if (html) {
            const detail = Parser.parseDetail(html);
            album.tags = detail.tags || [];
            album.authors = detail.authors || [];
            album.typeTags = detail.typeTags || [];
            // Keep the existing title when the detail page yields none.
            album.title = detail.title || album.title;
          }
        } catch (e) {
          LOG.warn(`详情页获取失败 #${album.id}: ${e.message}`);
        }
      }
      enriched.push(album);
      if (onProgress) onProgress(i + 1, albums.length);
    }
    return enriched;
  }