JM Shelf - Scraper のソースコード

JM Shelf - Scraper

收藏/历史抓取 + URL 发现 + 用户名检测 — JM Shelf 推荐脚本的模块库，通过 @require 被主脚本引用。
このスクリプトは単体で利用できません。右のようなメタデータを含むスクリプトから、ライブラリとして読み込まれます: // @require https://update.sleazyfork.org/scripts/581107/1842606/JM%20Shelf%20-%20Scraper.js
このスクリプトの質問や評価の投稿はこちら、通報はこちらへお寄せください
右端で折り返す
// ==UserScript==
// @name         JM Shelf - Scraper
// @namespace    jmshelf-lib
// @version      1.0.0
// @author       Kesdi
// @description  收藏/历史抓取 + URL 发现 + 用户名检测 — JM Shelf 推荐脚本的模块库，通过 @require 被主脚本引用。
// @license      MIT
// ==/UserScript==
// 
// 此文件是 GreasyFork 库(library)，不直接安装。
// 请安装主脚本: JM Shelf 给杂鱼的个性化推荐
//

// ═══ [9] SCRAPER ═══ — 收藏+历史抓取 + URL发现
  // ============================================================
  
  // 用户名检测
  /**
   * Guesses the current user's name from profile links on the page.
   *
   * Every anchor whose href contains "/user/" is inspected; the path segment
   * right after "/user/" is taken as a candidate. The first distinct candidate
   * in document order wins.
   *
   * @returns {string} The first candidate username, or '' when none is found.
   */
  function detectUsername() {
    const candidates = new Set();
    for (const link of document.querySelectorAll('a[href*="/user/"]')) {
      const href = link.getAttribute('href') || '';
      // Stop the capture at '?' as well as '/': a link such as
      // "/user/alice?tab=favorite" must yield "alice", not "alice?tab=favorite",
      // because the result is interpolated into other URLs by the callers.
      const match = href.match(/\/user\/([^/?]+)/);
      if (!match) continue;
      const name = match[1];
      // Reject the literal "user", segments carrying a '#', and names of 30+ chars.
      if (name !== 'user' && !name.includes('#') && name.length < 30) {
        candidates.add(name);
      }
    }
    const unique = [...candidates];
    LOG.info(`检测到用户: ${JSON.stringify(unique)}`);
    if (unique.length > 0) return unique[0];
    LOG.warn('未检测到用户名 — 请确认已登录18comic并在主页');
    return '';
  }

  /**
   * Reports whether the current page looks like a logged-in session.
   *
   * The page counts as logged in unless its body text contains the
   * "會員登錄/註冊" (member login / register) prompt. A missing body is
   * treated as empty text, i.e. as logged in.
   *
   * @returns {boolean} false when the login/register prompt text is present.
   */
  function isLoggedIn() {
    const pageText = document.body?.textContent || '';
    const showsLoginPrompt = pageText.indexOf('會員登錄/註冊') !== -1;
    return !showsLoginPrompt;
  }

  // URL发现
  /**
   * Locates the URL of the user's favorites page.
   *
   * A URL already stored in State is returned immediately. Otherwise a fixed
   * list of candidate paths on https://18comic.vip is requested one by one via
   * fetcher.enqueue; the first response that contains "/album/" and does not
   * contain "login-modal" is stored in State and returned.
   *
   * @param {string} username - Name interpolated into the "/user/<name>/..." candidates.
   * @returns {Promise<string>} The favorites URL, or '' when no candidate qualifies.
   */
  async function discoverFavoritesUrl(username) {
    if (State.getFavoritesUrl()) {
      LOG.info(`已保存收藏URL: ${State.getFavoritesUrl()}`);
      return State.getFavoritesUrl();
    }

    LOG.info(`探测收藏页... 用户=${username}`);

    // User-scoped candidates first, then the site-wide /bookmark path.
    const userRoot = `/user/${username}`;
    const userSuffixes = [
      '/favorite/albums',
      '/favorites',
      '/favorite',
      '/bookmark',
      '?tab=favorite',
      '?tab=album',
    ];
    const candidatePaths = [...userSuffixes.map((suffix) => userRoot + suffix), '/bookmark'];

    for (const candidate of candidatePaths) {
      const probeUrl = 'https://18comic.vip' + candidate;
      try {
        // NOTE(review): the meaning of the 2nd/3rd enqueue arguments is defined
        // by fetcher, which is not visible here.
        const body = await fetcher.enqueue(probeUrl, null, 10);
        if (body === null) continue;

        const albumsFound = body.includes('/album/');
        const loginFound = body.includes('login-modal');
        LOG.info(`  ${candidate}: len=${body.length} albums=${albumsFound} login=${loginFound}`);
        if (!albumsFound || loginFound) continue;

        State.saveFavoritesUrl(probeUrl);
        LOG.info(`✅ 收藏页: ${probeUrl}`);
        return probeUrl;
      } catch (err) {
        // A failing candidate is only logged; probing continues with the next one.
        LOG.info(`  ${candidate}: ${err.message}`);
      }
    }

    LOG.info('HTTP探测未发现收藏页 (预期, 将用构造URL)');
    return '';
  }

  // 收藏抓取(iframe翻页)
  /**
   * Collects favorite album IDs from the favorites listing and its folders.
   *
   * Steps:
   *  1. Fetch page 1 of baseUrl and read folder links from "#folder_list".
   *  2. For baseUrl and each folder, fetch page 1 and page 2 and keep the album
   *     IDs that appear on BOTH pages.
   *  3. If step 2 yields nothing, fall back to every album link on page 1 of
   *     baseUrl.
   *
   * NOTE(review): the "present on both pages" rule is kept exactly as found.
   * It may be a deliberate filter for links that vary between loads, but that
   * cannot be confirmed from this file — verify against the live site.
   * NOTE(review): maxPages is accepted for interface compatibility but is not
   * used; only pages 1 and 2 are ever requested.
   *
   * @param {string} baseUrl - Favorites listing URL without a page parameter.
   * @param {number} maxPages - Currently unused (see note above).
   * @param {(info: {message: string, progress: number}) => void} [onProgress]
   *   Called once, after scraping, with progress 100.
   * @returns {Promise<Array<{id: string, title: string}>>} One entry per album ID; title is ''.
   */
  async function scrapeFavorites(baseUrl, maxPages, onProgress) {
    const SITE_ORIGIN = 'https://18comic.vip';

    const withPage = (listUrl, page) =>
      `${listUrl}${listUrl.includes('?') ? '&' : '?'}page=${page}`;

    // Best-effort fetch: any failure is logged and treated as an empty page.
    const fetchHtml = async (url) => {
      try {
        const resp = await fetch(url, { credentials: 'include' });
        return resp.ok ? await resp.text() : '';
      } catch (e) {
        LOG.warn(`收藏页请求失败: ${url} (${e.message})`);
        return '';
      }
    };

    const parseHtml = (html) => new DOMParser().parseFromString(html, 'text/html');

    // Album IDs found in a page, in document order. IDs <= 100 are ignored
    // (threshold kept from the original code; its rationale is not documented).
    const albumIdsOf = (html) => {
      const ids = [];
      parseHtml(html).querySelectorAll('a[href*="/album/"]').forEach((a) => {
        const m = (a.getAttribute('href') || '').match(/\/album\/(\d+)/);
        if (m && Number.parseInt(m[1], 10) > 100) ids.push(m[1]);
      });
      return ids;
    };

    const firstPageUrl = withPage(baseUrl, 1);
    const firstPageHtml = await fetchHtml(firstPageUrl);

    const folderUrls = [baseUrl];
    const folderList = parseHtml(firstPageHtml).querySelector('#folder_list');
    if (folderList) {
      folderList.querySelectorAll('a[href*="favorite/albums?folder="]').forEach((a) => {
        const href = a.getAttribute('href');
        if (!href) return;
        let absolute;
        try {
          // Resolve instead of concatenating, so an already-absolute href is
          // not turned into "https://18comic.viphttps://...".
          absolute = new URL(href, SITE_ORIGIN).href;
        } catch {
          return; // unparseable href: skip this folder link
        }
        const folderUrl = absolute.replace(/[?&]page=\d+/, '');
        if (!folderUrls.includes(folderUrl)) folderUrls.push(folderUrl);
      });
    }

    const allIds = new Set();
    for (const folderUrl of folderUrls) {
      const pageOneUrl = withPage(folderUrl, 1);
      // Page 1 of baseUrl is already in hand; do not request it again.
      const pageOneHtml = pageOneUrl === firstPageUrl ? firstPageHtml : await fetchHtml(pageOneUrl);
      const pageOneIds = new Set(albumIdsOf(pageOneHtml));
      const pageTwoHtml = await fetchHtml(withPage(folderUrl, 2));
      for (const id of albumIdsOf(pageTwoHtml)) {
        if (pageOneIds.has(id)) allIds.add(id);
      }
    }

    if (allIds.size === 0) {
      for (const id of albumIdsOf(firstPageHtml)) allIds.add(id);
    }

    const albums = [...allIds].map((id) => ({ id, title: '' }));
    if (onProgress) {
      onProgress({ message: `收藏: ${folderUrls.length}个文件夹, ${albums.length}本`, progress: 100 });
    }
    LOG.info(`收藏抓取: ${folderUrls.length}个文件夹, ${albums.length}本`);
    return albums;
  }

  // 全量扫描主流程
  /**
   * Runs the full scan: reading history first, then favorites.
   *
   * 1. If the current page is a favorites/bookmark listing, remember its URL
   *    (without the page parameter) as the favorites URL.
   * 2. Fetch the watchlist page, keep the first 25 parsed entries as history,
   *    and append history IDs not yet known to the stored "viewed" list.
   * 3. Resolve the favorites URL (discovery, else a constructed URL), scrape
   *    the favorites, enrich them with detail data and store them.
   *
   * @param {string} username - User whose watchlist/favorites are scanned.
   * @param {(info: {phase: string, progress: number, message: string}) => void} [onProgress]
   *   Optional progress callback.
   * @returns {Promise<{favorites: Array<object>, history: Array<object>}>}
   */
  async function initialScan(username, onProgress) {
    const report = (info) => {
      if (onProgress) onProgress(info);
    };

    // Drops the "page" query parameter (and any fragment) without damaging the
    // rest of the query: "?page=2&folder=3" must become "?folder=3", not "&folder=3".
    const stripPageParam = (href) => {
      try {
        const url = new URL(href);
        url.searchParams.delete('page');
        url.hash = '';
        return url.href;
      } catch {
        return href.replace(/[?&]page=\d+/, '');
      }
    };

    LOG.info('开始扫描...');
    report({ phase: 'discover', progress: 0, message: '探测收藏/历史URL...' });

    const currentPath = location.pathname;
    // The watchlist lives under "/favorite/" too, but it is the history page
    // (see histUrl below) and must not be stored as the favorites URL.
    const onWatchlist = currentPath.includes('/favorite/watchlist');
    const onFavorites = currentPath.includes('/favorite/') || currentPath.includes('/bookmark');
    if (onFavorites && !onWatchlist) {
      const curUrl = stripPageParam(location.href);
      State.saveFavoritesUrl(curUrl);
      LOG.info(`已在收藏页: ${curUrl}`);
    }

    const histUrl = `https://18comic.vip/user/${username}/favorite/watchlist`;
    let history = [];
    try {
      report({ phase: 'history', progress: 0, message: '提取历史记录...' });
      const histHtml = await fetcher.enqueue(histUrl, null, 5);
      if (histHtml) {
        const items = Parser.parseListing(histHtml);
        history = items
          .slice(0, 25)
          .map((it) => ({ id: it.id, title: it.title || '', tags: it.tags || [] }));
        report({ phase: 'history', progress: 100, message: `历史: ${history.length} 条 (赛后补标签)` });
        LOG.info(`历史: ${history.length} 条 [${history.map(h=>h.id).join(',')}] (赛后从候选池补标签)`);

        // Merge history into the stored "viewed" list. Stored entries may be
        // objects carrying an id or bare id values.
        const viewed = State.getViewedAlbums();
        const knownIds = new Set(viewed.map((v) => String(v.id || v)));
        const now = Date.now();
        let merged = 0;
        for (const entry of history) {
          const sid = String(entry.id);
          if (knownIds.has(sid)) continue;
          knownIds.add(sid); // also prevents duplicates within this history batch
          viewed.push({ id: sid, viewedAt: now });
          merged++;
        }
        if (merged > 0) {
          State.saveViewedAlbums(viewed);
          LOG.info(`历史→浏览合并: +${merged}条`);
        }
      }
    } catch (e) {
      // History is best-effort; the favorites scan still runs.
      LOG.warn('历史抓取失败', e.message);
    }

    let favUrl = await discoverFavoritesUrl(username);
    if (!favUrl && username) {
      favUrl = `https://18comic.vip/user/${username}/favorite/albums`;
      LOG.info(`构造收藏URL: ${favUrl}`);
    }

    let favorites = [];
    if (favUrl) {
      report({ phase: 'favorites', progress: 0, message: 'iframe提取收藏...' });
      favorites = await scrapeFavorites(favUrl, 30, (info) => {
        report({ phase: 'favorites', progress: info.progress, message: info.message });
      });
      LOG.info(`收藏: ${favorites.length} 本`);
      if (favorites.length > 0) {
        report({ phase: 'favorites', progress: 90, message: `丰富 ${favorites.length} 收藏标签...` });
        // Detail enrichment is mapped onto the last 10% of the favorites phase.
        favorites = await enrichAlbumsWithDetails(favorites, (i, total) => {
          report({
            phase: 'favorites',
            progress: 90 + Math.round((i / total) * 10),
            message: `收藏标签 ${i}/${total}`,
          });
        });
        State.saveFavorites(favorites);
        const ft2 = favorites.reduce((s, a) => s + (a.tags || []).length, 0);
        LOG.info(`收藏详情: ${favorites.length}本 | 总标签:${ft2} | ID:${favorites.map(a=>a.id).slice(0,10).join(',')}${favorites.length>10?'...':''}`);
      }
    } else {
      LOG.warn('⚠️ 无收藏数据');
    }

    return { favorites, history };
  }

  /**
   * Scrapes a paginated listing and returns the items of every page.
   *
   * The first page is fetched from baseUrl as-is; later pages append
   * "page=<n>" to it. Paging stops early at the first page that parses to zero
   * items. A page whose fetch throws is logged and skipped.
   *
   * When the pagination reports at most one page but the first page holds 15
   * or more items, up to 10 pages are probed (heuristic kept from the original).
   *
   * @param {string} baseUrl - URL of the first listing page.
   * @param {(page: number, totalPages: number) => void} [onPageProgress]
   *   Optional; called after each processed page.
   * @returns {Promise<Array<object>>} Items as produced by Parser.parseListing,
   *   in page order; empty when the first page cannot be fetched.
   */
  async function scrapeAllPages(baseUrl, onPageProgress) {
    // The callback is optional, like every other progress callback in this file.
    const notify = (page, total) => {
      if (typeof onPageProgress === 'function') onPageProgress(page, total);
    };

    const allItems = [];
    const firstHtml = await fetcher.enqueue(baseUrl, null, 10);
    if (!firstHtml) return allItems;

    const firstItems = Parser.parseListing(firstHtml);
    allItems.push(...firstItems);

    const pagination = Parser.parsePagination(firstHtml);
    let totalPages = pagination?.totalPages || 1;
    if (totalPages <= 1 && firstItems.length >= 15) {
      totalPages = 10;
    }
    notify(1, totalPages);

    const separator = baseUrl.includes('?') ? '&' : '?';
    for (let page = 2; page <= totalPages; page++) {
      const url = `${baseUrl}${separator}page=${page}`;
      try {
        const html = await fetcher.enqueue(url, null, 5);
        if (html) {
          const pageItems = Parser.parseListing(html);
          if (pageItems.length === 0) break; // ran past the last real page
          allItems.push(...pageItems);
        }
      } catch (e) {
        LOG.warn(`第 ${page} 页抓取失败: ${e.message}`);
      }
      notify(page, totalPages);
    }

    return allItems;
  }

  /**
   * Fills in tags, authors, type tags and title for albums from their detail pages.
   *
   * Albums that already carry at least CONFIG.TAG_ENRICH_THRESHOLD tags are
   * passed through without a request. For the others the detail page is
   * fetched and parsed; a failure is logged and the album is kept as it was.
   * Album objects are updated in place and returned in input order.
   *
   * @param {Array<object>} albums - Albums with at least an `id`; mutated in place.
   * @param {(done: number, total: number) => void} [onProgress]
   *   Optional; called once per album, including albums that needed no request,
   *   so the count always reaches `total`.
   * @returns {Promise<Array<object>>} The same album objects, in input order.
   */
  async function enrichAlbumsWithDetails(albums, onProgress) {
    const total = albums.length;
    const enriched = [];
    for (const [index, album] of albums.entries()) {
      const alreadyTagged = album.tags && album.tags.length >= CONFIG.TAG_ENRICH_THRESHOLD;
      if (!alreadyTagged) {
        try {
          const html = await fetcher.enqueue(`https://18comic.vip/album/${album.id}/`, null, 3);
          if (html) {
            const detail = Parser.parseDetail(html);
            album.tags = detail.tags || [];
            album.authors = detail.authors || [];
            album.typeTags = detail.typeTags || [];
            album.title = detail.title || album.title;
          }
        } catch (e) {
          LOG.warn(`详情页获取失败 #${album.id}: ${e.message}`);
        }
      }
      enriched.push(album);
      // Report skipped albums too; otherwise progress stalls below the total.
      if (onProgress) onProgress(index + 1, total);
    }
    return enriched;
  }