JM Shelf - Candidate Pool - 原始程式碼

JM Shelf - Candidate Pool

8通道候选池构建与合并去重 — JM Shelf 推荐脚本的模块库，通过 @require 被主脚本引用。
此腳本不應該直接安裝，它是一個供其他腳本使用的函式庫。欲使用本函式庫，請在腳本 metadata 寫上： // @require https://update.sleazyfork.org/scripts/581108/1842607/JM%20Shelf%20-%20Candidate%20Pool.js
發表問題、評論、或檢舉該腳本。
換行
// ==UserScript==
// @name         JM Shelf - Candidate Pool
// @namespace    jmshelf-lib
// @version      1.0.0
// @author       Kesdi
// @description  8通道候选池构建与合并去重 — JM Shelf 推荐脚本的模块库，通过 @require 被主脚本引用。
// @license      MIT
// ==/UserScript==
// 
// 此文件是 GreasyFork 库(library)，不直接安装。
// 请安装主脚本: JM Shelf 给杂鱼的个性化推荐
//

// ═══ [10] CANDIDATE POOL ═══ - Multi-channel recall
  // ============================================================
  /**
   * Candidate pool: multi-channel recall followed by merge and de-duplication.
   *
   * Relies on names provided elsewhere in the script: State, CONFIG, Parser,
   * ProfileManager, fetcher, LOG, normalizeTag, TAG_NORMALIZE, getBlacklist,
   * _dedupCheck and _random.
   */
  const CandidatePool = {
    /**
     * Fetches album ids from every recall channel, fills the shared album
     * cache, then merges, filters and ranks the ids into a candidate list.
     *
     * @param {object} profile - User profile; `profile.types` (type -> weight)
     *   is read here, the rest is handed to ProfileManager.
     * @param {Array<{id: *}>} topFavAlbums - Favourite albums; the first 8 are
     *   used as seeds for the "related" channel.
     * @param {function({phase: string, progress: number, message: string}): void} [onProgress]
     *   Optional progress callback.
     * @returns {Promise<Array<object>|undefined>} Candidates sorted by channel
     *   count then views, or `undefined` when `fetcher._scanFailed` is set.
     */
    async build(profile, topFavAlbums, onProgress) {
      const SITE = 'https://18comic.vip';
      const albumCache = State.getAlbumCache();
      // Key order matters: it fixes the order of channel names attached to each album.
      const channels = {
        tag: [],
        albums: [],
        combo: [],
        author: [],
        type: [],
        related: [],
        discover: [],
        rank: [],
        rank_explore: [],
        rank_comp: [],
      };

      // Progress reporter; does nothing when no callback was supplied.
      const report = (progress, message, phase = 'candidates') => {
        if (onProgress) onProgress({ phase, progress, message });
      };

      // Channel requests are best-effort: a failure is logged and the page skipped.
      const logSkip = (where, err) => {
        LOG.info(`⚠ 请求跳过 [${where}]: ${(err && err.message) || err}`);
      };

      // Order-preserving union that tolerates a missing array on either side.
      const union = (a, b) => [...new Set([...(a || []), ...(b || [])])];

      const cacheIfAbsent = (item) => {
        if (!albumCache[item.id]) albumCache[item.id] = item;
      };

      // Adds the searched tags to a sparsely tagged item (fewer than `limit` tags).
      const injectTags = (item, tags, limit) => {
        if (!Array.isArray(item.tags)) item.tags = [];
        if (item.tags.length >= limit) return;
        for (const t of tags) {
          if (!item.tags.includes(t)) item.tags.push(t);
        }
      };

      // Returns the first `limit` listing items, or null when the fetch yielded nothing.
      const fetchListing = async (url, priority, limit) => {
        const html = await fetcher.enqueue(url, null, priority);
        return html ? Parser.parseListing(html).slice(0, limit) : null;
      };

      const searchUrl = (query, mainTag) =>
        `${SITE}/search/photos?search_query=${encodeURIComponent(query)}&main_tag=${mainTag}`;

      report(0, '构建候选池...');

      // Helper: walk paginated search results, inject the searched tag and
      // record an estimate of the tag's global frequency.
      async function fetchTagResults(baseUrl, injectTag, pages, maxItems) {
        const ids = [];
        const sep = baseUrl.includes('?') ? '&' : '?';
        let globalEstimate = 0;
        for (let pg = 1; pg <= pages; pg++) {
          const url = `${baseUrl}${sep}page=${pg}`;
          try {
            const html = await fetcher.enqueue(url, null, pg === 1 ? 3 : 2);
            if (!html) continue;
            const items = Parser.parseListing(html).slice(0, maxItems);
            for (const item of items) {
              if (injectTag) injectTags(item, [injectTag], 2);
              ids.push(item.id);
              cacheIfAbsent(item);
            }
            if (pg === 1 && injectTag) {
              const pag = Parser.parsePagination(html);
              globalEstimate = (pag.totalPages || 1) * 40;
            }
            // A page less than half full is treated as the end of the results.
            if (items.length < maxItems / 2) break;
          } catch (e) {
            logSkip(url, e);
          }
        }
        if (injectTag && globalEstimate > 0) {
          const tagFreq = State.getTagFreq();
          tagFreq[normalizeTag(injectTag)] = globalEstimate;
          State.saveTagFreq(tagFreq);
        }
        return ids;
      }

      // Channel 1: tag search
      const topTags = ProfileManager.getTopTags(profile, CONFIG.TOP_TAGS_COUNT);
      report(2, `通道①: 标签召回×${CONFIG.SEARCH_PAGES}页 (${topTags.length}个标签)...`);
      for (let i = 0; i < topTags.length; i++) {
        const tag = topTags[i];
        const ids = await fetchTagResults(searchUrl(tag, 0), tag, CONFIG.SEARCH_PAGES, 80);
        channels.tag.push(...ids);
        report(
          2 + Math.round((i / topTags.length) * 16),
          `① ${tag} (${ids.length}本)${ids.length === 0 ? ' ⚠' : ''}`
        );
      }

      // Channel 2: /albums browsing (also merges tags into already cached albums)
      report(18, `通道②: /albums标签补充×${CONFIG.ALBUMS_PAGES}页...`);
      for (let pg = 1; pg <= CONFIG.ALBUMS_PAGES; pg++) {
        try {
          const items = await fetchListing(`${SITE}/albums?page=${pg}&o=mr`, 2, 80);
          if (!items) continue;
          for (const item of items) {
            channels.albums.push(item.id);
            const existing = albumCache[item.id];
            albumCache[item.id] = existing
              ? { ...existing, ...item, tags: union(existing.tags, item.tags) }
              : item;
          }
        } catch (e) {
          logSkip(`/albums p${pg}`, e);
        }
        report(18 + Math.round((pg / CONFIG.ALBUMS_PAGES) * 10), `② /albums p${pg}`);
      }

      // Channel 3: tag pairs (pairs drawn from the first 8 tags, at most 12 searches)
      report(28, '通道③: 标签组合召回...');
      const comboTop = topTags.slice(0, CONFIG.TAG_COMBO_TOP);
      const comboWidth = Math.min(comboTop.length, 8);
      const comboPairs = [];
      for (let i = 0; i < comboWidth && comboPairs.length < 12; i++) {
        for (let j = i + 1; j < comboWidth && comboPairs.length < 12; j++) {
          comboPairs.push([comboTop[i], comboTop[j]]);
        }
      }
      for (const [tagA, tagB] of comboPairs) {
        try {
          const items = await fetchListing(searchUrl(`${tagA} ${tagB}`, 0), 2, 30);
          for (const item of (items || [])) {
            injectTags(item, [tagA, tagB], 3);
            channels.combo.push(item.id);
            cacheIfAbsent(item);
          }
        } catch (e) {
          logSkip(`combo ${tagA}+${tagB}`, e);
        }
      }

      // Channel 4: author search
      const topAuthors = ProfileManager.getTopAuthors(profile, CONFIG.TOP_AUTHORS_MIN_WORKS);
      const auCount = Math.min(topAuthors.length, 10);
      report(33, `通道④: 作者召回(${auCount})...`);
      for (let i = 0; i < auCount; i++) {
        const author = topAuthors[i];
        try {
          const items = await fetchListing(searchUrl(author, 2), 4, 40);
          for (const item of (items || [])) {
            channels.author.push(item.id);
            cacheIfAbsent(item);
          }
        } catch (e) {
          logSkip(`author ${author}`, e);
        }
        report(33 + Math.round((i / auCount) * 4), `④ ${author}`);
      }

      // Channel 5: browse the listings of the four highest-weighted types
      report(37, '通道⑤: 类型浏览...');
      const typePaths = new Map([
        ['韓漫', '/albums/hanman'],
        ['同人', '/albums/doujin'],
        ['單本', '/albums/single'],
        ['短篇', '/albums/short'],
      ]);
      const typePrefs = Object.entries((profile && profile.types) || {})
        .sort((a, b) => b[1] - a[1])
        .slice(0, 4);
      // Unknown types all fall back to /albums; the Set keeps that URL from being fetched twice.
      const typeUrls = new Set(typePrefs.map(([type]) => `${typePaths.get(type) || '/albums'}?o=mr`));
      for (const tUrl of typeUrls) {
        try {
          const items = await fetchListing(`${SITE}${tUrl}`, 2, 80);
          for (const item of (items || [])) {
            channels.type.push(item.id);
            cacheIfAbsent(item);
          }
        } catch (e) {
          logSkip(tUrl, e);
        }
      }

      // Channel 6: albums related to the user's top favourites
      report(40, '通道⑥: 关联漫画...');
      for (const fav of (topFavAlbums || []).slice(0, 8)) {
        try {
          const html = await fetcher.enqueue(`${SITE}/album/${fav.id}/`, null, 2);
          if (!html) continue;
          const detail = Parser.parseDetail(html);
          for (const rid of (detail.related || [])) channels.related.push(rid);
          const cached = albumCache[fav.id];
          if (cached) {
            cached.tags = union(cached.tags, detail.tags);
            cached.authors = union(cached.authors, detail.authors);
          }
        } catch (e) {
          logSkip(`related ${fav.id}`, e);
        }
      }

      // Channel 7: random exploration of listing pages 1..200
      report(43, '通道⑦: 随机探索...');
      const maxPage = 200;
      for (let i = 0; i < CONFIG.EXPLORE_PAGES; i++) {
        const randPage = Math.floor(_random() * maxPage) + 1;
        try {
          const items = await fetchListing(`${SITE}/albums?page=${randPage}&o=mr`, 1, 60);
          for (const item of (items || [])) {
            channels.discover.push(item.id);
            cacheIfAbsent(item);
          }
        } catch (e) {
          logSkip(`discover p${randPage}`, e);
        }
        report(43 + Math.round((i / CONFIG.EXPLORE_PAGES) * 4), `⑦ 探索p${randPage}`);
      }

      // Channel 8: ranking pages; each entry also feeds the explore/comp sub-pools
      const rankings = CONFIG.RANKING_CHANNELS;
      report(47, `通道⑧: 排行矩阵×${rankings.length}...`);
      for (const [rcIdx, rc] of rankings.entries()) {
        const toExplore = rc.pool === 'explore' || rc.pool === 'both';
        const toComp = rc.pool === 'comp' || rc.pool === 'both';
        for (let pg = 1; pg <= rc.pages; pg++) {
          try {
            // A URL that already carries page= is used verbatim for every page.
            const pageUrl = rc.url.includes('page=')
              ? rc.url
              : `${rc.url}${rc.url.includes('?') ? '&' : '?'}page=${pg}`;
            const items = await fetchListing(`${SITE}${pageUrl}`, 2, 80);
            for (const item of (items || [])) {
              channels.rank.push(item.id);
              if (toExplore) channels.rank_explore.push(item.id);
              if (toComp) channels.rank_comp.push(item.id);
              const existing = albumCache[item.id];
              if (!existing) {
                albumCache[item.id] = item;
              } else {
                existing.tags = union(existing.tags, item.tags);
                existing.authors = union(existing.authors, item.authors);
              }
            }
          } catch (e) {
            logSkip(`rank ${rc.label} p${pg}`, e);
          }
          report(
            47 + Math.round(((rcIdx + pg / rc.pages) / rankings.length) * 10),
            `⑧ ${rc.label} p${pg}/${rc.pages}`
          );
        }
      }

      // Tag enrichment
      report(57, '补全标签数据...');

      if (fetcher._scanFailed) {
        report(0, '❌ 扫描失败: 请求超时', 'error');
        LOG.error('扫描终止: 部分请求超过10分钟重试上限');
        return;
      }

      // The cap counts successful enrichments (detail page yielded tags), not attempts.
      const ENRICH_CAP = 30;
      let enrichedCount = 0;
      for (const [id, data] of Object.entries(albumCache)) {
        if (enrichedCount >= ENRICH_CAP) break;
        if ((data.tags || []).length <= CONFIG.TAG_ENRICH_THRESHOLD) {
          try {
            const html = await fetcher.enqueue(`${SITE}/album/${id}/`, null, 1);
            if (!html) continue;
            const detail = Parser.parseDetail(html);
            if ((detail.tags || []).length === 0) continue;
            data.tags = union(data.tags, detail.tags);
            data.authors = union(data.authors, detail.authors);
            data.typeTags = union(data.typeTags, detail.typeTags);
            enrichedCount++;
          } catch (e) {
            logSkip(`enrich ${id}`, e);
          }
        }
      }
      report(60, `补全${enrichedCount}个标签`);

      // Merge: album id -> list of distinct channels that recalled it.
      // A plain object is kept on purpose; Object.keys ordering drives which
      // album wins a title collision below.
      const channelMap = {};
      for (const [chName, ids] of Object.entries(channels)) {
        for (const id of ids) {
          if (!channelMap[id]) channelMap[id] = [];
          if (!channelMap[id].includes(chName)) channelMap[id].push(chName);
        }
      }

      const allIds = Object.keys(channelMap);
      const blacklist = getBlacklist();
      // Ids coming out of Object.keys are strings, so the blacklist is compared as strings.
      const blacklistedIds = new Set(((blacklist && blacklist.albums) || []).map(String));
      const hardTypes = new Set(CONFIG.TYPE_HARD_BLACKLIST);
      const isHardType = (t) => hardTypes.has(t);
      const candidates = [];
      const seenTitles = new Set();
      const mergeStats = {
        total: allIds.length,
        blAlbum: 0,
        typeKilled: 0,
        titleDup: 0,
        trieDup: 0,
        survived: 0,
      };

      // Title key: lower-cased, bracketed segments removed, characters mapped
      // through TAG_NORMALIZE, whitespace and dash/tilde characters stripped.
      const normTitle = (t) => {
        let s = (t || '').toLowerCase();
        s = s.replace(/\[.*?\]/g, '').replace(/【.*?】/g, '').replace(/（.*?）/g, '').replace(/\(.*?\)/g, '');
        let r = '';
        for (const ch of s) {
          const v = TAG_NORMALIZE[ch];
          r += v !== undefined ? v : ch;
        }
        return r.replace(/[\s\-～~　]+/g, '').trim();
      };

      for (const id of allIds) {
        if (blacklistedIds.has(id)) {
          mergeStats.blAlbum++;
          continue;
        }
        const memberOf = channelMap[id];
        const albumData = albumCache[id] || { id, title: '', tags: [], authors: [], typeTags: [], views: 0 };
        const isTypeBlacklisted =
          (albumData.typeTags || []).some(isHardType) ||
          (albumData.tags || []).some(isHardType) ||
          (albumData.chapters || 0) > 20;
        if (isTypeBlacklisted) {
          mergeStats.typeKilled++;
          continue;
        }
        const nt = normTitle(albumData.title);
        if (nt && seenTitles.has(nt)) {
          mergeStats.titleDup++;
          continue;
        }
        if (nt && albumData.title) {
          const dc = _dedupCheck(albumData.title, id, false);
          if (dc.action === 'dup') {
            mergeStats.trieDup++;
            continue;
          }
        }
        if (nt) seenTitles.add(nt);
        mergeStats.survived++;
        if (!albumCache[id]) albumCache[id] = { ...albumData };
        albumCache[id].channels = memberOf;
        albumCache[id].channelCount = memberOf.length;
        candidates.push({ ...albumData, channelCount: memberOf.length, channels: memberOf });
      }

      LOG.info(`📊 合并去重: 原始${mergeStats.total} → -${mergeStats.blAlbum}黑名 -${mergeStats.typeKilled}类型 -${mergeStats.titleDup}标题 -${mergeStats.trieDup}AC去重 → 候选${mergeStats.survived}`);

      State.saveAlbumCache(albumCache);
      candidates.sort((a, b) => (b.channelCount || 0) - (a.channelCount || 0) || (b.views || 0) - (a.views || 0));
      State.saveCandidates(candidates.slice(0, CONFIG.CANDIDATE_POOL_MAX).map((c) => c.id));

      report(65, `候选池: ${candidates.length}个`);

      return candidates;
    },

    /**
     * Adds one album to the cache and, while the stored candidate list is
     * below CONFIG.CANDIDATE_POOL_MAX, to that list as well. Does nothing at
     * all when the album id is already cached.
     *
     * @param {{id: *}} albumData - Album record keyed by `id`.
     */
    addAlbum(albumData) {
      const cache = State.getAlbumCache();
      if (cache[albumData.id]) return;
      cache[albumData.id] = albumData;
      State.saveAlbumCache(cache);

      const candidates = State.getCandidates();
      const hasRoom = candidates.length < CONFIG.CANDIDATE_POOL_MAX;
      if (hasRoom && !candidates.includes(albumData.id)) {
        candidates.push(albumData.id);
        State.saveCandidates(candidates);
      }
    },

    /**
     * Looks up an album in the cache.
     *
     * @param {*} id - Album id.
     * @returns {object|null} The cached record, or null when absent.
     */
    getCachedAlbum(id) {
      return State.getAlbumCache()[id] || null;
    },
  };