8通道候选池构建与合并去重 — JM Shelf 推荐脚本的模块库,通过 @require 被主脚本引用。
Bu script direkt olarak kurulamaz. Başka scriptler için bir kütüphanedir ve meta yönergeleri içerir // @require https://update.sleazyfork.org/scripts/581108/1842607/JM%20Shelf%20-%20Candidate%20Pool.js
// ==UserScript==
// @name JM Shelf - Candidate Pool
// @namespace jmshelf-lib
// @version 1.0.0
// @author Kesdi
// @description 8通道候选池构建与合并去重 — JM Shelf 推荐脚本的模块库,通过 @require 被主脚本引用。
// @license MIT
// ==/UserScript==
//
// 此文件是 GreasyFork 库(library),不直接安装。
// 请安装主脚本: JM Shelf 给杂鱼的个性化推荐
//
// ═══ [10] CANDIDATE POOL ═══ - Multi-channel recall
// ============================================================
/**
 * CandidatePool — multi-channel recall for the recommender.
 *
 * `build` queries eight kinds of listing pages, records which channel(s)
 * surfaced each album id, merges listing metadata into the album cache held
 * by `State`, then filters/deduplicates the union into a ranked candidate list.
 * `addAlbum` / `getCachedAlbum` are small accessors over the same `State` cache.
 *
 * Relies on names defined elsewhere in the script: State, CONFIG, fetcher,
 * Parser, ProfileManager, LOG, normalizeTag, getBlacklist, TAG_NORMALIZE,
 * _dedupCheck and _random.
 */
const CandidatePool = {
  /**
   * Builds the candidate pool.
   *
   * @param {object} profile - User profile; passed to ProfileManager and read
   *   directly for `profile.types` (a type -> weight map, highest weights first).
   * @param {Array<{id: *}>} topFavAlbums - Favourite albums; the first 8 seed
   *   the "related" channel.
   * @param {(event: {phase: string, progress: number, message: string}) => void} [onProgress]
   *   Optional progress callback.
   * @returns {Promise<Array<object>|undefined>} Candidates sorted by channel
   *   count, then views (both descending). Resolves to `undefined` when
   *   `fetcher._scanFailed` is set after the recall phase.
   */
  async build(profile, topFavAlbums, onProgress) {
    const SITE = 'https://18comic.vip';
    const MAX_COMBO_TAGS = 8;
    const MAX_COMBO_QUERIES = 12;
    const MAX_AUTHORS = 10;
    const MAX_RELATED_SEEDS = 8;
    const MAX_ENRICHED = 30;
    const MAX_CHAPTERS = 20;
    const DISCOVER_MAX_PAGE = 200;

    const channels = {};
    const albumCache = State.getAlbumCache();

    // Emits a progress event only when a callback was supplied.
    const report = (progress, message, phase = 'candidates') => {
      if (onProgress) onProgress({ phase, progress, message });
    };

    // Recall is best-effort: a failing page must not abort the build, but the
    // failure is logged instead of being swallowed silently.
    const noteFailure = (stage, err) => {
      LOG.error(`[CandidatePool] ${stage} skipped: ${(err && err.message) || err}`);
    };

    // Order-preserving union that tolerates a missing list on either side.
    // Cached entries are not guaranteed to carry every field; an unguarded
    // spread would throw and drop the rest of the page being processed.
    const union = (a, b) => [...new Set([...(a || []), ...(b || [])])];

    // Stores listing metadata only for albums not cached yet.
    const cacheIfNew = (item) => {
      if (!albumCache[item.id]) albumCache[item.id] = item;
    };

    // Fetches one listing page and records up to `limit` ids under `channel`.
    // `buildUrl` is invoked inside the try so URL-encoding errors are contained.
    const collectListing = async (channel, buildUrl, priority, limit, stage) => {
      try {
        const html = await fetcher.enqueue(buildUrl(), null, priority);
        if (!html) return;
        for (const item of Parser.parseListing(html).slice(0, limit)) {
          channels[channel].push(item.id);
          cacheIfNew(item);
        }
      } catch (err) {
        noteFailure(stage, err);
      }
    };

    // Walks up to `pages` result pages of a search, optionally injecting the
    // searched tag into sparsely tagged items, and stores a global frequency
    // estimate for that tag derived from the first page's pagination.
    const fetchTagResults = async (baseUrl, injectTag, pages, maxItems) => {
      const ids = [];
      let globalEstimate = 0;
      const joiner = baseUrl.includes('?') ? '&' : '?';
      for (let pg = 1; pg <= pages; pg++) {
        const url = `${baseUrl}${joiner}page=${pg}`;
        try {
          // The first page is enqueued with priority 3, later pages with 2.
          const html = await fetcher.enqueue(url, null, pg === 1 ? 3 : 2);
          if (!html) continue;
          const items = Parser.parseListing(html).slice(0, maxItems);
          for (const item of items) {
            if (injectTag && item.tags.length < 2 && !item.tags.includes(injectTag)) {
              item.tags.push(injectTag);
            }
            ids.push(item.id);
            cacheIfNew(item);
          }
          if (pg === 1 && injectTag) {
            // NOTE(review): 40 looks like an assumed results-per-page figure — confirm.
            globalEstimate = (Parser.parsePagination(html).totalPages || 1) * 40;
          }
          // A page less than half full is treated as the end of the results.
          if (items.length < maxItems / 2) break;
        } catch (err) {
          noteFailure(`tag search ${url}`, err);
        }
      }
      if (injectTag && globalEstimate > 0) {
        const tagFreq = State.getTagFreq();
        tagFreq[normalizeTag(injectTag)] = globalEstimate;
        State.saveTagFreq(tagFreq);
      }
      return ids;
    };

    report(0, '构建候选池...');

    // ── Channel 1: search by each top profile tag ──
    const topTags = ProfileManager.getTopTags(profile, CONFIG.TOP_TAGS_COUNT);
    report(2, `通道①: 标签召回×${CONFIG.SEARCH_PAGES}页 (${topTags.length}个标签)...`);
    channels.tag = [];
    for (let i = 0; i < topTags.length; i++) {
      const tag = topTags[i];
      const baseUrl = `${SITE}/search/photos?search_query=${encodeURIComponent(tag)}&main_tag=0`;
      const ids = await fetchTagResults(baseUrl, tag, CONFIG.SEARCH_PAGES, 80);
      channels.tag.push(...ids);
      report(2 + Math.round((i / topTags.length) * 16), `① ${tag} (${ids.length}本)${ids.length===0?' ⚠':''}`);
    }

    // ── Channel 2: /albums browse; listing data is merged over cached data ──
    report(18, `通道②: /albums标签补充×${CONFIG.ALBUMS_PAGES}页...`);
    channels.albums = [];
    for (let pg = 1; pg <= CONFIG.ALBUMS_PAGES; pg++) {
      try {
        const html = await fetcher.enqueue(`${SITE}/albums?page=${pg}&o=mr`, null, 2);
        if (html) {
          for (const item of Parser.parseListing(html).slice(0, 80)) {
            channels.albums.push(item.id);
            const existing = albumCache[item.id];
            albumCache[item.id] = existing
              ? { ...existing, ...item, tags: union(existing.tags, item.tags) }
              : item;
          }
        }
      } catch (err) {
        noteFailure(`/albums page ${pg}`, err);
      }
      report(18 + Math.round((pg / CONFIG.ALBUMS_PAGES) * 10), `② /albums p${pg}`);
    }

    // ── Channel 3: pairwise combinations of the leading tags ──
    report(28, '通道③: 标签组合召回...');
    channels.combo = [];
    const comboTags = topTags.slice(0, CONFIG.TAG_COMBO_TOP).slice(0, MAX_COMBO_TAGS);
    const comboPairs = [];
    for (let i = 0; i < comboTags.length; i++) {
      for (let j = i + 1; j < comboTags.length; j++) {
        comboPairs.push([comboTags[i], comboTags[j]]);
      }
    }
    // Only the first MAX_COMBO_QUERIES pairs are queried, failed ones included.
    for (const [tagA, tagB] of comboPairs.slice(0, MAX_COMBO_QUERIES)) {
      const url = `${SITE}/search/photos?search_query=${encodeURIComponent(tagA + ' ' + tagB)}&main_tag=0`;
      try {
        const html = await fetcher.enqueue(url, null, 2);
        if (html) {
          for (const item of Parser.parseListing(html).slice(0, 30)) {
            if (item.tags.length < 3) {
              if (!item.tags.includes(tagA)) item.tags.push(tagA);
              if (!item.tags.includes(tagB)) item.tags.push(tagB);
            }
            channels.combo.push(item.id);
            cacheIfNew(item);
          }
        }
      } catch (err) {
        noteFailure(`combo search ${tagA}+${tagB}`, err);
      }
    }

    // ── Channel 4: search by top authors ──
    const topAuthors = ProfileManager.getTopAuthors(profile, CONFIG.TOP_AUTHORS_MIN_WORKS);
    const auCount = Math.min(topAuthors.length, MAX_AUTHORS);
    report(33, `通道④: 作者召回(${auCount})...`);
    channels.author = [];
    for (let i = 0; i < auCount; i++) {
      const author = topAuthors[i];
      await collectListing(
        'author',
        () => `${SITE}/search/photos?search_query=${encodeURIComponent(author)}&main_tag=2`,
        4,
        40,
        `author search ${author}`,
      );
      report(33 + Math.round((i / auCount) * 4), `④ ${author}`);
    }

    // ── Channel 5: browse the listing of the four heaviest profile types ──
    report(37, '通道⑤: 类型浏览...');
    channels.type = [];
    const TYPE_PATHS = new Map([
      ['韓漫', '/albums/hanman'],
      ['同人', '/albums/doujin'],
      ['單本', '/albums/single'],
      ['短篇', '/albums/short'],
    ]);
    const typePrefs = Object.entries(profile.types).sort((a, b) => b[1] - a[1]).slice(0, 4);
    // Unmapped types fall back to /albums; the Set avoids requesting the same
    // URL once per unmapped type.
    const typeUrls = [...new Set(typePrefs.map(([type]) => `${TYPE_PATHS.get(type) || '/albums'}?o=mr`))];
    for (const tUrl of typeUrls) {
      await collectListing('type', () => `${SITE}${tUrl}`, 2, 80, `type browse ${tUrl}`);
    }

    // ── Channel 6: "related" ids from the detail pages of top favourites ──
    report(40, '通道⑥: 关联漫画...');
    channels.related = [];
    for (const fav of topFavAlbums.slice(0, MAX_RELATED_SEEDS)) {
      try {
        const html = await fetcher.enqueue(`${SITE}/album/${fav.id}/`, null, 2);
        if (!html) continue;
        const detail = Parser.parseDetail(html);
        channels.related.push(...(detail.related || []));
        // The detail page also refreshes the favourite's own cached metadata.
        const cached = albumCache[fav.id];
        if (cached) {
          cached.tags = union(cached.tags, detail.tags);
          cached.authors = union(cached.authors, detail.authors);
        }
      } catch (err) {
        noteFailure(`related lookup ${fav.id}`, err);
      }
    }

    // ── Channel 7: random pages of the /albums listing ──
    report(43, '通道⑦: 随机探索...');
    channels.discover = [];
    for (let i = 0; i < CONFIG.EXPLORE_PAGES; i++) {
      const randPage = Math.floor(_random() * DISCOVER_MAX_PAGE) + 1;
      await collectListing(
        'discover',
        () => `${SITE}/albums?page=${randPage}&o=mr`,
        1,
        60,
        `discover page ${randPage}`,
      );
      report(43 + Math.round((i / CONFIG.EXPLORE_PAGES) * 4), `⑦ 探索p${randPage}`);
    }

    // ── Channel 8: configured ranking listings ──
    report(47, `通道⑧: 排行矩阵×${CONFIG.RANKING_CHANNELS.length}...`);
    channels.rank = [];
    channels.rank_explore = [];
    channels.rank_comp = [];
    let rankIdx = 0;
    for (const rc of CONFIG.RANKING_CHANNELS) {
      const toExplore = rc.pool === 'explore' || rc.pool === 'both';
      const toComp = rc.pool === 'comp' || rc.pool === 'both';
      for (let pg = 1; pg <= rc.pages; pg++) {
        try {
          // A URL that already carries `page=` is used verbatim for every pg.
          const suffix = rc.url.includes('page=') ? '' : `${rc.url.includes('?') ? '&' : '?'}page=${pg}`;
          const html = await fetcher.enqueue(`${SITE}${rc.url}${suffix}`, null, 2);
          if (html) {
            for (const item of Parser.parseListing(html).slice(0, 80)) {
              channels.rank.push(item.id);
              if (toExplore) channels.rank_explore.push(item.id);
              if (toComp) channels.rank_comp.push(item.id);
              const existing = albumCache[item.id];
              if (!existing) {
                albumCache[item.id] = item;
              } else {
                existing.tags = union(existing.tags, item.tags);
                existing.authors = union(existing.authors, item.authors);
              }
            }
          }
        } catch (err) {
          noteFailure(`ranking ${rc.label} page ${pg}`, err);
        }
        report(
          47 + Math.round(((rankIdx + pg / rc.pages) / CONFIG.RANKING_CHANNELS.length) * 10),
          `⑧ ${rc.label} p${pg}/${rc.pages}`,
        );
      }
      rankIdx++;
    }

    // ── Tag enrichment from detail pages ──
    report(57, '补全标签数据...');
    if (fetcher._scanFailed) {
      report(0, '❌ 扫描失败: 请求超时', 'error');
      LOG.error('扫描终止: 部分请求超过10分钟重试上限');
      // Callers receive `undefined` (not an empty list) for an aborted scan.
      return undefined;
    }
    let enrichedCount = 0;
    for (const [id, data] of Object.entries(albumCache)) {
      // Only successful enrichments count against the budget.
      if (enrichedCount >= MAX_ENRICHED) break;
      if (!data || (data.tags || []).length > CONFIG.TAG_ENRICH_THRESHOLD) continue;
      try {
        const html = await fetcher.enqueue(`${SITE}/album/${id}/`, null, 1);
        if (!html) continue;
        const detail = Parser.parseDetail(html);
        if ((detail.tags || []).length > 0) {
          data.tags = union(data.tags, detail.tags);
          data.authors = union(data.authors, detail.authors);
          data.typeTags = union(data.typeTags, detail.typeTags);
          enrichedCount++;
        }
      } catch (err) {
        noteFailure(`tag enrichment ${id}`, err);
      }
    }
    report(60, `补全${enrichedCount}个标签`);

    // ── Merge channels: id -> list of channel names that surfaced it ──
    // Null-prototype map so an id can never collide with Object.prototype members.
    const channelMap = Object.create(null);
    for (const [chName, ids] of Object.entries(channels)) {
      for (const id of ids) {
        if (!channelMap[id]) channelMap[id] = [];
        if (!channelMap[id].includes(chName)) channelMap[id].push(chName);
      }
    }
    const allIds = Object.keys(channelMap);

    // Object keys are strings, so blacklist ids are stringified before lookup.
    const blacklist = getBlacklist();
    const blockedIds = new Set(((blacklist && blacklist.albums) || []).map(String));

    // Canonical title used for duplicate detection: lower-cased, bracketed
    // segments removed, characters mapped through TAG_NORMALIZE, separators
    // dropped. Full-width parentheses and tilde are written as escapes so the
    // pattern cannot degrade into an unescaped (no-op) capture group.
    const normTitle = (title) => {
      const stripped = (title || '')
        .toLowerCase()
        .replace(/\[.*?\]/g, '')
        .replace(/【.*?】/g, '')
        .replace(/\uFF08.*?\uFF09/g, '')
        .replace(/\(.*?\)/g, '');
      let mapped = '';
      for (const ch of stripped) {
        const v = TAG_NORMALIZE[ch];
        mapped += v !== undefined ? v : ch;
      }
      return mapped.replace(/[\s\-~\uFF5E]+/g, '').trim();
    };

    const candidates = [];
    const seenTitles = new Set();
    const mergeStats = { total: allIds.length, blAlbum: 0, typeKilled: 0, titleDup: 0, trieDup: 0, survived: 0 };
    const isHardBlocked = (t) => CONFIG.TYPE_HARD_BLACKLIST.includes(t);

    for (const id of allIds) {
      if (blockedIds.has(id)) { mergeStats.blAlbum++; continue; }

      const memberOf = channelMap[id];
      const albumData = albumCache[id] || { id, title: '', tags: [], authors: [], typeTags: [], views: 0 };

      const isTypeBlacklisted =
        (albumData.typeTags || []).some(isHardBlocked) ||
        (albumData.tags || []).some(isHardBlocked) ||
        (albumData.chapters || 0) > MAX_CHAPTERS;
      if (isTypeBlacklisted) { mergeStats.typeKilled++; continue; }

      const nt = normTitle(albumData.title);
      if (nt && seenTitles.has(nt)) { mergeStats.titleDup++; continue; }
      if (nt && albumData.title) {
        const dc = _dedupCheck(albumData.title, id, false);
        if (dc.action === 'dup') { mergeStats.trieDup++; continue; }
      }
      if (nt) seenTitles.add(nt);

      mergeStats.survived++;
      if (!albumCache[id]) albumCache[id] = { ...albumData };
      albumCache[id].channels = memberOf;
      albumCache[id].channelCount = memberOf.length;
      candidates.push({ ...albumData, channelCount: memberOf.length, channels: memberOf });
    }

    LOG.info(`📊 合并去重: 原始${mergeStats.total} → -${mergeStats.blAlbum}黑名 -${mergeStats.typeKilled}类型 -${mergeStats.titleDup||0}标题 -${mergeStats.trieDup||0}AC去重 → 候选${mergeStats.survived}`);
    State.saveAlbumCache(albumCache);

    candidates.sort((a, b) => (b.channelCount||0) - (a.channelCount||0) || (b.views||0) - (a.views||0));
    // Only the leading CANDIDATE_POOL_MAX ids are persisted; the full list is returned.
    State.saveCandidates(candidates.slice(0, CONFIG.CANDIDATE_POOL_MAX).map(c => c.id));
    report(65, `候选池: ${candidates.length}个`);
    return candidates;
  },

  /**
   * Adds an album to the cache, and to the persisted candidate ids while the
   * pool is below CONFIG.CANDIDATE_POOL_MAX. Does nothing when the id is
   * already cached.
   *
   * @param {{id: *}} albumData - Album record keyed by its `id`.
   */
  addAlbum(albumData) {
    const cache = State.getAlbumCache();
    if (cache[albumData.id]) return;
    cache[albumData.id] = albumData;
    State.saveAlbumCache(cache);

    const candidates = State.getCandidates();
    const hasRoom = candidates.length < CONFIG.CANDIDATE_POOL_MAX;
    if (hasRoom && !candidates.includes(albumData.id)) {
      candidates.push(albumData.id);
      State.saveCandidates(candidates);
    }
  },

  /**
   * Looks up an album in the cache.
   *
   * @param {*} id - Album id.
   * @returns {object|null} The cached record, or null when absent/falsy.
   */
  getCachedAlbum(id) {
    return State.getAlbumCache()[id] || null;
  },
};