JM Shelf - Parser

HTML 解析器 (OWL/列表/详情/分页) — JM Shelf 推荐脚本的模块库,通过 @require 被主脚本引用。

ეს სკრიპტი არ უნდა იყოს პირდაპირ დაინსტალირებული. ეს ბიბლიოთეკაა, სხვა სკრიპტებისთვის უნდა ჩართეთ მეტა-დირექტივაში // @require https://update.sleazyfork.org/scripts/581105/1842604/JM%20Shelf%20-%20Parser.js.

You will need to install an extension such as Tampermonkey, Greasemonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey to install this script.

You will need to install an extension such as Tampermonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey or Userscripts to install this script.

You will need to install an extension such as Tampermonkey to install this script.

You will need to install a user script manager extension to install this script.

(I already have a user script manager, let me install it!)

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install a user style manager extension to install this style.

You will need to install a user style manager extension to install this style.

You will need to install a user style manager extension to install this style.

(I already have a user style manager, let me install it!)

// ==UserScript==
// @name         JM Shelf - Parser
// @namespace    jmshelf-lib
// @version      1.0.0
// @author       Kesdi
// @description  HTML 解析器 (OWL/列表/详情/分页) — JM Shelf 推荐脚本的模块库,通过 @require 被主脚本引用。
// @license      MIT
// ==/UserScript==
// 
// 此文件是 GreasyFork 库(library),不直接安装。
// 请安装主脚本: JM Shelf 给杂鱼的个性化推荐
//

// ═══ [7] PARSER ═══
  // ============================================================
  /**
   * HTML parsers for album listing pages (owl-carousel and flat list layouts),
   * album detail pages and pagination controls.
   *
   * Relies on the browser `DOMParser`, plus `normalizeTags` and `parseViewCount`,
   * which are defined outside this block.
   */
  const Parser = (() => {
    // Category labels searched for in card / page text.
    // NOTE(review): the three layouts use slightly different lists; they are kept
    // exactly as they were — confirm whether the differences are intentional.
    const OWL_TYPE_PATTERNS = ['單本', '同人', '韓漫', '短篇', '漢化', '3D', '青年漫', '長篇', '一般向韓漫'];
    const LIST_TYPE_PATTERNS = ['單本', '同人', '韓漫', '短篇', '其他類', '漢化', '3D', '青年漫', '長篇', '一般向韓漫'];
    const DETAIL_TYPE_PATTERNS = ['單本', '同人', '韓漫', '短篇', '其他類', '漢化', '3D', '青年漫', '長篇'];

    // Values that show up in author (main_tag=2) links but are not author names.
    const OWL_NON_AUTHORS = new Set(['N/A', '濫交', '滥交', 'NTR', '完結', '連載', '中文', '漢化', '全彩', '無修正']);
    const DETAIL_NON_AUTHORS = new Set(['N/A', '濫交', '滥交', 'NTR', '完結', '完', '連載', '中文', '漢化', '全彩', '無修正', '巨乳', '蘿莉', '触手', '純愛', '劇情', '短篇', '長篇', '單本', '同人', '韓漫', '3D', 'CG']);

    // Links that belong to the category / sort bar at the top of list pages.
    const CATEGORY_PATHS = ['/albums/hanman', '/albums/doujin', '/albums/single', '/albums/another', '/albums/short', '/albums/meiman', '/albums/hanmansfw'];
    const SORT_LABELS = ['最新的', '最收藏', '最多收藏'];

    const ALBUM_ID_RE = /\/album\/(\d+)/;
    const SEARCH_QUERY_RE = /search_query=([^&]+)/;
    // \b keeps parameters such as `per_page=` from being read as a page number.
    const PAGE_PARAM_RE = /\bpage=(\d+)/;
    const RESULTS_PER_PAGE = 80;

    /** Percent-decode `value`; malformed sequences yield the raw text instead of throwing. */
    function safeDecode(value) {
      try {
        return decodeURIComponent(value);
      } catch {
        return value;
      }
    }

    /** Decoded `search_query` value of `href`, or null when the parameter is absent. */
    function searchQueryOf(href) {
      const match = href.match(SEARCH_QUERY_RE);
      return match ? safeDecode(match[1]) : null;
    }

    /** Patterns (in list order) that occur somewhere in `text`. */
    function matchTypeTags(text, patterns) {
      return patterns.filter((pattern) => text.includes(pattern));
    }

    /** Title of a list-layout album link: its text, else its image's alt/title. */
    function listLinkTitle(node) {
      const text = node.textContent.trim();
      if (text) return text;
      const img = node.querySelector('img');
      return img ? img.getAttribute('alt') || img.getAttribute('title') || '' : '';
    }

    /**
     * Concatenate the text nodes that follow `startNode` (its own descendants
     * included) in document order, stopping at `stopNode` (null = end of body).
     * Only text nodes are used: an element's textContent can reach past
     * `stopNode` and would pull in text that belongs to later albums.
     */
    function textBetween(doc, startNode, stopNode) {
      let text = '';
      try {
        const walker = doc.createTreeWalker(doc.body, NodeFilter.SHOW_TEXT | NodeFilter.SHOW_ELEMENT);
        // Start at the album link itself instead of re-walking from <body> each time.
        walker.currentNode = startNode;
        for (let node = walker.nextNode(); node && node !== stopNode; node = walker.nextNode()) {
          if (node.nodeType === Node.TEXT_NODE) text += ` ${node.textContent}`;
        }
      } catch {
        // Deliberate best effort: type tags are optional, so a missing or failing
        // TreeWalker must not abort the whole listing parse.
      }
      return text;
    }

    return {
      /**
       * Parse an album listing page, auto-detecting the layout: more than three
       * `.owl-item` elements means owl-carousel, anything else the flat list.
       *
       * @param {string} html - Raw HTML of the listing page.
       * @returns {Array<{id: string, title: string, tags: *, authors: string[], typeTags: *, views: number, likes: number}>}
       *   `tags` / `typeTags` are whatever `normalizeTags` returns.
       */
      parseListing(html) {
        const doc = new DOMParser().parseFromString(html, 'text/html');
        const owlItems = doc.querySelectorAll('.owl-item');
        return owlItems.length > 3 ? this._parseOwl(owlItems) : this._parseList(doc);
      },

      /**
       * Parse owl-carousel cards: every `.owl-item` holds one album card with
       * its own tag / author links.
       *
       * @param {Iterable<Element>} owlItems - The `.owl-item` elements.
       * @returns {Array<object>} One entry per distinct album id, in card order.
       */
      _parseOwl(owlItems) {
        const results = [];
        const seen = new Set();

        for (const item of owlItems) {
          const albumLink = item.querySelector('a[href*="/album/"]');
          if (!albumLink) continue;
          // Same id pattern as the other parsers; a trailing slash is not required.
          const idMatch = (albumLink.getAttribute('href') || '').match(ALBUM_ID_RE);
          if (!idMatch || seen.has(idMatch[1])) continue;
          const id = idMatch[1];
          seen.add(id);

          // Title: the .video-title element wins, otherwise the image's title/alt.
          const videoTitle = item.querySelector('.video-title');
          const img = item.querySelector('img');
          let title = '';
          if (videoTitle) {
            title = videoTitle.textContent.trim();
          } else if (img) {
            title = img.getAttribute('title') || img.getAttribute('alt') || '';
          }

          // Search links carrying main_tag=2 are authors, the rest are tags.
          const tags = [];
          const authors = [];
          for (const link of item.querySelectorAll('a[href*="search_query"]')) {
            const href = link.getAttribute('href') || '';
            const name = searchQueryOf(href);
            if (name === null) continue;
            if (href.includes('main_tag=2')) {
              if (!OWL_NON_AUTHORS.has(name) && !authors.includes(name)) authors.push(name);
            } else if (!tags.includes(name)) {
              tags.push(name);
            }
          }

          const cardText = item.textContent || '';
          // View counter in "12.3K" form.
          const viewMatch = cardText.match(/(\d+\.?\d*)K/);
          const views = viewMatch ? Math.round(Number.parseFloat(viewMatch[1]) * 1000) : 0;

          results.push({
            id,
            title: (title || '').substring(0, 200),
            tags: normalizeTags(tags),
            authors,
            typeTags: normalizeTags(matchTypeTags(cardText, OWL_TYPE_PATTERNS)),
            views,
            // This layout has no separate like counter; views is used for both.
            likes: views,
          });
        }
        return results;
      },

      /**
       * Parse the flat list layout by scanning all links in document order:
       * every link between one album link and the next *different* album link
       * is attributed to the former.
       *
       * @param {Document} doc - Parsed listing document.
       * @returns {Array<object>} One entry per distinct album id, in page order.
       */
      _parseList(doc) {
        // Snapshot every link with the facts the scan needs.
        const linkNodes = [];
        for (const node of doc.querySelectorAll('a[href]')) {
          const href = node.getAttribute('href') || '';
          const albumMatch = href.match(ALBUM_ID_RE);
          linkNodes.push({
            node,
            href,
            albumId: albumMatch ? albumMatch[1] : null,
            searchTag: searchQueryOf(href),
            isAuthor: href.includes('main_tag=2'),
            isBookmark: href.includes('login-modal') || href.includes('bookmark'),
          });
        }

        // Index of the last category / sort link; album links before it (nav bar,
        // banners) are ignored. -1 means "no bar found", so nothing is skipped.
        let categoryBarEnd = -1;
        linkNodes.forEach(({ node, href }, index) => {
          const isCategory = CATEGORY_PATHS.some((path) => href.includes(path));
          if (isCategory || SORT_LABELS.includes(node.textContent.trim())) categoryBarEnd = index;
        });

        const albumEntries = [];
        linkNodes.forEach(({ node, albumId }, index) => {
          if (albumId !== null && index > categoryBarEnd) albumEntries.push({ index, id: albumId, node });
        });

        const results = [];
        const seen = new Set();
        for (let ei = 0; ei < albumEntries.length; ei++) {
          const entry = albumEntries[ei];
          if (seen.has(entry.id)) continue;
          seen.add(entry.id);

          // A card usually links its album more than once (cover + title). Treat
          // such consecutive links as one album so the range reaches its tags.
          let next = ei + 1;
          while (next < albumEntries.length && albumEntries[next].id === entry.id) next++;
          const endIdx = next < albumEntries.length ? albumEntries[next].index : linkNodes.length;

          let title = '';
          for (let k = ei; k < next && !title; k++) title = listLinkTitle(albumEntries[k].node);

          const tags = [];
          const authors = [];
          let views = 0;
          let likes = 0;
          for (let j = entry.index + 1; j < endIdx; j++) {
            const ln = linkNodes[j];
            if (ln.searchTag) {
              if (ln.isAuthor) {
                if (ln.searchTag !== 'N/A' && !authors.includes(ln.searchTag)) authors.push(ln.searchTag);
              } else if (!tags.includes(ln.searchTag)) {
                tags.push(ln.searchTag);
              }
              continue;
            }
            if (ln.albumId !== null || ln.isBookmark) continue;
            // Purely numeric link text: the first one is views, the second likes.
            const countMatch = ln.node.textContent.trim().match(/^([\d,]+[KMB]?)$/);
            if (!countMatch) continue;
            if (views === 0) views = parseViewCount(countMatch[1]);
            else if (likes === 0) likes = parseViewCount(countMatch[1]);
          }
          if (likes === 0) likes = views;

          const stopNode = endIdx < linkNodes.length ? linkNodes[endIdx].node : null;
          const rangeText = textBetween(doc, entry.node, stopNode);

          results.push({
            id: entry.id,
            title: title.substring(0, 200),
            tags: normalizeTags(tags),
            authors,
            typeTags: normalizeTags(matchTypeTags(rangeText, LIST_TYPE_PATTERNS)),
            views,
            likes,
          });
        }
        return results;
      },

      /**
       * Parse an album detail page. Tags and authors are taken from the
       * `span[itemprop]` blocks so the two cannot be confused.
       *
       * @param {string} html - Raw HTML of the detail page.
       * @returns {{tags: *, authors: string[], typeTags: *, related: string[], desc: string, title: string}}
       *   `related` holds every distinct album id linked from the page.
       */
      parseDetail(html) {
        const doc = new DOMParser().parseFromString(html, 'text/html');
        const result = { tags: [], authors: [], typeTags: [], related: [], desc: '', title: '' };

        const h1 = doc.querySelector('h1');
        if (h1) result.title = h1.textContent.trim();

        // Description: text of the parent of the first <h2> starting with the label.
        for (const h2 of doc.querySelectorAll('h2')) {
          if (h2.textContent.trim().startsWith('敘述:')) {
            result.desc = (h2.parentElement?.textContent || '').replace('敘述:', '').trim().substring(0, 500);
            break;
          }
        }

        const tagSet = new Set();
        for (const span of doc.querySelectorAll('span[itemprop="genre"][data-type="tags"]')) {
          for (const link of span.querySelectorAll('a')) {
            const name = searchQueryOf(link.getAttribute('href') || '');
            if (name !== null) tagSet.add(name);
          }
        }

        const authorSet = new Set();
        for (const span of doc.querySelectorAll('span[itemprop="author"][data-type="author"]')) {
          for (const link of span.querySelectorAll('a[href*="search_query"]')) {
            const name = searchQueryOf(link.getAttribute('href') || '');
            if (name === null) continue;
            // Reject placeholders, one-character values, anything with a space
            // and known non-author keywords.
            const plausible = name !== 'N/A' && name.length >= 2 && !name.includes(' ');
            if (plausible && !DETAIL_NON_AUTHORS.has(name)) authorSet.add(name);
          }
        }
        result.authors = [...authorSet];

        // Type tags are looked up in the first 5000 characters of the body text.
        const mainText = (doc.body?.textContent || '').substring(0, 5000);
        const typeTags = matchTypeTags(mainText, DETAIL_TYPE_PATTERNS);

        const relatedSeen = new Set();
        for (const link of doc.querySelectorAll('a[href*="/album/"]')) {
          const match = (link.getAttribute('href') || '').match(ALBUM_ID_RE);
          if (match && !relatedSeen.has(match[1])) {
            relatedSeen.add(match[1]);
            result.related.push(match[1]);
          }
        }

        result.tags = normalizeTags([...tagSet]);
        result.typeTags = normalizeTags(typeTags);
        return result;
      },

      /**
       * Extract pagination info from a listing page.
       *
       * The result is the larger of the highest `page=N` link and the page count
       * implied by an "N 搜索結果 / 搜尋結果" total at 80 results per page.
       *
       * @param {string} html - Raw HTML of the listing page.
       * @returns {{totalPages: number}} Always at least 1.
       */
      parsePagination(html) {
        const doc = new DOMParser().parseFromString(html, 'text/html');
        let maxPage = 1;

        for (const link of doc.querySelectorAll('a[href*="page="]')) {
          const match = (link.getAttribute('href') || '').match(PAGE_PARAM_RE);
          if (match) maxPage = Math.max(maxPage, Number.parseInt(match[1], 10));
        }

        const bodyText = doc.body?.textContent || '';
        const totalMatch = bodyText.match(/(\d+[\d,]*)\s*搜[索尋]結果/);
        if (totalMatch) {
          const totalItems = Number.parseInt(totalMatch[1].replace(/,/g, ''), 10);
          maxPage = Math.max(maxPage, Math.ceil(totalItems / RESULTS_PER_PAGE));
        }
        return { totalPages: maxPage || 1 };
      },
    };
  })();