JM Shelf - Parser - Source code

JM Shelf - Parser

HTML 解析器 (OWL/列表/详情/分页) — JM Shelf 推荐脚本的模块库，通过 @require 被主脚本引用。
ეს სკრიპტი არ უნდა იყოს პირდაპირ დაინსტალირებული. ეს ბიბლიოთეკაა, სხვა სკრიპტებისთვის უნდა ჩართეთ მეტა-დირექტივაში // @require https://update.sleazyfork.org/scripts/581105/1842604/JM%20Shelf%20-%20Parser.js.
Ask a question, post a review, or report the script.
Wrap lines
// ==UserScript==
// @name         JM Shelf - Parser
// @namespace    jmshelf-lib
// @version      1.0.0
// @author       Kesdi
// @description  HTML 解析器 (OWL/列表/详情/分页) — JM Shelf 推荐脚本的模块库，通过 @require 被主脚本引用。
// @license      MIT
// ==/UserScript==
// 
// 此文件是 GreasyFork 库(library)，不直接安装。
// 请安装主脚本: JM Shelf 给杂鱼的个性化推荐
//

// ═══ [7] PARSER ═══
  // ============================================================
  /**
   * HTML parsers for album listing pages, album detail pages and pagination.
   *
   * Every public method takes a raw HTML string, parses it with DOMParser and
   * returns plain data. The block relies on two helpers defined outside of it:
   * normalizeTags() (assumed to take and return an array of tag strings) and
   * parseViewCount() (assumed to turn a string such as "12K" into a number).
   *
   * Internal helpers are reached through `Parser.` rather than `this`, so the
   * methods that never depended on `this` keep working when called unbound.
   */
  const Parser = {
    /**
     * Parse an album listing page.
     * Auto-detects owl-carousel vs list layout.
     *
     * @param {string} html - Raw HTML of the listing page.
     * @returns {Array<{id: string, title: string, tags: string[], authors: string[],
     *   typeTags: string[], views: number, likes: number}>}
     */
    parseListing(html) {
      const doc = new DOMParser().parseFromString(html, 'text/html');

      // More than three .owl-item nodes is taken as the carousel layout
      // (search results, homepage sections); anything else is parsed as a list.
      const owlItems = doc.querySelectorAll('.owl-item');
      if (owlItems.length > 3) {
        return this._parseOwl(owlItems);
      }
      return this._parseList(doc);
    },

    /**
     * Extract and decode the `search_query` value of a link href.
     *
     * @param {string} href - The href attribute value.
     * @returns {string|null} Decoded value, the raw value when the percent
     *   encoding is malformed, or null when the href has no search_query.
     */
    _searchQueryOf(href) {
      const match = href.match(/search_query=([^&]+)/);
      if (!match) return null;
      try {
        return decodeURIComponent(match[1]);
      } catch (err) {
        // A single malformed escape must not abort parsing of the whole page.
        if (err instanceof URIError) return match[1];
        throw err;
      }
    },

    /**
     * Return the patterns (in list order) that occur as substrings of `text`.
     *
     * @param {string} text - Text to search.
     * @param {string[]} patterns - Candidate type labels.
     * @returns {string[]}
     */
    _matchTypeTags(text, patterns) {
      return patterns.filter((pattern) => text.includes(pattern));
    },

    /**
     * Concatenate the text nodes that follow `startNode` in document order
     * (including its own descendants) up to, but excluding, `endNode`.
     * Only text nodes are collected: an element's textContent would also
     * contain text located after `endNode`.
     *
     * @param {Document} doc - Document both nodes belong to.
     * @param {Node} startNode - Node after which collection starts.
     * @param {Node|null} endNode - Node at which collection stops; null means
     *   "until the end of the body".
     * @returns {string} Space-separated text, or '' when the walk is not possible.
     */
    _textBetween(doc, startNode, endNode) {
      let text = '';
      try {
        const walker = doc.createTreeWalker(
          doc.body,
          NodeFilter.SHOW_TEXT | NodeFilter.SHOW_ELEMENT
        );
        // Start right at the album link instead of re-walking the document.
        walker.currentNode = startNode;
        for (let node = walker.nextNode(); node && node !== endNode; node = walker.nextNode()) {
          if (node.nodeType === Node.TEXT_NODE) {
            text += ' ' + node.textContent;
          }
        }
      } catch (err) {
        // Deliberate best-effort: without a walkable body the caller simply
        // gets no type tags for this album.
      }
      return text;
    },

    /**
     * Parse owl-carousel layout: each .owl-item contains one album card.
     * Tags and authors are read from the search links inside the card.
     *
     * @param {Iterable<Element>} owlItems - The .owl-item elements.
     * @returns {Array<Object>} One record per distinct album id; `likes`
     *   mirrors `views` because this layout exposes a single counter.
     */
    _parseOwl(owlItems) {
      const typePatterns = ['單本', '同人', '韓漫', '短篇', '漢化', '3D', '青年漫', '長篇', '一般向韓漫'];
      // Values that show up on author links (main_tag=2) but are not authors.
      const nonAuthorTags = new Set(['N/A', '濫交', '滥交', 'NTR', '完結', '連載', '中文', '漢化', '全彩', '無修正']);
      const results = [];
      const seen = new Set();

      for (const item of owlItems) {
        const albumLink = item.querySelector('a[href*="/album/"]');
        if (!albumLink) continue;
        // Accept "/album/123", "/album/123/..." and "/album/123?..." alike,
        // matching what the list and detail parsers accept.
        const idMatch = (albumLink.getAttribute('href') || '').match(/\/album\/(\d+)(?:[/?#]|$)/);
        if (!idMatch || seen.has(idMatch[1])) continue;
        const id = idMatch[1];
        seen.add(id);

        // Title from the video-title span, falling back to the image title/alt.
        const videoTitle = item.querySelector('.video-title');
        const img = item.querySelector('img');
        let title = videoTitle ? (videoTitle.textContent || '').trim() : '';
        if (!title && img) {
          title = img.getAttribute('title') || img.getAttribute('alt') || '';
        }

        // Tags and authors from within the card.
        const tags = new Set();
        const authors = new Set();
        for (const link of item.querySelectorAll('a[href*="search_query"]')) {
          const href = link.getAttribute('href') || '';
          const name = Parser._searchQueryOf(href);
          if (name === null) continue;
          if (href.includes('main_tag=2')) {
            if (!nonAuthorTags.has(name)) authors.add(name);
          } else {
            tags.add(name);
          }
        }

        const cardText = item.textContent || '';

        // View count: first "<number>K" found in the card text.
        const viewMatch = cardText.match(/(\d+\.?\d*)K/);
        const views = viewMatch ? Math.round(parseFloat(viewMatch[1]) * 1000) : 0;

        results.push({
          id,
          title: title.substring(0, 200),
          tags: normalizeTags([...tags]),
          authors: [...authors],
          typeTags: normalizeTags(Parser._matchTypeTags(cardText, typePatterns)),
          views,
          likes: views,
        });
      }
      return results;
    },

    /**
     * Parse list-style layout (e.g. /albums pages).
     * Sequential scanning: every link of the document is taken in order, and
     * the links between one album link and the next *different* album belong
     * to the former.
     *
     * @param {Document} doc - Parsed listing document.
     * @returns {Array<Object>} One record per distinct album id.
     */
    _parseList(doc) {
      const typePatterns = ['單本', '同人', '韓漫', '短篇', '其他類', '漢化', '3D', '青年漫', '長篇', '一般向韓漫'];
      const categoryPaths = [
        '/albums/hanman', '/albums/doujin', '/albums/single', '/albums/another',
        '/albums/short', '/albums/meiman', '/albums/hanmansfw',
      ];
      const sortLabels = ['最新的', '最收藏', '最多收藏'];

      // Collect ALL links in document order and remember the last link that
      // belongs to the category/sort bar. Albums before it (nav bar, banners)
      // are ignored. -1 means "no bar found", so nothing is skipped.
      const linkNodes = [];
      let categoryBarEnd = -1;
      for (const link of doc.querySelectorAll('a[href]')) {
        const href = link.getAttribute('href') || '';
        const text = (link.textContent || '').trim();
        const albumMatch = href.match(/\/album\/(\d+)/);
        if (categoryPaths.some((path) => href.includes(path)) || sortLabels.includes(text)) {
          categoryBarEnd = linkNodes.length;
        }
        linkNodes.push({
          node: link,
          text,
          albumId: albumMatch ? albumMatch[1] : null,
          searchTag: Parser._searchQueryOf(href),
          isAuthor: href.includes('main_tag=2'),
          isBookmark: href.includes('login-modal') || href.includes('bookmark'),
        });
      }

      // Album links located after the category bar.
      const albumEntries = [];
      linkNodes.forEach((ln, index) => {
        if (ln.albumId !== null && index > categoryBarEnd) {
          albumEntries.push({ index, id: ln.albumId, node: ln.node, text: ln.text });
        }
      });

      const results = [];
      const seen = new Set();
      for (let ei = 0; ei < albumEntries.length; ei++) {
        const entry = albumEntries[ei];
        if (seen.has(entry.id)) continue;
        seen.add(entry.id);

        // The album's range ends at the next link to a DIFFERENT album, so a
        // card that links to the same album twice (cover + title) keeps the
        // tags that follow its second link.
        let next = ei + 1;
        while (next < albumEntries.length && albumEntries[next].id === entry.id) next++;
        const endIdx = next < albumEntries.length ? albumEntries[next].index : linkNodes.length;

        let title = entry.text;
        if (!title) {
          const img = entry.node.querySelector('img');
          if (img) title = img.getAttribute('alt') || img.getAttribute('title') || '';
        }

        const tags = new Set();
        const authors = new Set();
        let views = 0;
        let likes = 0;

        for (let j = entry.index + 1; j < endIdx; j++) {
          const ln = linkNodes[j];
          if (ln.albumId !== null) {
            // Only repeated links to this same album can occur inside the range.
            if (!title) title = ln.text;
            continue;
          }
          if (ln.searchTag) {
            if (!ln.isAuthor) {
              tags.add(ln.searchTag);
            } else if (ln.searchTag !== 'N/A') {
              authors.add(ln.searchTag);
            }
            continue;
          }
          if (ln.isBookmark) continue;
          // Purely numeric link text: the first one is the view counter, the
          // second one the like counter.
          const countMatch = ln.text.match(/^([\d,]+[KMB]?)$/);
          if (!countMatch) continue;
          if (views === 0) views = parseViewCount(countMatch[1]);
          else if (likes === 0) likes = parseViewCount(countMatch[1]);
        }

        if (likes === 0) likes = views;

        // Type tags come from the text between this album and the next one.
        const boundary = endIdx < linkNodes.length ? linkNodes[endIdx].node : null;
        const rangeText = Parser._textBetween(doc, entry.node, boundary);

        results.push({
          id: entry.id,
          title: title.substring(0, 200),
          tags: normalizeTags([...tags]),
          authors: [...authors],
          typeTags: normalizeTags(Parser._matchTypeTags(rangeText, typePatterns)),
          views,
          likes,
        });
      }
      return results;
    },

    /**
     * Parse an album detail page for full metadata + related comics.
     * Uses span[itemprop] attributes to separate tags from authors.
     *
     * @param {string} html - Raw HTML of the detail page.
     * @returns {{tags: string[], authors: string[], typeTags: string[],
     *   related: string[], desc: string, title: string}} `related` holds the
     *   distinct album ids of every album link found in the page.
     */
    parseDetail(html) {
      const doc = new DOMParser().parseFromString(html, 'text/html');
      const result = { tags: [], authors: [], typeTags: [], related: [], desc: '', title: '' };

      const h1 = doc.querySelector('h1');
      if (h1) result.title = h1.textContent.trim();

      // Description: text of the parent of the first <h2> starting with the
      // "敘述：" label, with the label removed, capped at 500 characters.
      for (const h2 of doc.querySelectorAll('h2')) {
        if (h2.textContent.trim().startsWith('敘述：')) {
          result.desc = (h2.parentElement?.textContent || '').replace('敘述：', '').trim().substring(0, 500);
          break;
        }
      }

      const tagSet = new Set();
      for (const span of doc.querySelectorAll('span[itemprop="genre"][data-type="tags"]')) {
        for (const link of span.querySelectorAll('a')) {
          const tagName = Parser._searchQueryOf(link.getAttribute('href') || '');
          if (tagName !== null) tagSet.add(tagName);
        }
      }

      // Author links sometimes carry generic tags; those, one-character
      // values and values containing a space are rejected.
      const nonAuthorWords = new Set(['N/A', '濫交', '滥交', 'NTR', '完結', '完', '連載', '中文', '漢化', '全彩', '無修正', '巨乳', '蘿莉', '触手', '純愛', '劇情', '短篇', '長篇', '單本', '同人', '韓漫', '3D', 'CG']);
      const authorSet = new Set();
      for (const span of doc.querySelectorAll('span[itemprop="author"][data-type="author"]')) {
        for (const link of span.querySelectorAll('a[href*="search_query"]')) {
          const authorName = Parser._searchQueryOf(link.getAttribute('href') || '');
          if (authorName === null) continue;
          if (authorName.length < 2 || authorName.includes(' ')) continue;
          if (!nonAuthorWords.has(authorName)) authorSet.add(authorName);
        }
      }
      result.authors = [...authorSet];

      // Type labels are searched only in the first 5000 characters of the body text.
      const typePatterns = ['單本', '同人', '韓漫', '短篇', '其他類', '漢化', '3D', '青年漫', '長篇'];
      const mainText = (doc.body?.textContent || '').substring(0, 5000);

      const relatedIds = new Set();
      for (const link of doc.querySelectorAll('a[href*="/album/"]')) {
        const match = (link.getAttribute('href') || '').match(/\/album\/(\d+)/);
        if (match) relatedIds.add(match[1]);
      }
      result.related = [...relatedIds];

      result.tags = normalizeTags([...tagSet]);
      result.typeTags = normalizeTags(Parser._matchTypeTags(mainText, typePatterns));
      return result;
    },

    /**
     * Extract pagination info.
     *
     * The page count is the largest of: 1, the highest `page=` query value
     * among the links, and the total result count (when the page prints one)
     * divided by 80 and rounded up.
     *
     * @param {string} html - Raw HTML of a listing/search page.
     * @returns {{totalPages: number}}
     */
    parsePagination(html) {
      // Divisor applied to the printed result total — presumably the number
      // of albums per result page; confirm against the site.
      const itemsPerPage = 80;
      const doc = new DOMParser().parseFromString(html, 'text/html');

      let maxPage = 1;
      for (const link of doc.querySelectorAll('a[href*="page="]')) {
        // Require a parameter boundary so that e.g. "per_page=80" is not
        // mistaken for a page number.
        const match = (link.getAttribute('href') || '').match(/(?:^|[?&])page=(\d+)/);
        if (!match) continue;
        maxPage = Math.max(maxPage, Number.parseInt(match[1], 10));
      }

      const bodyText = doc.body?.textContent || '';
      const totalMatch = bodyText.match(/(\d+[\d,]*)\s*搜[索尋]結果/);
      if (totalMatch) {
        const totalItems = Number.parseInt(totalMatch[1].replace(/,/g, ''), 10);
        maxPage = Math.max(maxPage, Math.ceil(totalItems / itemsPerPage));
      }
      return { totalPages: maxPage };
    },
  };