Red 40

e6* downloader without a memory leak

// ==UserScript==
// @name            Red 40
// @description     e6* downloader without a memory leak
// @author          1fz54ARh0m8g0KUYzqui
// @license         Unlicense
// @version         3.0
// @match           https://e621.net/*
// @match           https://e6ai.net/*
// @match           https://e926.net/*
// @grant           GM.getValue
// @grant           GM.setValue
// @grant           GM_download
// @grant           GM_log
// @run-at          document-end
// @namespace       https://greasyfork.org/users/1518908
// ==/UserScript==
// Verified working on e621ng 25.09.17

// Configuration has to come from asynchronous storage, so the object below
// starts with defaults and is filled in by the async IIFE further down.
// There is a race condition between script start and that IIFE finishing.
const configuration = {
  maximumAttempts: 1,
  clientName: "",
  rememberHashes: false,
  userAgentCompliant: false,

  getHashes: async () => {
    try {
      // GM.getValue returns a promise; the stored value is a JSON string
      const retrieved = JSON.parse(await GM.getValue("hashes", "[]"));
      if (!(retrieved instanceof Array)) {
        return [];
      }

      return retrieved;
    } catch {
      return [];
    }
  },

  setHashes: async (hashArray) => {
    if (!(hashArray instanceof Array)) {
      throw new TypeError("Array required");
    }

    await GM.setValue("hashes", JSON.stringify(hashArray));
  }
};
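
// Stored format: the "hashes" value holds a JSON-encoded array of md5
// strings, e.g. '["d41d8cd98f00b204e9800998ecf8427e", ...]' (illustrative).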

// Page data
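// `| 0` coerces the string dataset values (or undefined) to integers,
// e.g. ("7" | 0) === 7 and (undefined | 0) === 0.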
const currentPage = document.querySelector("nav.pagination.numbered")?.dataset.current | 0;
const finalPage = (document.querySelector("nav.pagination.numbered")?.dataset.total ?? currentPage) | 0;
const hasPosts = !!(document.querySelector("section.posts-container"));

// Locking system modified from
// https://medium.com/@chris_marois/asynchronous-locks-in-modern-javascript-8142c877baf
const metadata = {
  unlock: () => {},
  // Must be a shorthand method, not an arrow function, so `this` is metadata
  lock() {
    this.promise = new Promise(resolve => this.unlock = resolve);
  },
  promise: Promise.resolve(),

  data: []
};
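
// Usage sketch of the lock (a minimal illustration; _getMetadata below
// follows this pattern, though without the try/finally):
//
//   await metadata.promise; // wait for the current holder, if any
//   metadata.lock();        // become the holder
//   try { /* fill metadata.data */ } finally { metadata.unlock(); }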

// Validate heavily, since the user can store invalid data
(async function() {
  const maximumAttempts = Math.max((await GM.getValue("maximumAttempts", 3)) | 0, 1);
  const clientName = String(await GM.getValue("clientName", "e129"));
  const rememberHashes = !!(await GM.getValue("rememberHashes", false));
  const userAgentCompliant = !!(await GM.getValue("userAgentCompliant", true));

  configuration.maximumAttempts = maximumAttempts;
  configuration.clientName = clientName;
  configuration.rememberHashes = rememberHashes;
  configuration.userAgentCompliant = (userAgentCompliant && clientName.length !== 0);
})();
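
// A possible fix for the race noted above (not wired in): keep the IIFE's
// promise, e.g.
//   const configurationReady = (async () => { /* the loads above */ })();
// and `await configurationReady` before reading configuration elsewhere.
// configurationReady is a hypothetical name, shown for illustration only.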

// A reusable retry helper proved slow and fragile, and with only two
// retryable functions it is not worth the abstraction.
// FireMonkey has issues with this approach, which is unfortunate.
async function _download(url, currentAttempt) {
  url = String(url);
  currentAttempt = Math.max(currentAttempt | 0, 1);

  // Guess the file name
  let fileName = new URL(url).pathname.split("/").pop();

  // If the file name is invalid, just go with no extension
  if (!/^[0-9A-Za-z.]+$/.test(fileName)) {
    fileName = `download@${Date.now()}`;
  }

  GM_log(`Downloading ${fileName}.`);

  const downloadPromise = new Promise((resolve, reject) => {
    GM_download({
      url: url,
      name: fileName,
      saveAs: false,
      conflictAction: "uniquify",
      onerror: (error) => reject(error),
      onload: (response) => resolve(response)
    });
  });

  try {
    await downloadPromise;
    GM_log(`Downloaded ${fileName}.`);
  } catch {
    GM_log(`Failed to download ${fileName}. (Attempt ${currentAttempt}/${configuration.maximumAttempts})`);
    if (currentAttempt >= configuration.maximumAttempts) {
      return;
    }

    // Exponential backoff
    await new Promise(resolve => setTimeout(resolve, 1000 * Math.pow(2, currentAttempt)));
    await _download(url, currentAttempt + 1);
  }
}
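
// Backoff schedule example: with maximumAttempts = 3, a failing URL waits
// 1000 * 2^1 = 2000 ms after attempt 1 and 1000 * 2^2 = 4000 ms after
// attempt 2; a third failure logs and gives up without rethrowing.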

// A wrapper for _download
async function _downloadList(urlList) {
  if (!(urlList instanceof Array)) {
    throw new TypeError("Array required");
  }

  for (const url of urlList) {
    await _download(url, 1);
  }
}

async function _fetch(url, currentAttempt) {
  url = String(url);
  currentAttempt = Math.max(currentAttempt | 0, 1);

  let response = await fetch(url);

  if (!response.ok) {
    GM_log(`Failed to fetch ${url} (Attempt ${currentAttempt}/${configuration.maximumAttempts})`);
    if (currentAttempt >= configuration.maximumAttempts) {
      return undefined;
    }

    // Drop the reference so the failed response can be garbage collected
    response = undefined;

    // Exponential backoff
    await new Promise(resolve => setTimeout(resolve, 1000 * Math.pow(2, currentAttempt)));
    return await _fetch(url, currentAttempt+1);
  }

  response = await response.text();

  // Parse the fetched HTML into a detached Document
  return new DOMParser().parseFromString(response, "text/html");
}
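
// The Document _fetch resolves with is detached from the live page but can
// be queried with the same DOM APIs, e.g. (someURL is illustrative):
//   (await _fetch(someURL, 1))?.querySelectorAll("article")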

async function _getMetadata() {
  if (!hasPosts) {
    return;
  }

  await metadata.promise;

  if (metadata.data.length !== 0) {
    return;
  }

  metadata.lock();

  const pageCount = (finalPage + 1) - currentPage;

  if (pageCount === 1) {
    GM_log("Fetching 1 page.");
  } else {
    GM_log(`Fetching ${pageCount} pages.`);
  }

  let pagesSkipped = 0;

  // The other parameters are unknown, so only replace the page numbers
  // A URL is needed, so provide one if there is only one page
  const baseURL = new URL(document.querySelector("a.page.last")?.href ?? "https://example.com");
  const baseURLParameters = new URLSearchParams(baseURL.search);

  if (configuration.userAgentCompliant) {
    baseURLParameters.set("_client", String(configuration.clientName));
  }
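
  // Example with a hypothetical URL: if the last-page link is
  //   https://e621.net/posts?tags=rating%3Asafe&page=9
  // the loop below rewrites only the page parameter, yielding
  //   https://e621.net/posts?tags=rating%3Asafe&page=2&_client=e129
  // on iteration 2 (the _client parameter only when userAgentCompliant,
  // with the default clientName "e129" shown here).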

  // Keep only one of following pages' DOMs active
  for (let i = currentPage; i <= finalPage; i++) {
    let parser;

    if (i === currentPage) {
      parser = document;
    } else {
      // This should never happen
      if (baseURL.hostname === "example.com") {
        pagesSkipped++;
        continue;
      }

      baseURLParameters.set("page", i);
      baseURL.search = baseURLParameters.toString();

      // _fetch retries internally and resolves with a Document, or undefined on failure
      parser = await _fetch(baseURL.toString(), 1);

      if (parser === undefined) {
        pagesSkipped++;
        continue;
      }
    }

    const pagePosts = parser.querySelector("section.posts-container")?.querySelectorAll("article");

    // There is a chance that a human verification page is returned; nothing can be done
    // Optional chaining yields undefined, not null, so use a loose equality check
    if (pagePosts == null) {
      pagesSkipped++;
      continue;
    }

    for (const post of pagePosts) {
      // Tags are space-separated in the dataset, so split them here
      const tags = post.dataset.tags.split(' ');

      const postData = {
        id: post.dataset.id | 0,
        extension: post.dataset.fileExt,
        md5: post.dataset.md5,
        tags: tags.map((e) => e.replaceAll('_', ' ')),
        url: post.dataset.fileUrl
      };

      // Race condition possible, oh well
      metadata.data.push(postData);
    }
  }

  metadata.unlock();

  if (pagesSkipped === 1) {
    GM_log("1 page skipped.");
  } else if (pagesSkipped !== 0) {
    GM_log(`${pagesSkipped} pages skipped.`);
  }
}
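
// Shape of one collected entry in metadata.data (illustrative values):
//   { id: 123456, extension: "png", md5: "d41d8cd98f00b204e9800998ecf8427e",
//     tags: ["tag one", "tag2"], url: "https://static1.e621.net/..." }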

const metadataExportButton = document.createElement("a");
metadataExportButton.onclick = async () => {
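  // `disabled` is not a native <a> property; it acts purely as a
  // re-entrancy flag here (the same pattern guards postDownloadButton below)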
  if (metadataExportButton.disabled) {
    return;
  }

  metadataExportButton.disabled = true;

  let jsonData;

  if (hasPosts) {
    await _getMetadata();
    jsonData = JSON.stringify(metadata.data);
  } else {
    // This data could be cached instead of computed, but what are the benefits?
    // There must be at least 1 tag on a post
    const tagList = document.querySelector("section#tag-list").querySelectorAll("span.tag-list-name");
    const tagObject = {
      tags: []
    };

    tagList.forEach((e) => tagObject.tags.push(e.innerText.trim()));
    jsonData = JSON.stringify(tagObject);
  }

  const blob = new Blob([jsonData], {
    type: "application/json"
  });

  const a = document.createElement("a");
  const objectURL = URL.createObjectURL(blob);

  a.setAttribute('download', `download@${Date.now()}.json`);
  a.setAttribute('href', objectURL);
  a.click();

  // Release the object URL so the blob itself cannot leak
  URL.revokeObjectURL(objectURL);

  metadataExportButton.disabled = false;
};

const postDownloadButton = document.createElement("a");
postDownloadButton.onclick = async () => {
  if (postDownloadButton.disabled) {
    return;
  }

  postDownloadButton.disabled = true;

  await _getMetadata();
  const previousHashes = await configuration.getHashes();
  const futureHashes = [];

  // This binding is reassigned below when the list is split in two
  let urlList1 = [];

  for (const e of metadata.data) {
    if (configuration.rememberHashes && previousHashes.includes(e.md5)) {
      continue;
    }

    urlList1.push(e.url);
    futureHashes.push(e.md5);
  }

  const fileCount = urlList1.length;

  // Split by index parity so two download queues can run in parallel
  const urlList2 = urlList1.filter((e, i) => !!(i & 1));
  urlList1 = urlList1.filter((e, i) => !(i & 1));
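  // Example: ["a", "b", "c", "d"] splits into urlList1 = ["a", "c"] and
  // urlList2 = ["b", "d"]; the two sequential queues then run concurrently.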

  GM_log(`Downloading ${fileCount} files.`);
  postDownloadButton.innerText = `Downloading ${fileCount} files...`;

  await Promise.allSettled([_downloadList(urlList1), _downloadList(urlList2)]);
  GM_log(`Finished downloading ${fileCount} files.`);

  if (configuration.rememberHashes) {
    await configuration.setHashes(previousHashes.concat(futureHashes));
    GM_log("Saved hashes of the download list.");
  }

  postDownloadButton.disabled = false;
  postDownloadButton.innerText = "Download all posts";
};

const secondaryBar = document.querySelector("menu.nav-secondary.desktop");

if (hasPosts || document.querySelector("section#tag-list")) {
  const metadataExportItem = document.createElement("li");
  metadataExportItem.id = "subnav-metadata-export";
  metadataExportButton.id = "subnav-metadata-export-link";
  metadataExportButton.innerText = "Export metadata";
  metadataExportItem.appendChild(metadataExportButton);
  secondaryBar?.appendChild(metadataExportItem);
}

if (hasPosts) {
  const postDownloadItem = document.createElement("li");
  postDownloadItem.id = "subnav-post-download";
  postDownloadButton.id = "subnav-post-download-link";
  postDownloadButton.innerText = "Download all posts";
  postDownloadItem.appendChild(postDownloadButton);
  secondaryBar?.appendChild(postDownloadItem);
}