/**
 * Citation Watchlist
 * https://en.wikipedia.org/wiki/WP:WATCHCITE
 */

/**
 * ==========================================================================
 * Domain List Configuration
 * ==========================================================================
 *
 * Citation Watchlist requires the following wiki pages to function:
 *
 * 1. Public Suffix List
 *    - A local copy of the public suffix list, used for domain parsing.
 *    - Copy the contents of:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Public_Suffix_List
 *      to a page on your own wiki.
 *    - Update the `publicSuffixList` variable below to reflect your page title.
 *
 * 2. List of Lists
 *    - A page linking to one or more domain list pages.
 *    - Format as a bullet list: "* [[Page Title]]" (space after asterisk).
 *    - Reference formatting example:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Lists
 *    - Update the `listOfLists` variable below accordingly.
 *
 * 3. Domain List Pages
 *    - One or more pages listing suspicious or noteworthy domains.
 *    - Each page must contain section headers that match the `indicators` config
 *      below (e.g., "==Warn==", "==Caution==").
 *    - Under each section, list domains in the format: "* example.com"
 *    - Do not use link formatting—just plain text.
 */
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";

/**
 * ==========================================================================
 * Indicator Configuration
 * ==========================================================================
 *
 * Defines metadata for domain indicators used in the watchlist UI.
 * Each indicator is associated with a level of urgency and a unique symbol.
 *
 * Fields:
 * - msg:     Display label for the level (e.g., "Warning", "Caution").
 * - emoji:   Unicode character for the visual indicator (escaped as `\uXXXX`).
 * - section: Must exactly match the section headers in the domain list pages.
 * - priority: Higher values override lower ones for conflicting domain matches.
 *             Priority scale: 1 (lowest) to N (highest).
 * - list:    A Set holding this level's domains. Initialize as "new Set()";
 *            it is populated at runtime from the domain list pages.
 *
 * If a domain appears in multiple lists, the one with the highest priority
 * takes precedence.
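 *
 * For example, if a (hypothetical) domain unreliable.example is listed under
 * both ==Warn== and ==Caution==, the warning indicator (priority 3) outranks
 * caution (priority 2), so only the Warning emoji is shown for that domain.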
 */
const indicators = {
  warning: {
    msg: "Warning",
    emoji: '\u2757',
    section: "==Warn==",
    priority: 3,
    list: new Set()
  },
  caution: {
    msg: "Caution",
    emoji: '\u270B',
    section: "==Caution==",
    priority: 2,
    list: new Set()
  },
  inspect: {
    msg: "Inspect",
    emoji: '\uD83D\uDD0E',
    section: "==Inspect==",
    priority: 1,
    list: new Set()
  }
};

/**
 * Citation Watchlist
 * 
 * Highlights potentially questionable citations added in Wikipedia revisions,
 * using predefined domain lists and a public suffix list to analyze diffs.
 * 
 * Documentation: https://en.wikipedia.org/wiki/WP:WATCHCITE
 * 
 * Author: James Hare under contract with Hacks/Hackers
 * License: GNU General Public License v3.0 (GPL-3.0)
 * 
 * @version 1.12
 * @since 2025-04-23
 */

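// "Namespace:" prefixes (with underscores) for every namespace except main (0)
// and Draft (118); isNotArticle() uses these to skip non-article pages.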
let publicSuffixSet = new Set();
const namespaces = Object.entries(mw.config.get('wgFormattedNamespaces'))
  .filter(([num, name]) => num !== '0' && num !== '118')
  .map(([_, name]) => name.replace(/ /g, '_') + ':');

/**
 * Main entry point for Citation Watchlist.
 * Determines if the current page should be analyzed, fetches domain and suffix
 * lists, processes each change/revision in the recent changes or history page,
 * and triggers analysis to highlight questionable domains.
 */
async function analyzeView() {
  purgeExpiredCache();
  const ns = mw.config.get('wgNamespaceNumber');
  if (![-1, 0, 118].includes(ns)) {
    return;
  }
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  console.log("Welcome to Citation Watchlist");
  const listPages = await fetchDomainListPages(listOfLists);
  if (listPages) {
    const lists = await fetchAndOrganizeDomainLists(listPages);
    if (lists) {
      for (const type in indicators) {
        for (const domain of lists[type].list) {
          indicators[type].list.add(domain);
        }
      }
    }
  }
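  // Each row in a watchlist, history, or contributions view carries one of
  // four link types; whichever is present determines the revisions to compare.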
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  let noLinks = true;
  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector(
      'a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');
    let revision = null;
    let urlParams = '';
    if (diffLink) {
      noLinks = false;
      const diffUrl = new URL(diffLink.href);
      urlParams = new URLSearchParams(diffUrl.search);
      const pageTitle = urlParams.get('title');
      if (isNotArticle(pageTitle)) continue;
      revision = {
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      };
      if (revision.oldrevision == 'prev') { // This happens on user contributions pages
        const previousRevisionMap = await fetchPreviousRevisionIds(
          [revision.newrevision]);
        revision.oldrevision = revision.newrevision;
        revision.newrevision = previousRevisionMap[revision.newrevision];
      }
    } else if (histLink) {
      noLinks = false;
      const histUrl = new URL(histLink.href);
      urlParams = new URLSearchParams(histUrl.search);
      const pageTitle = urlParams.get('title');
      if (isNotArticle(pageTitle)) continue;
      const firstID = await fetchFirstRevisionId(pageTitle);
      if (!firstID) continue;
      revision = {
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      };
    } else if (prevLink) {
      noLinks = false;
      const prevUrl = new URL(prevLink.href);
      urlParams = new URLSearchParams(prevUrl.search);
      const previousRevisionMap = await fetchPreviousRevisionIds(
        [urlParams.get('oldid')]);
      revision = {
        oldrevision: urlParams.get('oldid'),
        newrevision: previousRevisionMap[urlParams.get('oldid')],
        element: prevLink.parentNode.parentNode
      };
    } else if (curLink) {
      noLinks = false;
      const curUrl = new URL(curLink.href);
      urlParams = new URLSearchParams(curUrl.search);
      revision = {
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      };
    }
    if (revision) {
      await analyzeRevision(revision);
    }
  }
  // If no links were found, fall back to analyzing the page's first revision
  if (noLinks && entriesContainers.length > 0) {
    const pageTitle = mw.config.get('wgTitle');
    const firstID = await fetchFirstRevisionId(pageTitle);
    if (firstID) {
      await analyzeRevision({
        oldrevision: firstID,
        element: entriesContainers[0]
      });
    }
  }
}

/**
 * Analyzes a revision (or a pair of revisions) for newly added URLs,
 * compares them against domain watchlists, and highlights matches.
 *
 * @param {Object} revision - Object containing oldrevision, optional newrevision, and DOM element.
 */
async function analyzeRevision(revision) {
  const lookup = [revision.oldrevision];
  if (revision.newrevision) {
    lookup.push(revision.newrevision);
  }
  const wikiDomain = location.hostname;
  const cacheKey = `revisionDiff:${wikiDomain}:${revision.oldrevision}:${revision.newrevision || 'null'}`;
  const oneMonth = 30 * 24 * 60 * 60 * 1000;
  let addedURLs = [];
  let cacheHit = false;

  // Try reading from cache
  const cached = localStorage.getItem(cacheKey);
  if (cached) {
    try {
      const parsed = JSON.parse(cached);
      const age = Date.now() - parsed.timestamp;
      if (age < oneMonth && Array.isArray(parsed.addedURLs)) {
        console.log(`Cache hit for revision ${cacheKey}`);
        addedURLs = parsed.addedURLs;
        cacheHit = true;
      }
    } catch (e) {
      console.warn('Cache parse error, refetching:', e);
    }
  }

  // If not cached, fetch and process (a cached empty result still counts as a hit)
  if (!cacheHit) {
    const wikitext = await fetchRevisionContent(lookup);
    const fromURLs = new Set(extractAddedURLs(wikitext.oldrevision) || []);
    const toURLs = new Set(extractAddedURLs(wikitext.newrevision) || []);

    if (revision.newrevision) {
      addedURLs = [...toURLs].filter(url => !fromURLs.has(url));
    } else {
      addedURLs = Array.from(fromURLs);
    }
    try {
      localStorage.setItem(cacheKey, JSON.stringify({
        timestamp: Date.now(),
        addedURLs
      }));
    } catch (e) {
      console.warn('Failed to store cache:', e);
    }
  }
  console.log(`Revision element: ${revision.element.innerHTML}
  Added URLs: ${addedURLs.join(' ')}
  `);

  // Match domains to indicator types
  const matchedDomains = Object.keys(indicators).reduce((acc, key) => {
    acc[key] = [];
    return acc;
  }, {});
  for (const url of addedURLs) {
    const hostname = new URL(url).hostname;
    const domain = getRootDomain(hostname, publicSuffixSet);
    let highestPriorityType = null;
    for (const type in indicators) {
      if (indicators[type].list.has(domain)) {
        if (
          highestPriorityType === null ||
          indicators[type].priority > indicators[highestPriorityType].priority
        ) {
          highestPriorityType = type;
        }
      }
    }
    if (
      highestPriorityType !== null &&
      !matchedDomains[highestPriorityType].includes(domain)
    ) {
      matchedDomains[highestPriorityType].push(domain);
      for (const type in indicators) {
        if (
          indicators[type].priority < indicators[highestPriorityType].priority
        ) {
          matchedDomains[type] = matchedDomains[type].filter(d => d !== domain);
        }
      }
    }
  }

  // Prepend emoji indicators
  for (const type in indicators) {
    if (matchedDomains[type].length > 0) {
      prependEmojiWithTooltip(revision.element, type, matchedDomains[type]);
    }
  }
}

/**
 * Prepends an emoji and tooltip to a revision list entry DOM element if any
 * domains matched a warning list.
 *
 * @param {HTMLElement} element - The container element to prepend the emoji to.
 * @param {string} type - The type of indicator ('warning', 'caution', 'inspect').
 * @param {string[]} domains - The list of matched domains for the indicator.
 */
function prependEmojiWithTooltip(element, type, domains) {
  const indicator = indicators[type];
  if (!indicator || element.getAttribute(`data-processed-${type}`) === 'true') {
    return;
  }
  const emojiSpan = document.createElement('span');
  emojiSpan.textContent = indicator.emoji + " ";
  emojiSpan.title = `${indicator.msg}: ${domains.join(", ")}`;
  element.parentNode.insertBefore(emojiSpan, element);
  element.setAttribute(`data-processed-${type}`, 'true');
}

/**
 * Extracts the first page object from MediaWiki API query response.
 *
 * @param {Object} data - MediaWiki API response.
 * @returns {Object|null} The first page object or null if unavailable.
 */
async function getFirstPage(data) {
  if (!data || !data.query || !data.query.pages) return null;
  const pages = data.query.pages;
  return Object.values(pages)[0]; // Return the first page
}

/**
 * Retrieves the first revision from a page object.
 *
 * @param {Object} page - Page object containing revisions.
 * @returns {Object|null} First revision object or null.
 */
async function getFirstRevision(page) {
  if (page.revisions && page.revisions.length > 0) {
    return page.revisions[0];
  }
  return null;
}

/**
 * Fetches wikitext content for one or two revisions by ID.
 *
 * @param {string[]} revIds - Array of revision IDs.
 * @returns {Object} Object with `oldrevision` and optionally `newrevision` as wikitext strings.
 */
async function fetchRevisionContent(revIds) {
  const data = await fetchRevisionData({
    revids: revIds,
    rvprop: ['content'],
    rvslots: ['main']
  });
  const page = await getFirstPage(data);
  const wikitext = { oldrevision: null, newrevision: null };
  if (page && page.revisions && page.revisions.length > 0) {
    wikitext.oldrevision = page.revisions[0].slots.main['*'] || null;
    if (page.revisions.length > 1) {
      wikitext.newrevision = page.revisions[1].slots.main['*'] || null;
    }
  }
  return wikitext;
}

/**
 * Fetches the parent revision IDs for a given list of revision IDs.
 *
 * @param {string[]} revisionIds - Array of revision IDs.
 * @returns {Object} Map of revision ID to its parent ID.
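 *
 * @example
 * // Illustrative: returns { 123457: 123456 } if 123456 is the parent of 123457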
 */
async function fetchPreviousRevisionIds(revisionIds) {
  const data = await fetchRevisionData({
    revids: revisionIds,
    rvprop: ['ids']
  });
  const page = await getFirstPage(data);
  if (!page || !page.revisions) return {};
  const revisionMap = {};
  for (const revision of page.revisions) {
    revisionMap[revision.revid] = revision.parentid;
  }
  return revisionMap;
}

/**
 * Fetches the ID of the first revision of a page.
 *
 * @param {string} pageTitle - The page title to look up.
 * @returns {number|null} Revision ID or null.
 */
async function fetchFirstRevisionId(pageTitle) {
  const data = await fetchRevisionData({
    titles: [pageTitle],
    rvlimit: 1,
    rvdir: 'newer',
    rvprop: ['ids'],
  });
  const page = await getFirstPage(data);
  if (!page) return null;
  const revision = await getFirstRevision(page);
  return revision ? revision.revid : null;
}

/**
 * Fetches the list of subpages from the list of lists, parses wikilinks, caches
 * the result, and returns list of subpage titles.
 *
 * @param {string} pageName - Title of the list-of-lists page.
 * @returns {Promise<string[]>} List of subpage titles.
 */
async function fetchDomainListPages(pageName) {
  const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
  const cacheExpiration = 4 * 60 * 60 * 1000;
  const now = Date.now();
  const cachedData = localStorage.getItem(cacheKey);
  const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);
  if (cachedData && cachedTimestamp && (now - parseInt(cachedTimestamp, 10)) <
    cacheExpiration) {
    console.log("Loaded list of lists from cache");
    return JSON.parse(cachedData);
  }
  const data = await fetchRevisionData({
    titles: [pageName],
    rvprop: ['content'],
    rvslots: ['*']
  });
  const page = await getFirstPage(data);
  if (!page || !page.revisions) return [];
  const content = page.revisions[0].slots.main['*'];
  const pageTitles = [];
  const lines = content.split('\n');
  for (let line of lines) {
    if (line.startsWith('* [[')) {
      const match = line.match(
        /\[\[([^\]]+)\]\]/); // Matches the first instance of [[Page Title]]
      if (match) {
        pageTitles.push(match[1]);
      }
    }
  }
  localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
  localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
  console.log("Loaded from API and stored in cache");
  return pageTitles;
}

/**
 * Loads domain lists from a set of pages, categorizes them by indicator section
 * headers, and populates the corresponding `Set` in the global `indicators` object.
 *
 * @param {string[]} pageNames - List of page titles to fetch.
 * @returns {Object} Updated indicators object with domain sets.
 */
async function fetchAndOrganizeDomainLists(pageNames) {
  const cacheTTL = 6 * 60 * 60 * 1000;
  const now = Date.now();
  const cachedData = {};
  const pagesToFetch = [];
  for (const title of pageNames) {
    const cacheKey = `domainList:${location.hostname}:${title}`;
    const cached = localStorage.getItem(cacheKey);
    if (cached) {
      try {
        const parsed = JSON.parse(cached);
        if (now - parsed.timestamp < cacheTTL && parsed.content) {
          console.log(`Using cached content for page: ${title}`);
          cachedData[title] = parsed.content;
          continue;
        } else {
          console.log(`Cache expired for page: ${title}`);
        }
      } catch (e) {
        console.warn(`Cache error for ${title}:`, e);
      }
    }
    console.log(`Will fetch page: ${title}`);
    pagesToFetch.push(title);
  }
  let fetchedPages = {};
  if (pagesToFetch.length > 0) {
    const apiData = await fetchRevisionData({
      titles: pagesToFetch,
      rvprop: ['content'],
      rvslots: ['*'],
    });
    const pages = apiData && apiData.query ? apiData.query.pages : {};
    for (const pageId in pages) {
      const page = pages[pageId];
      const title = page.title;
      if (!page.revisions) continue; // skip missing or empty pages
      const content = page.revisions[0].slots.main['*'];
      fetchedPages[title] = content;
      const cacheKey = `domainList:${location.hostname}:${title}`;
      try {
        localStorage.setItem(cacheKey, JSON.stringify({
          timestamp: now,
          content,
        }));
        console.log(`Cached content for page: ${title}`);
      } catch (e) {
        console.warn(`Failed to cache ${title}:`, e);
      }
    }
  }
  const allContent = { ...cachedData, ...fetchedPages };
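  // Parse each list page: a section header (e.g. "==Warn==") selects the
  // active indicator set; subsequent "* domain" bullets are added to it.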
  for (const title in allContent) {
    const content = allContent[title];
    let currentList = null;
    const lines = content.split('\n');
    for (let line of lines) {
      for (const type in indicators) {
        if (line.trim() === indicators[type].section) {
          currentList = indicators[type].list;
          break;
        }
      }
      if (line.startsWith('*') && currentList) {
        const domain = line.substring(1).trim();
        currentList.add(domain);
      }
    }
  }
  return indicators;
}

/**
 * Fetches and caches the public suffix list used to identify top-level domains.
 *
 * @returns {Promise<Set<string>>} Set of public suffixes.
 */
async function fetchPublicSuffixList() {
  const cacheKey = 'publicSuffixListCache';
  const cacheTTL = 24 * 60 * 60 * 1000;
  const cached = localStorage.getItem(cacheKey);
  if (cached) {
    try {
      const parsed = JSON.parse(cached);
      const age = Date.now() - parsed.timestamp;
      if (age < cacheTTL && parsed.content) {
        console.log('Using cached public suffix list');
        return new Set(parsed.content.split('\n').filter(line =>
          line.trim() && !line.trim().startsWith('//')
        ).map(line => line.trim()));
      }
    } catch (e) {
      console.warn('Error parsing cache, refetching:', e);
    }
  }
  const pslUrl = mw.config.get('wgArticlePath').replace('$1', publicSuffixList) 
    + '?action=raw';
  console.log(`Raw page text request: ${pslUrl}`);
  const content = await safeFetch(fetch, pslUrl).then(response => response ?
    response.text() : null);
  if (!content) return new Set();
  try {
    localStorage.setItem(cacheKey, JSON.stringify({
      timestamp: Date.now(),
      content
    }));
  } catch (e) {
    console.warn('Failed to write to cache:', e);
  }
  const suffixSet = new Set();
  const lines = content.split('\n');
  for (const line of lines) {
    if (line.trim() && !line.trim().startsWith('//')) {
      suffixSet.add(line.trim());
    }
  }
  return suffixSet;
}

/**
 * Makes a MediaWiki API call to fetch revision metadata or content.
 *
 * @param {Object} data - Options for the API call, such as `revids`, `titles`, `rvprop`, etc.
 * @returns {Promise<Object>} MediaWiki API query result.
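 *
 * @example
 * // Illustrative call mirroring fetchRevisionContent(); IDs are hypothetical:
 * // const data = await fetchRevisionData({
 * //   revids: ['123456', '123457'], rvprop: ['content'], rvslots: ['main']
 * // });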
 */
async function fetchRevisionData(data) {
  const paramKeys = ['rvprop', 'revids', 'titles', 'rvslots'];
  const params = {
    action: 'query',
    prop: 'revisions',
    format: 'json',
    rvdir: data.rvdir || 'older',
    origin: '*'
  };
  if (data.rvlimit) { params.rvlimit = data.rvlimit; }
  paramKeys.forEach(key => {
    if (data[key]) {
      params[key] = Array.isArray(data[key]) ? data[key].join('|') : data[key];
    }
  });
  const api = new mw.Api();
  return await safeFetch(api.get.bind(api), params);
}

/**
 * Wraps any asynchronous fetch function and logs errors without throwing.
 *
 * @param {Function} fn - The function to execute (usually an API call).
 * @param {...any} args - Arguments to pass to the fetch function.
 * @returns {Promise<any|null>} Result of the fetch or null on failure.
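 *
 * @example
 * // const response = await safeFetch(fetch, url); // resolves to null on error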
 */
async function safeFetch(fn, ...args) {
  try {
    return await fn(...args);
  } catch (error) {
    console.error(`Error during ${fn.name}:`, error);
    return null;
  }
}

/**
 * Extracts all HTTP(S) URLs from a given wikitext string.
 *
 * @param {string} wikitext - Raw wikitext revision content.
 * @returns {string[]} List of valid extracted URLs.
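 *
 * @example
 * // extractAddedURLs('Cite https://example.com/article here')
 * //   => ['https://example.com/article']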
 */
function extractAddedURLs(wikitext) {
  const addedURLs = [];
  const urlRegex = /https?:\/\/[^\s<"]+/g;
  let match;
  while ((match = urlRegex.exec(wikitext)) !== null) {
    try {
      const url = new URL(match[0]);
      addedURLs.push(url.href);
    } catch (error) {
      console.error(`Invalid URL rejected: ${match[0]}`);
    }
  }
  return addedURLs;
}

/**
 * Extracts the top-level domain from a full hostname using a public suffix set.
 *
 * @param {string} hostname - Full hostname (e.g., sub.example.co.uk).
 * @param {Set<string>} publicSuffixSet - Set of known public suffixes.
 * @returns {string} The top-level domain (e.g., example.co.uk).
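 *
 * @example
 * // Assuming 'co.uk' is in the suffix set:
 * // getRootDomain('sub.example.co.uk', publicSuffixSet) => 'example.co.uk'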
 */
function getRootDomain(hostname, publicSuffixSet) {
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');
    if (publicSuffixSet.has(candidate) || publicSuffixSet.has(
        `!${candidate}`)) {
      return domainParts.slice(i - 1).join('.');
    }
  }
  return hostname;
}

/**
 * Determines whether a given page title does *not* belong to the main or draft namespaces.
 *
 * @param {string} pageTitle - The title of the page.
 * @returns {boolean} True if not an article namespace.
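 *
 * @example
 * // isNotArticle('User:Example') => true (assuming a "User:" namespace)
 * // isNotArticle('Example article') => false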
 */
function isNotArticle(pageTitle) {
  if (!pageTitle) return true;
  return namespaces.some(namespace => pageTitle.startsWith(namespace));
}

/**
 * Cleans up expired localStorage cache entries based on known cache key prefixes and TTLs.
 */
function purgeExpiredCache() {
  const now = Date.now();
  const knownCaches = [
    { prefix: 'revisionDiff:', ttl: 30 * 24 * 60 * 60 * 1000 },
    { prefix: 'domainList:', ttl: 6 * 60 * 60 * 1000 },
    { prefix: 'publicSuffixListCache', ttl: 24 * 60 * 60 * 1000 },
    { prefix: 'citationWatchlistFetchDomainListPages_', ttl: 4 * 60 * 60 * 1000 }
  ];
  // Snapshot the keys first: removing items while iterating by index would
  // shift the remaining keys and skip entries.
  const keys = [];
  for (let i = 0; i < localStorage.length; i++) {
    keys.push(localStorage.key(i));
  }
  for (const key of keys) {
    for (const cache of knownCaches) {
      if (key.startsWith(cache.prefix)) {
        try {
          if (key.endsWith('_timestamp')) {
            const baseKey = key.replace(/_timestamp$/, '');
            const timestamp = parseInt(localStorage.getItem(key), 10);
            if (isNaN(timestamp) || now - timestamp > cache.ttl) {
              localStorage.removeItem(key);
              localStorage.removeItem(baseKey);
              console.log(`Purged expired cache: ${baseKey}`);
            }
          } else {
            const value = localStorage.getItem(key);
            const parsed = JSON.parse(value);
            if (parsed && parsed.timestamp && now - parsed.timestamp > cache.ttl) {
              localStorage.removeItem(key);
              console.log(`Purged expired cache: ${key}`);
            }
          }
        } catch (e) {
          console.warn(`Failed to check or purge cache for ${key}:`, e);
        }
        break;
      }
    }
  }
}

analyzeView().then(() => console.log(
  'Citation Watchlist script finished executing'));