/**
 * Citation Watchlist
 * https://en.wikipedia.org/wiki/WP:WATCHCITE
 */

/**
 * ==========================================================================
 * Domain List Configuration
 * ==========================================================================
 *
 * Citation Watchlist requires the following wiki pages to function:
 *
 * 1. Public Suffix List
 *    - A local copy of the public suffix list, used for domain parsing.
 *    - Copy the contents of:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Public_Suffix_List
 *      to a page on your own wiki.
 *    - Update the `publicSuffixList` variable below to reflect your page title.
 *
 * 2. List of Lists
 *    - A page linking to one or more domain list pages.
 *    - Format as a bullet list: "* [[Page Title]]" (space after asterisk).
 *    - Reference formatting example:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Lists
 *    - Update the `listOfLists` variable below accordingly.
 *
 * 3. Domain List Pages
 *    - One or more pages listing suspicious or noteworthy domains.
 *    - Each page must contain section headers that match the `indicators` config
 *      below (e.g., "==Warn==", "==Caution==").
 *    - Under each section, list domains in the format: "* example.com"
 *    - Do not use link formatting—just plain text.
 */
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";

/**
 * ==========================================================================
 * Indicator Configuration
 * ==========================================================================
 *
 * Defines metadata for domain indicators used in the watchlist UI.
 * Each indicator is associated with a level of urgency and a unique symbol.
 *
 * Fields:
 * - msg:     Display label for the level (e.g., "Warning", "Caution").
 * - emoji:   Unicode character for the visual indicator (escaped as `\uXXXX`).
 * - section: Must exactly match the section headers in the domain list pages.
 * - priority: Higher values override lower ones for conflicting domain matches.
 *             Priority scale: 1 (lowest) to N (highest).
 * - list:    A Set holding this level's domains. Initialize as "new Set()";
 *            it is populated at runtime from the domain list pages.
 *
 * If a domain appears in multiple lists, the one with the highest priority
 * takes precedence.
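 *
 * For example, if a (hypothetical) domain unreliable.example is listed under
 * both ==Warn== and ==Caution==, the warning indicator (priority 3) outranks
 * caution (priority 2), so only the Warning emoji is shown for that domain.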
 */
const indicators = {
  warning: {
    msg: "Warning",
    emoji: '\u2757',
    section: "==Warn==",
    priority: 3,
    list: new Set()
  },
  caution: {
    msg: "Caution",
    emoji: '\u270B',
    section: "==Caution==",
    priority: 2,
    list: new Set()
  },
  inspect: {
    msg: "Inspect",
    emoji: '\uD83D\uDD0E',
    section: "==Inspect==",
    priority: 1,
    list: new Set()
  }
};

/**
 * Citation Watchlist
 * 
 * Highlights potentially questionable citations added in Wikipedia revisions,
 * using predefined domain lists and a public suffix list to analyze diffs.
 * 
 * Documentation: https://en.wikipedia.org/wiki/WP:WATCHCITE
 * 
 * Author: James Hare under contract with Hacks/Hackers
 * License: GNU General Public License v3.0 (GPL-3.0)
 * 
 * @version 1.12
 * @since 2025-04-23
 */

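// "Namespace:" prefixes (with underscores) for every namespace except main (0)
// and Draft (118); isNotArticle() uses these to skip non-article pages.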
let publicSuffixSet = new Set();
const namespaces = Object.entries(mw.config.get('wgFormattedNamespaces'))
  .filter(([num, name]) => num !== '0' && num !== '118')
  .map(([_, name]) => name.replace(/ /g, '_') + ':');

/**
 * Main entry point for Citation Watchlist.
 * Determines if the current page should be analyzed, fetches domain and suffix
 * lists, processes each change/revision in the recent changes or history page,
 * and triggers analysis to highlight questionable domains.
 */
async function analyzeView() {
  purgeExpiredCache();
  const ns = mw.config.get('wgNamespaceNumber');
  if (![-1, 0, 118].includes(ns)) {
    return;
  }
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  console.log("Welcome to Citation Watchlist");
  const listPages = await fetchDomainListPages(listOfLists);
  if (listPages) {
    const lists = await fetchAndOrganizeDomainLists(listPages);
    if (lists) {
      for (const type in indicators) {
        for (const domain of lists[type].list) {
          indicators[type].list.add(domain);
        }
      }
    }
  }
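  // Each row in a watchlist, history, or contributions view carries one of
  // four link types; whichever is present determines the revisions to compare.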
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  let noLinks = true;
  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector(
      'a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');
    let revision = null;
    let urlParams = '';
    if (diffLink) {
      noLinks = false;
      const diffUrl = new URL(diffLink.href);
      urlParams = new URLSearchParams(diffUrl.search);
      const pageTitle = urlParams.get('title');
      if (isNotArticle(pageTitle)) continue;
      revision = {
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      };
      if (revision.oldrevision == 'prev') { // This happens on user contributions pages
        const previousRevisionMap = await fetchPreviousRevisionIds(
          [revision.newrevision]);
        revision.oldrevision = revision.newrevision;
        revision.newrevision = previousRevisionMap[revision.newrevision];
      }
    } else if (histLink) {
      noLinks = false;
      const histUrl = new URL(histLink.href);
      urlParams = new URLSearchParams(histUrl.search);
      const pageTitle = urlParams.get('title');
      if (isNotArticle(pageTitle)) continue;
      const firstID = await fetchFirstRevisionId(pageTitle);
      if (!firstID) continue;
      revision = {
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      };
    } else if (prevLink) {
      noLinks = false;
      const prevUrl = new URL(prevLink.href);
      urlParams = new URLSearchParams(prevUrl.search);
      const previousRevisionMap = await fetchPreviousRevisionIds(
        [urlParams.get('oldid')]);
      revision = {
        oldrevision: urlParams.get('oldid'),
        newrevision: previousRevisionMap[urlParams.get('oldid')],
        element: prevLink.parentNode.parentNode
      };
    } else if (curLink) {
      noLinks = false;
      const curUrl = new URL(curLink.href);
      urlParams = new URLSearchParams(curUrl.search);
      revision = {
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      };
    }
    if (revision) {
      await analyzeRevision(revision);
    }
  }
  // If no links were found, fall back to analyzing the page's first revision
  if (noLinks && entriesContainers.length > 0) {
    const pageTitle = mw.config.get('wgTitle');
    const firstID = await fetchFirstRevisionId(pageTitle);
    if (firstID) {
      await analyzeRevision({
        oldrevision: firstID,
        element: entriesContainers[0]
      });
    }
  }
}

/**
 * Analyzes a revision (or a pair of revisions) for newly added URLs,
 * compares them against domain watchlists, and highlights matches.
 *
 * @param {Object} revision - Object containing oldrevision, optional newrevision, and DOM element.
 */
async function analyzeRevision(revision) {
  const lookup = [revision.oldrevision];
  if (revision.newrevision) {
    lookup.push(revision.newrevision);
  }
  const wikiDomain = location.hostname;
  const cacheKey = `revisionDiff:${wikiDomain}:${revision.oldrevision}:${revision.newrevision || 'null'}`;
  const oneMonth = 30 * 24 * 60 * 60 * 1000;
  let addedURLs = [];
  let cacheHit = false;

  // Try reading from cache
  const cached = localStorage.getItem(cacheKey);
  if (cached) {
    try {
      const parsed = JSON.parse(cached);
      const age = Date.now() - parsed.timestamp;
      if (age < oneMonth && Array.isArray(parsed.addedURLs)) {
        console.log(`Cache hit for revision ${cacheKey}`);
        addedURLs = parsed.addedURLs;
        cacheHit = true;
      }
    } catch (e) {
      console.warn('Cache parse error, refetching:', e);
    }
  }

  // If not cached, fetch and process (a cached empty result still counts as a hit)
  if (!cacheHit) {
    const wikitext = await fetchRevisionContent(lookup);
    const fromURLs = new Set(extractAddedURLs(wikitext.oldrevision) || []);
    const toURLs = new Set(extractAddedURLs(wikitext.newrevision) || []);

    if (revision.newrevision) {
      addedURLs = [...toURLs].filter(url => !fromURLs.has(url));
    } else {
      addedURLs = Array.from(fromURLs);
    }
    try {
      localStorage.setItem(cacheKey, JSON.stringify({
        timestamp: Date.now(),
        addedURLs
      }));
    } catch (e) {
      console.warn('Failed to store cache:', e);
    }
  }
  console.log(`Revision element: ${revision.element.innerHTML}
  Added URLs: ${addedURLs.join(' ')}
  `);

  // Match domains to indicator types
  const matchedDomains = Object.keys(indicators).reduce((acc, key) => {
    acc[key] = [];
    return acc;
  }, {});
  for (const url of addedURLs) {
    const hostname = new URL(url).hostname;
    const domain = getRootDomain(hostname, publicSuffixSet);
    let highestPriorityType = null;
    for (const type in indicators) {
      if (indicators[type].list.has(domain)) {
        if (
          highestPriorityType === null ||
          indicators[type].priority > indicators[highestPriorityType].priority
        ) {
          highestPriorityType = type;
        }
      }
    }
    if (
      highestPriorityType !== null &&
      !matchedDomains[highestPriorityType].includes(domain)
    ) {
      matchedDomains[highestPriorityType].push(domain);
      for (const type in indicators) {
        if (
          indicators[type].priority < indicators[highestPriorityType].priority
        ) {
          matchedDomains[type] = matchedDomains[type].filter(d => d !== domain);
        }
      }
    }
  }

  // Prepend emoji indicators
  for (const type in indicators) {
    if (matchedDomains[type].length > 0) {
      prependEmojiWithTooltip(revision.element, type, matchedDomains[type]);
    }
  }
}

/**
 * Prepends an emoji and tooltip to a revision list entry DOM element if any
 * domains matched a warning list.
 *
 * @param {HTMLElement} element - The container element to prepend the emoji to.
 * @param {string} type - The type of indicator ('warning', 'caution', 'inspect').
 * @param {string[]} domains - The list of matched domains for the indicator.
 */
function prependEmojiWithTooltip(element, type, domains) {
  const indicator = indicators[type];
  if (!indicator || element.getAttribute(`data-processed-${type}`) === 'true') {
    return;
  }
  const emojiSpan = document.createElement('span');
  emojiSpan.textContent = indicator.emoji + " ";
  emojiSpan.title = `${indicator.msg}: ${domains.join(", ")}`;
  element.parentNode.insertBefore(emojiSpan, element);
  element.setAttribute(`data-processed-${type}`, 'true');
}

/**
 * Extracts the first page object from MediaWiki API query response.
 *
 * @param {Object} data - MediaWiki API response.
 * @returns {Object|null} The first page object or null if unavailable.
 */
async function getFirstPage(data) {
  if (!data || !data.query || !data.query.pages) return null;
  const pages = data.query.pages;
  return Object.values(pages)[0]; // Return the first page
}

/**
 * Retrieves the first revision from a page object.
 *
 * @param {Object} page - Page object containing revisions.
 * @returns {Object|null} First revision object or null.
 */
async function getFirstRevision(page) {
  if (page.revisions && page.revisions.length > 0) {
    return page.revisions[0];
  }
  return null;
}

/**
 * Fetches wikitext content for one or two revisions by ID.
 *
 * @param {string[]} revIds - Array of revision IDs.
 * @returns {Object} Object with `oldrevision` and optionally `newrevision` as wikitext strings.
 */
async function fetchRevisionContent(revIds) {
  const data = await fetchRevisionData({
    revids: revIds,
    rvprop: ['content'],
    rvslots: ['main']
  });
  const page = await getFirstPage(data);
  const wikitext = { oldrevision: null, newrevision: null };
  if (page && page.revisions && page.revisions.length > 0) {
    wikitext.oldrevision = page.revisions[0].slots.main['*'] || null;
    if (page.revisions.length > 1) {
      wikitext.newrevision = page.revisions[1].slots.main['*'] || null;
    }
  }
  return wikitext;
}

/**
 * Fetches the parent revision IDs for a given list of revision IDs.
 *
 * @param {string[]} revisionIds - Array of revision IDs.
 * @returns {Object} Map of revision ID to its parent ID.
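 *
 * @example
 * // Illustrative: returns { 123457: 123456 } if 123456 is the parent of 123457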
 */
async function fetchPreviousRevisionIds(revisionIds) {
  const data = await fetchRevisionData({
    revids: revisionIds,
    rvprop: ['ids']
  });
  const page = await getFirstPage(data);
  if (!page || !page.revisions) return {};
  const revisionMap = {};
  for (const revision of page.revisions) {
    revisionMap[revision.revid] = revision.parentid;
  }
  return revisionMap;
}

/**
 * Fetches the ID of the first revision of a page.
 *
 * @param {string} pageTitle - The page title to look up.
 * @returns {number|null} Revision ID or null.
 */
async function fetchFirstRevisionId(pageTitle) {
  const data = await fetchRevisionData({
    titles: [pageTitle],
    rvlimit: 1,
    rvdir: 'newer',
    rvprop: ['ids'],
  });
  const page = await getFirstPage(data);
  if (!page) return null;
  const revision = await getFirstRevision(page);
  return revision ? revision.revid : null;
}

/**
 * Fetches the list of subpages from the list of lists, parses wikilinks, caches
 * the result, and returns list of subpage titles.
 *
 * @param {string} pageName - Title of the list-of-lists page.
 * @returns {Promise<string[]>} List of subpage titles.
 */
async function fetchDomainListPages(pageName) {
  const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
  const cacheExpiration = 4 * 60 * 60 * 1000;
  const now = Date.now();
  const cachedData = localStorage.getItem(cacheKey);
  const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);
  if (cachedData && cachedTimestamp && (now - parseInt(cachedTimestamp, 10)) <
    cacheExpiration) {
    console.log("Loaded list of lists from cache");
    return JSON.parse(cachedData);
  }
  const data = await fetchRevisionData({
    titles: [pageName],
    rvprop: ['content'],
    rvslots: ['*']
  });
  const page = await getFirstPage(data);
  if (!page || !page.revisions) return [];
  const content = page.revisions[0].slots.main['*'];
  const pageTitles = [];
  const lines = content.split('\n');
  for (let line of lines) {
    if (line.startsWith('* [[')) {
      const match = line.match(
        /\[\[([^\]]+)\]\]/); // Matches the first instance of [[Page Title]]
      if (match) {
        pageTitles.push(match[1]);
      }
    }
  }
  localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
  localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
  console.log("Loaded from API and stored in cache");
  return pageTitles;
}

/**
 * Loads domain lists from a set of pages, categorizes them by indicator section
 * headers, and populates the corresponding `Set` in the global `indicators` object.
 *
 * @param {string[]} pageNames - List of page titles to fetch.
 * @returns {Object} Updated indicators object with domain sets.
 */
async function fetchAndOrganizeDomainLists(pageNames) {
  const cacheTTL = 6 * 60 * 60 * 1000;
  const now = Date.now();
  const cachedData = {};
  const pagesToFetch = [];
  for (const title of pageNames) {
    const cacheKey = `domainList:${location.hostname}:${title}`;
    const cached = localStorage.getItem(cacheKey);
    if (cached) {
      try {
        const parsed = JSON.parse(cached);
        if (now - parsed.timestamp < cacheTTL && parsed.content) {
          console.log(`Using cached content for page: ${title}`);
          cachedData[title] = parsed.content;
          continue;
        } else {
          console.log(`Cache expired for page: ${title}`);
        }
      } catch (e) {
        console.warn(`Cache error for ${title}:`, e);
      }
    }
    console.log(`Will fetch page: ${title}`);
    pagesToFetch.push(title);
  }
  let fetchedPages = {};
  if (pagesToFetch.length > 0) {
    const apiData = await fetchRevisionData({
      titles: pagesToFetch,
      rvprop: ['content'],
      rvslots: ['*'],
    });
    const pages = apiData && apiData.query ? apiData.query.pages : {};
    for (const pageId in pages) {
      const page = pages[pageId];
      const title = page.title;
      if (!page.revisions) continue; // skip missing or empty pages
      const content = page.revisions[0].slots.main['*'];
      fetchedPages[title] = content;
      const cacheKey = `domainList:${location.hostname}:${title}`;
      try {
        localStorage.setItem(cacheKey, JSON.stringify({
          timestamp: now,
          content,
        }));
        console.log(`Cached content for page: ${title}`);
      } catch (e) {
        console.warn(`Failed to cache ${title}:`, e);
      }
    }
  }
  const allContent = { ...cachedData, ...fetchedPages };
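  // Parse each list page: a section header (e.g. "==Warn==") selects the
  // active indicator set; subsequent "* domain" bullets are added to it.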
  for (const title in allContent) {
    const content = allContent[title];
    let currentList = null;
    const lines = content.split('\n');
    for (let line of lines) {
      for (const type in indicators) {
        if (line.trim() === indicators[type].section) {
          currentList = indicators[type].list;
          break;
        }
      }
      if (line.startsWith('*') && currentList) {
        const domain = line.substring(1).trim();
        currentList.add(domain);
      }
    }
  }
  return indicators;
}

/**
 * Fetches and caches the public suffix list used to identify top-level domains.
 *
 * @returns {Promise<Set<string>>} Set of public suffixes.
 */
async function fetchPublicSuffixList() {
  const cacheKey = 'publicSuffixListCache';
  const cacheTTL = 24 * 60 * 60 * 1000;
  const cached = localStorage.getItem(cacheKey);
  if (cached) {
    try {
      const parsed = JSON.parse(cached);
      const age = Date.now() - parsed.timestamp;
      if (age < cacheTTL && parsed.content) {
        console.log('Using cached public suffix list');
        return new Set(parsed.content.split('\n').filter(line =>
          line.trim() && !line.trim().startsWith('//')
        ).map(line => line.trim()));
      }
    } catch (e) {
      console.warn('Error parsing cache, refetching:', e);
    }
  }
  const pslUrl = mw.config.get('wgArticlePath').replace('$1', publicSuffixList) 
    + '?action=raw';
  console.log(`Raw page text request: ${pslUrl}`);
  const content = await safeFetch(fetch, pslUrl).then(response => response ?
    response.text() : null);
  if (!content) return new Set();
  try {
    localStorage.setItem(cacheKey, JSON.stringify({
      timestamp: Date.now(),
      content
    }));
  } catch (e) {
    console.warn('Failed to write to cache:', e);
  }
  const suffixSet = new Set();
  const lines = content.split('\n');
  for (const line of lines) {
    if (line.trim() && !line.trim().startsWith('//')) {
      suffixSet.add(line.trim());
    }
  }
  return suffixSet;
}

/**
 * Makes a MediaWiki API call to fetch revision metadata or content.
 *
 * @param {Object} data - Options for the API call, such as `revids`, `titles`, `rvprop`, etc.
 * @returns {Promise<Object>} MediaWiki API query result.
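 *
 * @example
 * // Illustrative call mirroring fetchRevisionContent(); IDs are hypothetical:
 * // const data = await fetchRevisionData({
 * //   revids: ['123456', '123457'], rvprop: ['content'], rvslots: ['main']
 * // });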
 */
async function fetchRevisionData(data) {
  const paramKeys = ['rvprop', 'revids', 'titles', 'rvslots'];
  const params = {
    action: 'query',
    prop: 'revisions',
    format: 'json',
    rvdir: data.rvdir || 'older',
    origin: '*'
  };
  if (data.rvlimit) { params.rvlimit = data.rvlimit; }
  paramKeys.forEach(key => {
    if (data[key]) {
      params[key] = Array.isArray(data[key]) ? data[key].join('|') : data[key];
    }
  });
  const api = new mw.Api();
  return await safeFetch(api.get.bind(api), params);
}

/**
 * Wraps any asynchronous fetch function and logs errors without throwing.
 *
 * @param {Function} fn - The function to execute (usually an API call).
 * @param {...any} args - Arguments to pass to the fetch function.
 * @returns {Promise<any|null>} Result of the fetch or null on failure.
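 *
 * @example
 * // const response = await safeFetch(fetch, url); // resolves to null on error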
 */
async function safeFetch(fn, ...args) {
  try {
    return await fn(...args);
  } catch (error) {
    console.error(`Error during ${fn.name}:`, error);
    return null;
  }
}

/**
 * Extracts all HTTP(S) URLs from a given wikitext string.
 *
 * @param {string} wikitext - Raw wikitext revision content.
 * @returns {string[]} List of valid extracted URLs.
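 *
 * @example
 * // extractAddedURLs('Cite https://example.com/article here')
 * //   => ['https://example.com/article']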
 */
function extractAddedURLs(wikitext) {
  const addedURLs = [];
  const urlRegex = /https?:\/\/[^\s<"]+/g;
  let match;
  while ((match = urlRegex.exec(wikitext)) !== null) {
    try {
      const url = new URL(match[0]);
      addedURLs.push(url.href);
    } catch (error) {
      console.error(`Invalid URL rejected: ${match[0]}`);
    }
  }
  return addedURLs;
}

/**
 * Extracts the top-level domain from a full hostname using a public suffix set.
 *
 * @param {string} hostname - Full hostname (e.g., sub.example.co.uk).
 * @param {Set<string>} publicSuffixSet - Set of known public suffixes.
 * @returns {string} The top-level domain (e.g., example.co.uk).
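 *
 * @example
 * // Assuming 'co.uk' is in the suffix set:
 * // getRootDomain('sub.example.co.uk', publicSuffixSet) => 'example.co.uk'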
 */
function getRootDomain(hostname, publicSuffixSet) {
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');
    if (publicSuffixSet.has(candidate) || publicSuffixSet.has(
        `!${candidate}`)) {
      return domainParts.slice(i - 1).join('.');
    }
  }
  return hostname;
}

/**
 * Determines whether a given page title does *not* belong to the main or draft namespaces.
 *
 * @param {string} pageTitle - The title of the page.
 * @returns {boolean} True if not an article namespace.
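 *
 * @example
 * // isNotArticle('User:Example') => true (assuming a "User:" namespace)
 * // isNotArticle('Example article') => false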
 */
function isNotArticle(pageTitle) {
  if (!pageTitle) return true;
  return namespaces.some(namespace => pageTitle.startsWith(namespace));
}

/**
 * Cleans up expired localStorage cache entries based on known cache key prefixes and TTLs.
 */
function purgeExpiredCache() {
  const now = Date.now();
  const knownCaches = [
    { prefix: 'revisionDiff:', ttl: 30 * 24 * 60 * 60 * 1000 },
    { prefix: 'domainList:', ttl: 6 * 60 * 60 * 1000 },
    { prefix: 'publicSuffixListCache', ttl: 24 * 60 * 60 * 1000 },
    { prefix: 'citationWatchlistFetchDomainListPages_', ttl: 4 * 60 * 60 * 1000 }
  ];
  // Snapshot the keys first: removing items while iterating by index would
  // shift the remaining keys and skip entries.
  const keys = [];
  for (let i = 0; i < localStorage.length; i++) {
    keys.push(localStorage.key(i));
  }
  for (const key of keys) {
    for (const cache of knownCaches) {
      if (key.startsWith(cache.prefix)) {
        try {
          if (key.endsWith('_timestamp')) {
            const baseKey = key.replace(/_timestamp$/, '');
            const timestamp = parseInt(localStorage.getItem(key), 10);
            if (isNaN(timestamp) || now - timestamp > cache.ttl) {
              localStorage.removeItem(key);
              localStorage.removeItem(baseKey);
              console.log(`Purged expired cache: ${baseKey}`);
            }
          } else {
            const value = localStorage.getItem(key);
            const parsed = JSON.parse(value);
            if (parsed && parsed.timestamp && now - parsed.timestamp > cache.ttl) {
              localStorage.removeItem(key);
              console.log(`Purged expired cache: ${key}`);
            }
          }
        } catch (e) {
          console.warn(`Failed to check or purge cache for ${key}:`, e);
        }
        break;
      }
    }
  }
}

analyzeView().then(() => console.log(
  'Citation Watchlist script finished executing'));