Jump to content

Module:Diff/sandbox

From Wikipedia, the free encyclopedia
This is the current revision of this page, as edited by Waddie96 (talk | contribs) at 16:38, 9 October 2025 (fixx22). The present address (URL) is a permanent link to this version.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)
-----------------------------------------------------------------------------
-- Provides functions for getting the difference between two pieces of text.
--
-- (c) 2007, 2008  Yuri Takhteyev (yuri@freewisdom.org)
-- (c) 2007 Hisham Muhammad
-- Adapted to MediaWiki Lua originally by User:Ebrahim
--
-- License: MIT/X, see http://sputnik.freewisdom.org/en/License
-----------------------------------------------------------------------------

--- Localization of globals
local lstring = mw.ustring
local ltext = mw.text
local table_insert = table.insert
local table_remove = table.remove
local html_create = mw.html.create
local p = {}

local SKIP_SEPARATOR = true -- a constant
local STYLES = 'src = "Module:Diff/styles.css"'

-- Token statuses
local IN = "in"
local OUT = "out"
local SAME = "same"

-----------------------------------------------------------------------------
--- Split a string into tokens.
-- (Adapted from Gavin Kistner's split on http://lua-users.org/wiki/SplitJoin.)
--
--- @param text string Input string to be split.
--- @param separator string|'%s+'? the separator pattern (default: whitespace)
--- @param skip_separator boolean? don't include the separator in the results.
--- @return table token_list list of tokens.
-----------------------------------------------------------------------------
local function split(text, separator, skip_separator)
	skip_separator = skip_separator or SKIP_SEPARATOR
	separator = separator or "%s+"
	local parts = {}
	local start = 1
	local split_start, split_end = lstring.find(text, separator, start)
	while split_start do
		table_insert(parts, lstring.sub(text, start, split_start - 1))
		if not skip_separator then
			table_insert(parts, lstring.sub(text, split_start, split_end))
		end
		start = split_end + 1
		split_start, split_end = lstring.find(text, separator, start)
	end
	if lstring.sub(text, start) ~= "" then
		table_insert(parts, lstring.sub(text, start))
	end
	return parts
end

-----------------------------------------------------------------------------
--- Derives the longest common subsequence of two strings.  This is a faster
--- implementation than one provided by stdlib.  Submitted by Hisham Muhammad.
-- (The algorithm was taken from:
-- http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_subsequence)
--
--- NOTE: High-level design choice.
--- This implementation makes use of nested metatables to create a sparse,
--- auto-initializing 2D array. This is clever but complex and slightly
--- unconventional in order to create the LCS matrix only as needed.
--- It's a neat trick, but it may be slower than a more traditional approach,
--- especially for large strings.  It also uses more memory, since the matrix
--- is not a sparse array and the matrix is of size (m+1)×(n+1), where m and n
--- are the lengths of the two strings t1 and t2.
--- While the metatable approach is compact, it increases mental overhead too.
--- Sparse array initialization and explicitly managing state with tables would
--- just as helpful and could improve performance.
---
--- @param t1 any first string.
--- @param t2 any second string.
--- @return table lcs_matrix least common subsequence as a matrix.
-----------------------------------------------------------------------------
local function quick_LCS(t1, t2)
	local m = #t1
	local n = #t2

	-- Build the dynamic programming matrix C on demand using nested metatables.
	-- This creates a "sparse array" structure where C[i][j] automatically
	-- initializes to 0 when first accessed, avoiding the need for explicit
	-- initialization of a potentially large (m+1) x (n+1) table.
	local C = {}
	local setmetatable = setmetatable
	local mt_tbl = {
		__index = function(t, k)
			t[k] = 0
			return 0
		end,
	}
	local mt_C = {
		__index = function(t, k)
			local tbl = {}
			setmetatable(tbl, mt_tbl)
			t[k] = tbl
			return tbl
		end,
	}
	setmetatable(C, mt_C)
	local max = math.max

	-- Loops run from 1 to m and 1 to n.
	for i = 1, m do
		local ci1 = C[i + 1]
		local ci = C[i]
		for j = 1, n do
			-- Compare the tokens directly using 1-based indices (t1[i] and t2[j]).
			if t1[i] == t2[j] then
				ci1[j + 1] = ci[j] + 1
			else
				ci1[j + 1] = max(ci1[j], ci[j + 1])
			end
		end
	end
	return C
end

-----------------------------------------------------------------------------
--- Formats an inline diff as HTML, with <ins> and <del> tags.
--
--- @param tokens table table of {token, status} key-value pairs.
--- @return string diff_buffer HTML string.
-----------------------------------------------------------------------------
local function format_as_html(tokens)
	local diff_buffer = ""
	local token, status
	for _, token_record in ipairs(tokens) do
		token = ltext.nowiki(token_record[1])
		status = token_record[2]
		if status == "in" then
			diff_buffer = diff_buffer .. "<ins>" .. token .. "</ins>"
		elseif status == "out" then
			diff_buffer = diff_buffer .. "<del>" .. token .. "</del>"
		else
			diff_buffer = diff_buffer .. token
		end
	end
	return diff_buffer
end

-----------------------------------------------------------------------------
-- Returns a diff of two strings in tuples/two-element array, where the key
-- is a token and the value the token's status ("same", "in", "out").
--
--- @param old string "old" text string
--- @param new string "new" text string
--- @param separator string|'%s+'? separator pattern (default: whitespace).
--- @return table<string, 'same'|'in'|'out'> token_list Two-element array/tuples of annotated tokens.
-----------------------------------------------------------------------------
local function diff(old, new, separator)
	assert(old)
	assert(new)
	local tbl_new = split(new, separator)
	local tbl_old = split(old, separator)

	-- First, compare the beginnings and ends of strings to remove the common
	-- prefix and suffix.
	local prefix = "" -- common text in the beginning
	local suffix = "" -- common text in the end

	while tbl_old[1] and tbl_old[1] == tbl_new[1] do
		local token = table_remove(tbl_old, 1)
		table_remove(tbl_new, 1)
		prefix = prefix .. token
	end
	while tbl_old[#tbl_old] and tbl_old[#tbl_old] == tbl_new[#tbl_new] do
		local token = table_remove(tbl_old)
		table_remove(tbl_new)
		suffix = token .. suffix
	end

	-- Setup the table that will store the diff in reverse order.
	-- ... (rest of the function is now fine as it uses tbl_old/tbl_new)

	local rev_diff = {}
	-- Define local helper functions to manage token insertion into rev_diff.
	local function put(token, type)
		table_insert(rev_diff, { token, type })
	end
	local function ins(token)
		put(token, IN)
	end
	local function del(token)
		put(token, OUT)
	end
	local function same(token)
		if token then
			put(token, SAME)
		end
	end

	-- Put the suffix as the first token (we are storing the diff in the
	-- reverse order)
	same(suffix)
	-- Define a function that will scan the LCS matrix backwards and build the
	-- diff output recursively.
	local function get_diff(C, tbl_old, tbl_new, i, j)
		local old_i = tbl_old[i]
		local new_j = tbl_new[j]
		if i >= 1 and j >= 1 and old_i == new_j then
			same(old_i)
			return get_diff(C, tbl_old, tbl_new, i - 1, j - 1)
		else
			local Cij1 = C[i][j - 1]
			local Ci1j = C[i - 1][j]
			if j >= 1 and (i == 0 or Cij1 >= Ci1j) then
				ins(new_j)
				return get_diff(C, tbl_old, tbl_new, i, j - 1)
			elseif i >= 1 and (j == 0 or Cij1 < Ci1j) then
				del(old_i)
				return get_diff(C, tbl_old, tbl_new, i - 1, j)
			end
		end
	end

	-- Then call it.
	get_diff(quick_LCS(tbl_old, tbl_new), tbl_old, tbl_new, #tbl_old + 1, #tbl_new + 1)
	-- Put the prefix in at the end
	same(prefix)

	-- Reverse the diff.
	local tbl_diff = {}
	for i = #rev_diff, 1, -1 do
		table_insert(tbl_diff, rev_diff[i])
	end
	tbl_diff.to_html = format_as_html
	return tbl_diff
end

local function attach_styles(base)
	local templatestyles_tag
	if not templatestyles_tag then
		templatestyles_tag = lstring.format(
			"%s %s",
			mw.getCurrentFrame():extensionTag({
				name = "templatestyles",
				content = "",
				args = { STYLES },
			}),
			base
		)
	else
		return lstring.format("%s%s", templatestyles_tag, base)
	end
end

-----------------------------------------------------------------------------
-- Wiki diff style, currently just for a line
-----------------------------------------------------------------------------
local function wikiDiff(old, new, separator)
	local tokens = diff(old, new, separator)
	local root = html_create("")

	local token, status
	local invalidStyles = "-webkit-border-end-width: 1px;"
		.. "-webkit-border-start-width: 4px;"
		.. "-moz-border-end-width: 1px;"
		.. "-moz-border-start-width: 4px;"

	local tr = root
		:tag("table")
		:addClass("mdl-diff")
		:attr("role", "none")
		-- :attr("aria-label", "text difference, table, two columns," ..
		-- " first column content removed, second column content added")
		:tag(
			"tr"
		)

	tr:tag("td"):addClass("mdl-diff-marker"):attr("aria-hidden", "true"):wikitext("−")

	local deleted = tr
		:tag("td")
		:attr("aria-label", "Removed")
		:addClass("mdl-diff-deletedline")
		:addClass("mdl-diff-changedline")
		:css(invalidStyles) -- Workaround for a bug in Firefox where border styles are ignored on table cells
		:addClass("monospace")
		:addClass("monospaced")
		:tag("div")
		:addClass("mdl-diff-div")

	for _, token_record in ipairs(tokens) do
		token = ltext.nowiki(token_record[1]):gsub("\n", "&#10;") -- Force all newlines to encode to avoid linter issues
		status = token_record[2]
		if status == OUT then
			deleted:tag("del"):addClass("mdl-diff-deletedline"):addClass("mdl-diff-change-inline"):wikitext(token)
		elseif status == SAME then
			deleted:wikitext(token)
		end
	end

	tr:tag("td"):addClass("mdl-diff-marker"):attr("aria-hidden", "true"):wikitext("+")

	local inserted = tr:tag("td")
		:attr("aria-label", "Added")
		:addClass("mdl-diff-addedline")
		:addClass("mdl-diff-changedline")
		:css(invalidStyles)
		:addClass("monospace")
		:addClass("monospaced")
		:tag("div")
		:addClass("mdl-diff-div")

	for _, token_record in ipairs(tokens) do
		token = ltext.nowiki(token_record[1]):gsub("\n", "&#10;") -- Force all newlines to encode to avoid linter issues
		status = token_record[2]
		if status == IN then
			inserted:tag("ins"):addClass("mdl-diff-addedline"):addClass("mdl-diff-change-inline"):wikitext(token)
		elseif status == SAME then
			inserted:wikitext(token)
		end
	end

	return attach_styles(tostring(root))
end

local function main(frame)
	return wikiDiff(
		ltext.decode(ltext.unstrip(frame.args[1])),
		ltext.decode(ltext.unstrip(frame.args[2])),
		frame.args[3] or "[%s%.:-]+"
	)
end

p = {
	diff = diff,
	wikiDiff = wikiDiff,
	main = main,
}

return p