Jump to content

Module:Wikitext Parsing

Permanently protected module
From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Aidan9382 (talk | contribs) at 19:11, 16 March 2023 (give it some proper entry points). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

require("strict")
local p = {}

--Helper functions
--Returns true when `text` begins with exactly `subtext`
local function startswith(text, subtext)
	local prefixLength = #subtext
	local head = string.sub(text, 1, prefixLength)
	return head == subtext
end
--Returns true when `text` ends with exactly `subtext`
local function endswith(text, subtext)
	--An empty suffix always matches. It needs its own branch because -#"" is 0,
	--and string.sub(text, 0) returns the whole text rather than ""
	if subtext == "" then
		return true
	end
	return string.sub(text, -#subtext) == subtext
end
--Turns every letter of `s` into a character class matching both of its cases
--(E.g. "pre" -> "[Pp][Rr][Ee]") so the result can be used as a
--case-insensitive Lua pattern. Non-letters are left untouched.
local function allcases(s)
	--The outer parentheses drop gsub's second result (the substitution count)
	--so that callers only ever receive the pattern string
	return (s:gsub("%a", function(c)
		return "["..c:upper()..c:lower().."]"
	end))
end

--[[ Implementation notes
---- NORMAL HTML TAGS ----
Tags are very strict on how they want to start, but loose on how they end.
The start must strictly follow <[tAgNaMe] with no room for whitespace, but may
then flow as they want afterwards, making <div\nclass\n=\n"\nerror\n"\n> valid

There's no sense of escaping < or >
E.g.
 <div class="error\>"> will end at the first \> despite it being inside a quote
 <div class="<span class="error">error</span>"> will not process the larger div

If a tag has no end, it will consume all text instead of not processing

---- NOPROCESSING TAGS (nowiki, pre, syntaxhighlight, source) ----
SIDENOTE: For this specific paragraph, <pre> functions like a normal tag, so the
rules from above apply instead.
No-Processing tags have some interesting differences to the above rules.
For example, their syntax is a lot stricter. While an opening tag appears to
follow the same set of rules, a closing tag can't have any sort of extra
formatting at all. While </div a/a> is valid, </nowiki a/a> isn't - only
spaces, tabs and newlines are allowed in closing tags.

Both the content inside the tag pair and the content inside each side of the
pair is not processed. E.g. <nowiki |}}>|}}</nowiki> would have both of the |}}
escaped in practice

When something in the code is referred to as a "NowikiTag", it means a tag
which causes wiki text to not be processed, which includes <nowiki>, <pre>,
and <syntaxhighlight>

(In most comments, <source> will not be mentioned. This is because it is the
deprecated version of <syntaxhighlight>)

Since we only care about these 3 tags, we ignore the idea of an intercepting tag
preventing processing, and just go straight for the first ending we can find

---- HTML COMMENT ----
HTML Comments are about as basic as it could get for this
Start at <!--, end at -->, no extra conditions. Simple enough
If a comment has no end, it will eat all text instead of not being processed
--]]
--Lowercased names of the tags whose content is not processed as wikitext
local validtags = {nowiki=1, pre=1, syntaxhighlight=1, source=1}
--[[ TestForNowikiTag(text)
Checks whether `text` begins with a no-processing tag (one of validtags).
This function expects the string to start with the tag.

Returns nil if `text` doesn't start with such a tag, if another "<" appears
before the opening tag's ">", or if a nowiki/syntaxhighlight/source tag has no
closing tag. Otherwise returns a table with the fields:
  Tag     - the lowercased tag name
  Start   - the full opening tag (E.g. '<pre style="color:red">')
  Content - the text between the opening and closing tag
  End     - the full closing tag, or "" if there is none
  Length  - how many characters of `text` the whole construct covers
--]]
local function TestForNowikiTag(text)
	local tagName = (string.match(text, "^<([^\n />]+)") or ""):lower()
	if not validtags[tagName] then
		return nil
	end
	--The opening tag is only usable if its ">" comes before any other "<"
	local nextOpener = string.find(text, "<", 2, true)
	local nextCloser = string.find(text, ">", 2, true)
	if not nextCloser or (nextOpener and nextOpener < nextCloser) then
		return nil
	end
	--We have our starting tag (E.g. '<pre style="color:red">')
	local startingTag = string.sub(text, 1, nextCloser)
	if endswith(startingTag, "/>") then --self-closing tag (we are our own ending)
		return {
			Tag = tagName,
			Start = startingTag,
			Content = "", End = "",
			Length = #startingTag
		}
	end

	--Now find our ending...
	local closerStart, closerEnd
	if tagName == "pre" then
		--Lua patterns have no | so the 2 accepted forms are searched separately
		--and whichever starts first in the text wins. The looser form stops at
		--the first ">" since tags have no sense of escaping < or >
		local exactStart, exactEnd =
			string.find(text, "</[Pp][Rr][Ee]>", nextCloser)
		local looseStart, looseEnd =
			string.find(text, "</[Pp][Rr][Ee][ \t\n/][^<>]*>", nextCloser)
		if looseStart and (not exactStart or looseStart < exactStart) then
			closerStart, closerEnd = looseStart, looseEnd
		else
			closerStart, closerEnd = exactStart, exactEnd
		end
		if not closerStart then --<pre> consumes all text if unended
			closerStart, closerEnd = #text+1, #text --yields an empty ending tag
		end
	else --<nowiki> and <syntaxhighlight> are stricter
		closerStart, closerEnd =
			string.find(text, "</"..allcases(tagName).."[ \t\n]*>", nextCloser)
		if not closerStart then
			return nil
		end
	end

	local tagContent = string.sub(text, nextCloser+1, closerStart-1)
	local endingTag = string.sub(text, closerStart, closerEnd)
	return {
		Tag = tagName,
		Start = startingTag,
		Content = tagContent,
		End = endingTag,
		Length = #startingTag + #tagContent + #endingTag
	}
end
--Counterpart of TestForNowikiTag for HTML comments (<!-- -->).
--Expects `text` to start with the comment; returns nil when it doesn't.
--Otherwise returns a table with Start, End, Content and Length fields.
local function TestForComment(text)
	if not startswith(text, "<!--") then
		return nil
	end

	local result = {Start = "<!--"}
	local closerAt = string.find(text, "-->", 5, true)
	if closerAt then
		result.End = "-->"
		result.Content = string.sub(text, 5, closerAt-1)
		result.Length = closerAt+2 --closerAt is the first "-" of the "-->"
	else --An unended comment swallows the rest of the text
		result.End = ""
		result.Content = string.sub(text, 5)
		result.Length = #text
	end
	return result
end

--[[ Implementation notes
The goal of this function is to escape all text that wouldn't be parsed if it
was preprocessed (anything in nowiki, pre, syntaxhighlight, or <!----> tags).

Using keepComments will keep all HTML comments instead of removing them. They
will still be escaped to avoid processing errors
--]]
--[[ EscapeEscapedText(text, keepComments)
Walks through `text` and rewrites every HTML comment and no-processing tag
found by TestForComment / TestForNowikiTag:
 * Comments are dropped, unless keepComments is truthy, in which case they are
   kept with their content passed through mw.text.nowiki
 * For no-processing tags, the inside of the opening tag, the content and the
   inside of the closing tag are each passed through mw.text.nowiki
All other text is copied across unchanged. Returns the resulting string.
--]]
local function EscapeEscapedText(text, keepComments) --What a name!
	--Pieces of output are collected here and joined once at the end, which
	--avoids re-copying the whole result on every append
	local pieces = {}
	while text ~= "" do
		local NextCheck = string.find(text,"<[NnSsPp!]") --Advance to the next potential tag we care about
		if not NextCheck then --Done
			pieces[#pieces+1] = text
			break
		end
		pieces[#pieces+1] = string.sub(text,1,NextCheck-1)
		text = string.sub(text, NextCheck) --The testers expect the tag at the very start
		local Comment = TestForComment(text)
		if Comment then
			if keepComments then
				pieces[#pieces+1] = Comment.Start .. mw.text.nowiki(Comment.Content) .. Comment.End
			end
			text = string.sub(text, Comment.Length+1)
		else
			local Tag = TestForNowikiTag(text)
			if Tag then
				--Strip the surrounding < > (and </ >) before escaping so that
				--the tag delimiters themselves are kept as they were
				local newTagStart = "<" .. mw.text.nowiki(string.sub(Tag.Start,2,-2)) .. ">"
				local newTagEnd =
					Tag.End == "" and "" or --if no end tag, keep it that way
					"</" .. mw.text.nowiki(string.sub(Tag.End,3,-2)) .. ">"
				local newContent = mw.text.nowiki(Tag.Content)
				pieces[#pieces+1] = newTagStart .. newContent .. newTagEnd
				text = string.sub(text, Tag.Length+1)
			else --Nothing special, move on...
				pieces[#pieces+1] = string.sub(text, 1, 1)
				text = string.sub(text, 2)
			end
		end
	end
	return table.concat(pieces)
end

--Main entry points
p.EscapeEscapedText = EscapeEscapedText
p.EET = EscapeEscapedText
--[[ p.main(o1, o2)
Accepts either a frame object (anything table-like with a getParent field),
in which case the text and keepComments are read from the frame's first and
second arguments, or the text and keepComments directly, in which case the
call is passed straight on to EscapeEscapedText.
--]]
function p.main(o1, o2)
	if type(o1) == "table" and o1.getParent then --Template invocation most likely
		--return "<span class='error'>See [[Module:Sandbox/Aidan9382/ExcessiveParsing/doc]] for proper usage of this module</span>"
		local args = o1.args
		--A missing first argument would otherwise reach string.find as nil
		local text = args[1] or ""
		--Frame arguments arrive as strings, and every string is truthy in Lua,
		--so a blank second argument (E.g. "|2=") must not turn keepComments on
		local keepComments = args[2]
		if keepComments ~= nil and string.match(keepComments, "^%s*$") then
			keepComments = false
		end
		return EscapeEscapedText(text, keepComments)
	end
	--Module invocation, defer to EET
	return EscapeEscapedText(o1, o2)
end
--Extra entry points, likely not required
p.TestForNowikiTag = TestForNowikiTag
p.TFNWT = TestForNowikiTag
p.TestForComment = TestForComment
p.TFC = TestForComment --Initials of TestForComment, matching the TFNWT style
p.TTC = TestForComment --Misspelt form of TFC, kept so existing callers don't break

return p

--[[ console tests
local s = [=[Hey!{{Text|<nowiki | ||>
Hey! }}
A</nowiki>|<!--AAAAA|AAA-->Should see|Shouldn't see}}]=]
local out = p.EET(s)
mw.logObject(out)

local s = [=[<!--
Hey!
-->A]=]
local out = p.TFC(s)
mw.logObject(out); mw.log(string.sub(s, 1, out.Length))
]]