-- Jump to content
--
-- Module:Wikitext Parsing
--
-- Permanently protected module
-- From Wikipedia, the free encyclopedia
-- This is an old revision of this page, as edited by Aidan9382 (talk | contribs) at 10:41, 17 March 2023 (work on includeonly tag behaviour). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

require("strict")
local p = {}

--Helper functions
--Reports whether `text` begins with `subtext`.
--The comparison is a plain string comparison (no Lua pattern matching).
local function startswith(text, subtext)
	local prefixLength = #subtext
	local prefix = string.sub(text, 1, prefixLength)
	return prefix == subtext
end
--Reports whether `text` ends with `subtext`.
--The comparison is a plain string comparison (no Lua pattern matching).
--An empty `subtext` is a suffix of every string.
local function endswith(text, subtext)
	if subtext == "" then
		--With an empty suffix, -#subtext is 0 and string.sub(text, 0, -1) would
		--return the whole string, so this case has to be answered directly
		return true
	end
	return string.sub(text, -#subtext, -1) == subtext
end
--Turns `s` into a Lua pattern fragment that matches it case-insensitively by
--replacing every letter (%a) with a two-character class, e.g. "pre" becomes
--"[Pp][Rr][Ee]". Non-letters are copied through unchanged (and unescaped).
--Returns exactly one value: the resulting pattern string.
local function allcases(s)
	--Assigning to a single local drops string.gsub's second result (the
	--replacement count) so it can't leak into the caller's argument lists
	local pattern = s:gsub("%a", function(c)
		return "["..c:upper()..c:lower().."]"
	end)
	return pattern
end

--[[ Implementation notes
---- NORMAL HTML TAGS ----
Tags are very strict on how they want to start, but loose on how they end.
The start must strictly follow <[tAgNaMe](%s|>) with no room for whitespace in
the tag's name, but may then flow as they want afterwards, making
<div\nclass\n=\n"\nerror\n"\n> valid

There's no sense of escaping < or >
E.g.
 <div class="error\>"> will end at \> despite it being inside a quote
 <div class="<span class="error">error</span>"> will not process the larger div

If a tag has no end, it will consume all text instead of not processing

---- NOPROCESSING TAGS (nowiki, pre, syntaxhighlight, source) ----
(In most comments, <source> will not be mentioned. This is because it is the
deprecated version of <syntaxhighlight>)

No-Processing tags have some interesting differences to the above rules.
For example, their syntax is a lot stricter. While an opening tag appears to
follow the same set of rules, a closing tag can't have any sort of extra
formatting, period. While </div a/a> is valid, </nowiki a/a> isn't - only
newlines and spaces are allowed in closing tags (except in <pre> tags, which
follow the rules of a regular html tag for formatting).

Both the content inside the tag pair and the content inside each side of the
pair is not processed. E.g. <nowiki |}}>|}}</nowiki> would have both of the |}}
escaped in practice.

When something in the code is referenced to as a "Nowiki Tag", it means a tag
which causes wiki text to not be processed, which includes <nowiki>, <pre>,
and <syntaxhighlight>

Since we only care about these tags, we can ignore the idea of an intercepting
tag preventing processing, and just go straight for the first ending we can find
If there is no ending to find, the tag will NOT consume the rest of the text in
terms of processing behaviour (though <pre> will appear to have an effect).
Even if there is no end of the tag, the content inside the opening half will
still be unprocessed, meaning {{X20|<nowiki }}>}} wouldn't end at the first }}
despite there being no ending to the tag.

---- HTML COMMENTS AND INCLUDEONLY ----
HTML Comments are about as basic as it could get for this
Start at <!--, end at -->, no extra conditions. Simple enough
If a comment has no end, it will eat all text instead of not being processed

includeonly tags function mostly like a regular nowiki tag, with the exception
that the tag will actually consume all future text if not given an ending as
opposed to simply giving up and not changing anything.
--]]
--Names (lower case) of the tags whose content is not processed as wikitext
local validtags = {nowiki=1, pre=1, syntaxhighlight=1, source=1, includeonly=1}
--[[
Checks whether `text` begins with one of the tags in `validtags` and, if so,
describes it. This function expects the string to start with the tag.

Returns nil when the text doesn't start with such a tag, or when the opening
tag isn't closed by a ">" before the next "<".
Otherwise returns a table with the fields:
  Tag     - the tag's name in lower case
  Start   - the whole opening tag (E.g. '<pre style="color:red">')
  Content - the text between the opening and closing tag. Empty for a
            self-closing or unclosed tag, except for an unclosed includeonly,
            where it is all of the remaining text
  End     - the closing tag exactly as written, or "" if there is none
  Length  - #Start + #Content + #End, i.e. how much of `text` the tag covers
--]]
local function TestForNowikiTag(text)
	local tagName = (string.match(text, "^<([^\n />]+)") or ""):lower()
	if not validtags[tagName] then
		return nil
	end
	local nextOpener = string.find(text, "<", 2, true)
	local nextCloser = string.find(text, ">", 2, true)
	if not nextCloser or (nextOpener and nextOpener < nextCloser) then
		--The opening tag never closes, or another "<" gets in the way first
		return nil
	end

	local startingTag = string.sub(text, 1, nextCloser)
	--We have our starting tag (E.g. '<pre style="color:red">')
	--Now find our ending...
	if endswith(startingTag, "/>") then --self-closing tag (we are our own ending)
		return {
			Tag = tagName,
			Start = startingTag,
			Content = "", End = "",
			Length = #startingTag
		}
	end

	--Search by position, starting after the opening tag, so that the ending
	--we report is always the first one that actually occurs in the text
	local searchFrom = nextCloser + 1
	local endStart, endStop
	if tagName == "pre" then --Looser restrictions for <pre>
		--Lua patterns have no | so we use 2 searches and keep the earliest hit
		local strictStart, strictStop =
			string.find(text, "</[Pp][Rr][Ee]>", searchFrom)
		local looseStart, looseStop =
			string.find(text, "</[Pp][Rr][Ee][ \t\n/][^<]*>", searchFrom)
		if strictStart and (not looseStart or strictStart < looseStart) then
			endStart, endStop = strictStart, strictStop
		else
			endStart, endStop = looseStart, looseStop
		end
	else
		endStart, endStop =
			string.find(text, "</"..allcases(tagName).."[ \t\n]*>", searchFrom)
	end

	if endStart then --Regular tag formation
		local tagContent = string.sub(text, searchFrom, endStart-1)
		local endingTag = string.sub(text, endStart, endStop)
		return {
			Tag = tagName,
			Start = startingTag,
			Content = tagContent,
			End = endingTag,
			Length = #startingTag + #tagContent + #endingTag
		}

	elseif tagName == "includeonly" then --Doesn't require an ending
		local tagContent = string.sub(text, searchFrom)
		return {
			Tag = tagName,
			Start = startingTag,
			Content = tagContent,
			End = "", Length = #startingTag + #tagContent --#text
		}
	end

	--No ending found. Content inside the opening tag still needs escaping
	--(also linter error!)
	return {
		Tag = tagName,
		Start = startingTag,
		Content = "", End = "",
		Length = #startingTag
	}
end
--[[
Like TestForNowikiTag but for HTML comments (<!-- -->).
Expects `text` to start with the comment; returns nil if it doesn't.
Otherwise returns a table with the fields:
  Start   - always "<!--"
  End     - "-->" if the comment is terminated, "" if it isn't
  Content - the text between Start and End (or to the end of `text`)
  Length  - how much of `text` the comment covers
--]]
local function TestForComment(text)
	local opener, closer = "<!--", "-->"
	if not startswith(text, opener) then
		return nil
	end

	local contentStart = #opener + 1
	local closerAt = string.find(text, closer, contentStart, true)
	if not closerAt then
		--An unterminated comment consumes all of the remaining text
		return {
			Start = opener, End = "",
			Content = string.sub(text, contentStart),
			Length = #text
		}
	end

	return {
		Start = opener, End = closer,
		Content = string.sub(text, contentStart, closerAt - 1),
		Length = closerAt + #closer - 1
	}
end

--[[ Implementation notes
The goal of this function is to escape all text that wouldn't be parsed if it
was preprocessed (see above implementation notes).

Using keepComments will keep all HTML comments instead of removing them. They
will still be escaped regardless to avoid processing errors
--]]
--[[
Returns `text` with everything that wouldn't be processed as wikitext passed
through mw.text.nowiki: the inside and content of the tags recognised by
TestForNowikiTag, and the content of HTML comments.

text         - the wikitext to process
keepComments - if truthy, HTML comments are kept (with escaped content);
               otherwise they are dropped from the output entirely
--]]
local function EscapeEscapedText(text, keepComments) --What a name!
	--Output is collected piece by piece and joined once at the end, which
	--avoids re-copying the whole result on every append
	local pieces = {}
	while text ~= "" do
		local NextCheck = string.find(text,"<[NnSsPpIi!]") --Advance to the next potential tag we care about
		if not NextCheck then --Done
			pieces[#pieces+1] = text
			break
		end
		pieces[#pieces+1] = string.sub(text,1,NextCheck-1)
		text = string.sub(text, NextCheck) --text now starts at the "<"
		local Comment = TestForComment(text)
		if Comment then
			if keepComments then
				pieces[#pieces+1] = Comment.Start .. mw.text.nowiki(Comment.Content) .. Comment.End
			end
			text = string.sub(text, Comment.Length+1)
		else
			local Tag = TestForNowikiTag(text)
			if Tag then
				--Escape what's between the < > of each half, not the < > themselves
				local newTagStart = "<" .. mw.text.nowiki(string.sub(Tag.Start,2,-2)) .. ">"
				local newTagEnd =
					Tag.End == "" and "" or --Respect no tag ending
					"</" .. mw.text.nowiki(string.sub(Tag.End,3,-2)) .. ">"
				local newContent = mw.text.nowiki(Tag.Content)
				pieces[#pieces+1] = newTagStart .. newContent .. newTagEnd
				text = string.sub(text, Tag.Length+1)
			else --Nothing special, copy the "<" through and move on...
				pieces[#pieces+1] = string.sub(text, 1, 1)
				text = string.sub(text, 2)
			end
		end
	end
	return table.concat(pieces)
end

--Main entry points
p.EET = EscapeEscapedText
p.EscapeEscapedText = EscapeEscapedText
--Entry point that guards against being called through #invoke.
--If o1 is a table with a getParent field (presumably a frame object), an
--error message is returned as wikitext. Otherwise the arguments are passed
--straight to EscapeEscapedText as (text, keepComments).
function p.main(o1, o2)
	local looksLikeFrame = type(o1) == "table" and o1.getParent
	if not looksLikeFrame then --Module invocation, defer to EET
		return EscapeEscapedText(o1, o2)
	end
	--Template invocation most likely
	return "<span class='error'>This module can't be invoked directly via a Template. " ..
	"See [[Module:Sandbox/Aidan9382/ExcessiveParsing/doc]] for proper usage of this module</span>"
end
--Extra entry points, likely not required
p.TestForNowikiTag = TestForNowikiTag
p.TFNWT = TestForNowikiTag
p.TestForComment = TestForComment
p.TFC = TestForComment --Abbreviation following the same pattern as EET and TFNWT
p.TTC = TestForComment --Older spelling of the abbreviation, kept for existing callers

return p

--[[ console tests
local s = [=[Hey!{{Text|<nowiki | ||>
Hey! }}
A</nowiki>|<!--AAAAA|AAA-->Should see|Shouldn't see}}]=]
local out = p.EET(s)
mw.logObject(out)

local s = [=[<!--
Hey!
-->A]=]
local out = p.TestForComment(s)
mw.logObject(out); mw.log(string.sub(s, 1, out.Length))
]]