Jump to content

Module:Wikitext Parsing

Permanently protected module
From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Aidan9382 (talk | contribs) at 18:43, 26 March 2023 (Properly decode everything). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

require("strict")

--Helper functions
--Returns true when `text` begins with `subtext` (plain comparison, no patterns)
local function startswith(text, subtext)
	local prefix = string.sub(text, 1, #subtext)
	return prefix == subtext
end
--Returns true when `text` finishes with `subtext` (plain comparison, no patterns)
local function endswith(text, subtext)
	--An empty suffix always matches. It needs its own case because -#subtext
	--would be 0, and string.sub(text, 0, -1) returns the whole text
	if subtext == "" then
		return true
	end
	return string.sub(text, -#subtext, -1) == subtext
end
--Turns every letter of `s` into a two-letter class so the result can be used
--as a case-insensitive Lua pattern, e.g. "pre" -> "[Pp][Rr][Ee]"
--Only the pattern is returned (gsub's replacement count is dropped), so the
--result is safe to use as the last argument of a call
local function allcases(s)
	local pattern = s:gsub("%a", function(c)
		return "["..c:upper()..c:lower().."]"
	end)
	return pattern
end

--[=[ Implementation notes
---- NORMAL HTML TAGS ----
Tags are very strict on how they want to start, but loose on how they end.
The start must strictly follow <[tAgNaMe](%s|>) with no room for whitespace in
the tag's name, but may then flow as they want afterwards, making
<div\nclass\n=\n"\nerror\n"\n> valid

There's no sense of escaping < or >
E.g.
 <div class="error\>"> will end at \> despite it being inside a quote
 <div class="<span class="error">error</span>"> will not process the larger div

If a tag has no end, it will consume all text instead of not processing

---- NOPROCESSING TAGS (nowiki, pre, syntaxhighlight, source) ----
(In most comments, <source> will not be mentioned. This is because it is the
deprecated version of <syntaxhighlight>)

No-Processing tags have some interesting differences to the above rules.
For example, their syntax is a lot stricter. While an opening tag appears to
follow the same set of rules, A closing tag can't have any sort of extra
formatting period. While </div a/a> is valid, </nowiki a/a> isn't - only
newlines and spaces are allowed in closing tags (except in <pre> tags, which
follow the rules of a regular html tag for formatting).

Both the content inside the tag pair and the content inside each side of the
pair is not processed. E.g. <nowiki |}}>|}}</nowiki> would have both of the |}}
escaped in practice.

When something in the code is referenced to as a "Nowiki Tag", it means a tag
which causes wiki text to not be processed, which includes <nowiki>, <pre>,
and <syntaxhighlight>

Since we only care about these tags, we can ignore the idea of an intercepting
tag preventing processing, and just go straight for the first ending we can find
If there is no ending to find, the tag will NOT consume the rest of the text in
terms of processing behaviour (though <pre> will appear to have an effect).
Even if there is no end of the tag, the content inside the opening half will
still be unprocessed, meaning {{X20|<nowiki }}>}} wouldn't end at the first }}
despite there being no ending to the tag.

Note that there are some tags, like <math>, which also function like <nowiki>
which are included in this as well. Some other tags, like <ref>, have far too
unpredictable behaviour to be handled currently (they'd have to be split and
processed as something separate - it's complicated, but maybe not impossible.)
I suspect that every tag listed in [[Special:Version]] may behave somewhat like
this, but that's far too many cases worth checking for rarely used tags that may
not even have a good reason to contain {{ or }} anyways, so we leave them alone.

---- HTML COMMENTS AND INCLUDEONLY ----
HTML Comments are about as basic as it could get for this
Start at <!--, end at -->, no extra conditions. Simple enough
If a comment has no end, it will eat all text instead of not being processed

includeonly tags function mostly like a regular nowiki tag, with the exception
that the tag will actually consume all future text if not given an ending as
opposed to simply giving up and not changing anything. Due to complications and
the fact that this is far less likely to be present on a page, as well as being
something that may not want to be escaped, includeonly tags are ignored during
processing
--]=]
--Tags whose content (and whose own opening/closing halves) is never processed
local validtags = {nowiki=1, pre=1, syntaxhighlight=1, source=1, math=1}
--Tests whether `text` begins with one of the no-processing tags above
--This function expects the string to start with the tag
--Returns nil if it doesn't, or if the opening tag is malformed (no ">", or
--another "<" appears before the ">"). Otherwise returns a table with:
-- Tag: The lowercased tag name
-- Start: The full opening tag (E.g. '<pre style="color:red">')
-- Content: The text between the opening and closing tag
-- End: The closing tag exactly as written
-- Length: The total length of Start + Content + End
--For a self-closing tag, or a tag with no closing half, Content and End are ""
--and Length only covers the opening tag
local function TestForNowikiTag(text)
	--The name ends at the first whitespace, "/" or ">"
	local tagName = (string.match(text, "^<([^%s/>]+)") or ""):lower()
	if not validtags[tagName] then
		return nil
	end
	local nextOpener = string.find(text, "<", 2, true)
	local nextCloser = string.find(text, ">", 2, true)
	if not nextCloser or (nextOpener and nextOpener < nextCloser) then
		return nil
	end

	--We have our starting tag (E.g. '<pre style="color:red">')
	local startingTag = string.sub(text, 1, nextCloser)
	--Now find our ending...
	local endStart, endStop
	if string.sub(startingTag, -2) ~= "/>" then --Not self-closing
		--Search a lowercased copy so the patterns stay simple; lowercasing
		--never changes the length, so positions are valid for `text` too
		local lowered = string.lower(text)
		if tagName == "pre" then --Looser restrictions for <pre>
			--no | in Lua patterns, so run both and keep whichever is first
			local exactStart, exactStop = string.find(lowered, "</pre>", nextCloser+1, true)
			local looseStart, looseStop = string.find(lowered, "</pre[ \t\n/][^<]*>", nextCloser+1)
			if exactStart and (not looseStart or exactStart < looseStart) then
				endStart, endStop = exactStart, exactStop
			else
				endStart, endStop = looseStart, looseStop
			end
		else
			endStart, endStop = string.find(lowered, "</"..tagName.."[ \t\n]*>", nextCloser+1)
		end
	end

	if not endStart then
		--Either self-closing (we are our own ending) or unterminated. The
		--content of the opening tag still needs escaping (also linter error!)
		return {
			Tag = tagName,
			Start = startingTag,
			Content = "", End = "",
			Length = #startingTag
		}
	end

	--Regular tag formation
	local tagContent = string.sub(text, nextCloser+1, endStart-1)
	local endingTag = string.sub(text, endStart, endStop)
	return {
		Tag = tagName,
		Start = startingTag,
		Content = tagContent,
		End = endingTag,
		Length = #startingTag + #tagContent + #endingTag
	}
end
--Like TestForNowikiTag but for <!-- -->
--Expects `text` to start with the comment; returns nil otherwise
--Returns a table with Start, End, Content and Length (the number of characters
--of `text` the comment covers). A comment with no ending covers all the text
local function TestForComment(text)
	local opener, closer = "<!--", "-->"
	if string.sub(text, 1, #opener) ~= opener then
		return nil
	end

	local closerAt = string.find(text, closer, 5, true)
	if not closerAt then --Consumes all text if not given an ending
		return {
			Start = opener, End = "",
			Content = string.sub(text, 5),
			Length = #text
		}
	end

	return {
		Start = opener, End = closer,
		Content = string.sub(text, 5, closerAt-1),
		Length = closerAt+2
	}
end

--[[ Implementation notes
The goal of this function is to escape all text that wouldn't be parsed if it
was preprocessed (see above implementation notes).

Using keepComments will keep all HTML comments instead of removing them. They
will still be escaped regardless to avoid processing errors
--]]
--Escapes every part of `text` that the preprocessor would not process
--(no-processing tags and HTML comments) using mw.text.nowiki
-- text: The wikitext to prepare
-- keepComments: If truthy, HTML comments are kept (with escaped content)
--  instead of being removed
--Returns the prepared text
local function PrepareText(text, keepComments)
	--Output is gathered in a list and joined once at the end, rather than
	--re-copying an ever-growing string on every step
	local pieces = {}
	while text ~= "" do
		local NextCheck = string.find(text, "<[NnSsPpMm!]") --Advance to the next potential tag we care about
		if not NextCheck then --Done
			pieces[#pieces+1] = text
			break
		end
		pieces[#pieces+1] = string.sub(text,1,NextCheck-1)
		text = string.sub(text, NextCheck) --text now starts at the "<"
		local Comment = TestForComment(text)
		if Comment then
			if keepComments then
				pieces[#pieces+1] = Comment.Start .. mw.text.nowiki(Comment.Content) .. Comment.End
			end
			text = string.sub(text, Comment.Length+1)
		else
			local Tag = TestForNowikiTag(text)
			if Tag then
				--Escape the inside of each half of the tag, keeping the < > (and </) intact
				local newTagStart = "<" .. mw.text.nowiki(string.sub(Tag.Start,2,-2)) .. ">"
				local newTagEnd = 
					Tag.End == "" and "" or --Respect no tag ending
					"</" .. mw.text.nowiki(string.sub(Tag.End,3,-2)) .. ">"
				local newContent = mw.text.nowiki(Tag.Content)
				pieces[#pieces+1] = newTagStart .. newContent .. newTagEnd
				text = string.sub(text, Tag.Length+1)
			else --Nothing special, move on...
				pieces[#pieces+1] = string.sub(text, 1, 1)
				text = string.sub(text, 2)
			end
		end
	end
	return table.concat(pieces)
end

--[=[ Implementation notes
This function is an alternative to Transcluder's getParameters which considers
the potential for a singular { or } or other odd syntax that %b doesn't like to
be in a parameter's value. Also theoretically faster as it does a singular pass
through the text instead of multiple gsub runs (though we shall see as this
slowly grows more complex as I theorise about it).

When handling the difference between {{ and {{{, mediawiki will attempt to match
as many sequences of {{{ as possible before matching a {{
E.g.
 {{{{A}}}} -> { {{{A}}} }
 {{{{{{{{Text|A}}}}}}}} -> {{ {{{ {{{Text|A}}} }}} }}
If there aren't enough triple braces on both sides, the parser will compromise
for a template interpretation.
E.g.
 {{{{A}} }} -> {{ {{ A }} }}

While there are technically concerns about things such as wikilinks breaking
template processing (E.g. {{[[}}]]}} doesn't stop at the first }}), it shouldn't
be our job to process inputs perfectly when the input has garbage ({ / } isn't
legal in titles anyways, so if something's unmatched in a wikilink, it's
guaranteed GIGO)

Setting dontEscape will prevent running the input text through EET. Avoid
setting this to true if you don't have to set it.

TODO: This entire "bounds" method of exclusion is seeming to be significantly expensive. This needs proper thought to fix

Returned values:
A table of all templates. Template data goes as follows:
 Text: The raw text of the template
 Name: The name of the template
 Args: A list of arguments
 Children: A list of immediate template children
--]=]
--Helper functions
--Number of characters covered by a {Start=, End=} pair (both ends inclusive)
local function boundlen(pair)
	local last, first = pair.End, pair.Start
	return last - first + 1
end

--Main function
--Finds every {{template}} in InputText
-- InputText: The wikitext to scan
-- dontEscape: If truthy, InputText is not run through PrepareText first and
--  the collected text is not decoded with mw.text.decode afterwards
--Returns a list of template tables ordered by where each template starts.
--Each template table has:
-- Text: The raw text of the template
-- Name: The (trimmed) name of the template
-- Args: Argument values by key. Unnamed arguments use "1", "2", ... as keys
--  and are not trimmed; named arguments are trimmed
-- ArgOrder: The argument keys in the order they were written
-- Children: The templates that were opened while this template was the
--  innermost open object
local function ParseTemplates(InputText, dontEscape)
	--Setup
	if not dontEscape then
		InputText = PrepareText(InputText)
	end
	local function finalise(text)
		if not dontEscape then
			return mw.text.decode(text)
		else
			return text
		end
	end
	--Turns a {Start=, End=, Type=} bounds table into a collector object that
	--gathers its own text, name and arguments as Step 3 feeds it characters
	local function CreateContainerObj(Container)
		Container.Text = ""
		Container.Args = {}
		Container.ArgOrder = {}
		Container.Children = {}
		Container.Name = nil
		Container._Value = nil
		Container._Key = nil
		Container._BeyondStart = false
		Container._LastIndex = 1
		--Handles a "|" or "=" that belongs directly to this object
		--internalcall is set when Clean simulates the final "|", in which case
		--the character is not part of the object's text
		function Container:HandleArgInput(character, internalcall)
			if not internalcall then
				self.Text = self.Text .. character
			end
			--Once a | or = has been seen we are past the opening brackets, so
			--any later { must be kept as part of a value
			self._BeyondStart = true
			if character == "=" then
				if self._Key or not self.Name then
					--Already inside a value, or still reading the name (where
					--= has no special meaning, E.g. {{=}}): keep it literal
					self._Value = (self._Value or "") .. character
				else
					self._Key = mw.text.trim(self._Value or "")
					self._Value = ""
				end
			else --"|" or "}"
				if not self.Name then
					self.Name = mw.text.trim(self._Value or "")
					self._Value = nil
				else
					self._Value = finalise(self._Value or "")
					if self._Key then
						self._Key = finalise(self._Key)
						self.Args[self._Key] = mw.text.trim(self._Value)
						table.insert(self.ArgOrder, self._Key)
					else
						local Key = tostring(self._LastIndex)
						self.Args[Key] = self._Value or ""
						table.insert(self.ArgOrder, Key)
						self._LastIndex = self._LastIndex + 1
					end
					self._Key = nil
					self._Value = nil
				end
			end
		end
		--Adds scanned text to both the raw text and the value being collected
		function Container:AppendText(text)
			self.Text = self.Text .. finalise(text)
			if not self._Value then
				self._Value = ""
			end
			self._BeyondStart = self._BeyondStart or (text ~= "{") --Ignore starting { in the kv manager
			if self._BeyondStart then
				self._Value = self._Value .. text
			end
		end
		--Finishes the object: stores the pending name/argument and removes the
		--collector's private fields and methods
		function Container:Clean()
			if self._Value then
				--Only remove this object's own closing brackets. Removing every
				--trailing } would also eat the closing brackets of a nested
				--object that ends the value (E.g. {{a|{{b}}}})
				local closer = "}}"
				if self.Type == "Variable" then
					closer = "}}}"
				elseif self.Type == "Wikilink" then
					closer = "]]"
				end
				if string.sub(self._Value, -#closer) == closer then
					self._Value = string.sub(self._Value, 1, -#closer-1)
				end
				self:HandleArgInput("|", true) --Simulate ending
			end
			self._Value = nil
			self._Key = nil
			self._BeyondStart = nil
			self._LastIndex = nil
			self.HandleArgInput = nil
			self.AppendText = nil
			self.Clean = nil
		end
		return Container
	end
	
	--Step 1: Find the bounds of all wikilinks on the page, which are stronger than templates (see implementation notes)
	local scannerPosition = 1
	local wikilinks = {}
	local openWikilinks = {}
	while true do
		local NextOpen = string.find(InputText, "%[%[", scannerPosition) or 9e9
		local NextClose = string.find(InputText, "%]%]", scannerPosition) or 9e9
		if NextOpen == NextClose then --Done (both 9e9)
			break
		end

		scannerPosition = math.min(NextOpen, NextClose)+2 --+2 to pass the [[ / ]]
		if NextOpen < NextClose then --Add a [[ to the pending wikilink queue
			table.insert(openWikilinks, NextOpen)
		else --Pair up the ]] to any available [[
			if #openWikilinks >= 1 then
				local start = table.remove(openWikilinks) --Pop the latest [[
				wikilinks[start] = {Start=start, End=NextClose+1, Type="Wikilink"} --Note the pair
			end
		end
	end
	
	--Step 2: Find the bounds of every valid template and variable ({{ and {{{)
	scannerPosition = 1
	local templates = {}
	local variables = {}
	local openBrackets = {}
	while true do
		local NextOpen, OEnd = string.find(InputText, "{{+", scannerPosition)
		local NextClose, CEnd = string.find(InputText, "}}+", scannerPosition)
		NextOpen = NextOpen or 9e9
		NextClose = NextClose or 9e9
		if NextOpen == NextClose then --Done (both 9e9)
			break
		end

		local BoundStart = math.min(NextOpen, NextClose) --Skip to next notable block
		local BoundEnd = math.min(OEnd or 9e9, CEnd or 9e9)
		scannerPosition = BoundStart --Get to the {{ / }} set
		if NextOpen < NextClose then --Add the {{+ set to the queue
			table.insert(openBrackets, {Start=BoundStart, End=BoundEnd})

		else --Pair up the }} to any available {{, accounting for {{{ / }}}
			local BracketCount = #string.match(InputText, "^}+", scannerPosition)
			while BracketCount >= 2 and #openBrackets >= 1 do
				local OpenSet = table.remove(openBrackets)
				if boundlen(OpenSet) >= 3 and BracketCount >= 3 then --We have a {{{variable}}} (both sides have 3 spare)
					variables[OpenSet.End-2] = {Start=OpenSet.End-2, End=scannerPosition+2, Type="Variable"} --Done like this to ensure chronological order
					BracketCount = BracketCount - 3
					OpenSet.End = OpenSet.End - 3
					scannerPosition = scannerPosition + 3

				else --We have a {{template}} (both sides have 2 spare, but at least one side doesn't have 3 spare)
					templates[OpenSet.End-1] = {Start=OpenSet.End-1, End=scannerPosition+1, Type="Template"} --Done like this to ensure chronological order
					BracketCount = BracketCount - 2
					OpenSet.End = OpenSet.End - 2
					scannerPosition = scannerPosition + 2
				end

				if boundlen(OpenSet) >= 2 then --Still has enough data left, leave it in
					table.insert(openBrackets, OpenSet)
				end
			end
		end
		scannerPosition = BoundEnd --Now move past the bracket set
	end
	
	--Step 3: Re-trace every object using their known bounds, collecting our parameters with (slight) ease
	scannerPosition = 1
	local activeObjects = {} --Stack of objects currently open, innermost last
	local finalObjects = {} --Templates that have been closed
	while true do
		local NNC, _, Character = string.find(InputText, "([{}%[%]|=])", scannerPosition) --NNC = NextNotableCharacter
		if not NNC then
			break
		end
		--Plain text before the notable character belongs to every open object
		local scannedContent = string.sub(InputText, scannerPosition, NNC-1)
		if scannedContent ~= "" then
			for _,Object in next,activeObjects do
				Object:AppendText(scannedContent)
			end
		end

		scannerPosition = NNC+1
		local LatestObject = activeObjects[#activeObjects] --Commonly needed object
		if Character == "{" or Character == "[" then --Possibly the start of a known object
			local Container = templates[NNC] or variables[NNC] or wikilinks[NNC]
			if Container then
				CreateContainerObj(Container)
				if LatestObject and Container.Type == "Template" then --Only templates count as children
					table.insert(LatestObject.Children, Container)
				end
				table.insert(activeObjects, Container)
			end
			for _,Object in next,activeObjects do
				Object:AppendText(Character)
			end
		
		elseif Character == "}" or Character == "]" then --Possibly the end of the innermost object
			for _,Object in next,activeObjects do
				Object:AppendText(Character)
			end
			if LatestObject and LatestObject.End == NNC then
				LatestObject:Clean()
				if LatestObject.Type == "Template" then
					table.insert(finalObjects, LatestObject)
				end
				activeObjects[#activeObjects] = nil
			end
			
		else --| or =
			for i = 1,#activeObjects-1 do --Insert the text for everything EXCEPT the latest object
				local Object = activeObjects[i]
				Object:AppendText(Character)
			end
			if LatestObject then
				LatestObject:HandleArgInput(Character)
			end
		end
	end
	
	--Step 4: Fix the order
	--Templates were collected in the order they closed (inner before outer);
	--sort them by where they start. Every template has a distinct Start
	table.sort(finalObjects, function(a, b)
		return a.Start < b.Start
	end)
	for _,Object in ipairs(finalObjects) do
		Object.Start = nil --Final cleanup
		Object.End = nil
		Object.Type = nil
	end
	
	--Finished, return
	return finalObjects
end

--The module's exported table
local p = {
	--Main entry points
	PrepareText = PrepareText,
	ParseTemplates = ParseTemplates,
	--Extra entry points, not really required
	TestForNowikiTag = TestForNowikiTag,
	TestForComment = TestForComment,
}

return p

--[==[ console tests

local s = [=[Hey!{{Text|<nowiki | ||>
Hey! }}
A</nowiki>|<!--AAAAA|AAA-->Should see|Shouldn't see}}]=]
local out = p.PrepareText(s)
mw.logObject(out)

local s = [=[<!--
Hey!
-->A]=]
local out = p.TestForComment(s)
mw.logObject(out); mw.log(string.sub(s, 1, out.Length))

local a = p.ParseTemplates([=[
{{User:Aidan9382/templates/dummy
|A|B|C {{{A|B}}} { } } {
|<nowiki>D</nowiki>
|<pre>E
|F</pre>
|G|=|a=|A  =  [[{{PAGENAME}}]]{{Text|1==<nowiki>}}</nowiki>}}|A B=Success}}
]=])
mw.logObject(a)

]==]