Jump to content

Module:WikitextParser

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Sophivorus (talk | contribs) at 14:26, 22 February 2024 (Created page with '-- Module:WikitextParser is a general-purpose wikitext parser -- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser -- Authors: User:Sophivorus, User:Certes & others -- License: CC-BY-SA-4.0 local WikitextParser = {} -- Get the requested tags from the given wikitext. -- @param wikitext Required. Wikitext to parse. -- @param selector Tags to return, for example 'div' or 'div,span,gallery'. Omit to return all tags. -- @retu...'). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)

-- Module:WikitextParser is a general-purpose wikitext parser
-- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser
-- Authors: User:Sophivorus, User:Certes & others
-- License: CC-BY-SA-4.0
local WikitextParser = {}

-- NOTE: The helper functions below are defined BEFORE their callers on purpose.
-- A 'local function' is only visible to code that comes after its declaration,
-- so calling it from an earlier function would look up a (nil) global instead.

-- Helper function to escape a string for use in Lua patterns
-- @param str String to escape
-- @return The string with every pattern magic character prefixed by '%'
local function escapeString( str )
	-- The extra parentheses drop the substitution count that gsub also returns
	return ( string.gsub( str, '[%^%$%(%)%.%[%]%*%+%-%?%%]', '%%%0' ) )
end

-- Helper function to remove a string from a text
-- @param text Text to remove the string from
-- @param str Literal string to remove (every occurrence is removed)
-- @return The text without the string
local function removeString( text, str )
	local pattern = escapeString( str )
	if #pattern > 9999 then -- strings longer than 10000 bytes can't be put into regexes
		-- Match on the first and last 999 characters only, with anything in between
		pattern = escapeString( mw.ustring.sub( str, 1, 999 ) ) .. '.-' .. escapeString( mw.ustring.sub( str, -999 ) )
	end
	-- The extra parentheses drop the substitution count that gsub also returns
	return ( string.gsub( text, pattern, '' ) )
end

-- Helper function to convert a selector into a lookup map
-- @param selector One of:
--        a number, for example 2 (a negative number like -2 makes it a blacklist)
--        a comma-separated string of numbers, min-max ranges or names, for example '1,3-5' or 'div,span'
--        (a leading '-' makes it a blacklist)
--        a table whose keys are numbers or names (if ANY value is false, it is a blacklist)
-- @return Map from the selected numbers or names to true, for example {[1]=true,[3]=true,[4]=true,[5]=true}
--         or nil if no selector was given
-- @return Boolean indicating whether the selector should be treated as a blacklist or not
local function parseSelector( selector )
	if not selector then return nil, false end

	local map = {}
	local blacklist = false

	if type( selector ) == 'number' then
		if selector < 0 then
			selector = -selector
			blacklist = true
		end
		map[ selector ] = true

	elseif type( selector ) == 'string' then
		if string.sub( selector, 1, 1 ) == '-' then
			selector = string.sub( selector, 2 )
			blacklist = true
		end
		local ranges = mw.text.split( selector, ',' ) -- split ranges: '1,3-5' to {'1','3-5'}
		for _, range in ipairs( ranges ) do
			range = mw.text.trim( range )
			local min, max = mw.ustring.match( range, '^(%d+)%s*[-–—]%s*(%d+)$' ) -- '3-5' to min=3 max=5
			if not max then min, max = string.match( range, '^((%d+))$' ) end -- '1' to min=1 max=1
			if max then
				-- The captures are strings, so convert them explicitly to get numeric keys
				for i = tonumber( min ), tonumber( max ) do map[ i ] = true end
			else
				map[ range ] = true -- if we reach this point, the string had the form 'a,b,c' rather than '1,2,3'
			end
		end

	-- List has the form { [1] = false, [2] = true, ['c'] = false }
	-- Convert it to { [1] = true, [2] = true, ['c'] = true }
	-- But if ANY value is set to false, treat the list as a blacklist
	elseif type( selector ) == 'table' then
		for i, v in pairs( selector ) do
			if v == false then blacklist = true end
			map[ i ] = true
		end
	end

	return map, blacklist
end

-- Helper function to determine if a given element is filtered or not by a selector
-- @todo Should probably merge with parseSelector
-- @param selector Selector as accepted by parseSelector
-- @param count Position of the element (1 for the first element found, 2 for the second, etc.)
-- @param value Name of the element, for example 'div'
-- @return true if the element is selected, false if not
local function isSelected( selector, count, value )
	local map, blacklist = parseSelector( selector )
	if not map then
		return true -- no selector means that everything is selected
	end
	local listed = map[ count ] or map[ value ] or false
	if blacklist then
		return not listed
	end
	return listed
end

-- Get the requested tags from the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @param selector Tags to return, for example 'div' or 'div,span,gallery'. Omit to return all tags.
-- @return Sequence of strings containing the wikitext of the requested tags.
-- @return Original wikitext minus the tags that were NOT requested.
function WikitextParser.getTags( wikitext, selector )
	local tags = {}
	local original = wikitext -- always search the untouched text, because wikitext changes as tags are removed
	local count = 0
	for tagStart, tagOpen in string.gmatch( original, '()(<[^/].->)' ) do
		-- The tag name ends at the first whitespace (space, tab, newline) or at the closing '>'
		local tagName = string.match( tagOpen, '< ?(.-)[%s>]' )
		-- The tag name may contain magic characters (for example '-'), so escape it before using it in patterns
		local closePattern = '</ ?' .. escapeString( tagName ) .. ' ?>()'
		local tagText, tagEnd

		-- If we're in a self-closing tag, like <ref name="foo" /> or <br/> or <hr>
		if string.match( tagOpen, '<.-/>' ) or tagName == 'br' or tagName == 'hr' then
			tagText = tagOpen

		-- If we're in a tag that may contain others like it, like <div> or <span>
		elseif tagName == 'div' or tagName == 'span' then
			local openPattern = '()< ?' .. escapeString( tagName ) .. '[%s>]'
			local position = tagStart + string.len( tagOpen ) - 1
			local depth = 1
			while depth > 0 do
				tagEnd = string.match( original, closePattern, position )
				if not tagEnd then
					break -- unclosed tag
				end
				tagEnd = tagEnd - 1
				-- Look for another opening tag of the same kind; if there's none, jump past the closing tag
				position = string.match( original, openPattern, position + 1 ) or tagEnd + 1
				if position > tagEnd then
					depth = depth - 1 -- the closing tag came first
				else
					depth = depth + 1 -- a nested opening tag came first
				end
			end
			-- An unclosed tag extends up to the end of the text
			tagText = string.sub( original, tagStart, tagEnd or -1 )

		-- Else we're in tag that shouldn't contain others like it, like <math> or <strong>
		else
			tagEnd = string.match( original, closePattern, tagStart )
			if tagEnd then
				tagText = string.sub( original, tagStart, tagEnd - 1 )
			end
			-- If no closing tag is found, assume we matched something that isn't
			-- a tag, like a comment or '1 < 2 > 0', and skip it
		end

		if tagText then
			count = count + 1
			if isSelected( selector, count, tagName ) then
				table.insert( tags, tagText )
			else
				wikitext = removeString( wikitext, tagText )
			end
		end
	end
	return tags, wikitext
end


return WikitextParser