Module:WikitextParser
This module is rated as alpha. It is ready for third-party input, and may be used on a few pages to see if problems arise, but should be watched. Suggestions for new features or changes in their input and output mechanisms are welcome.
This module is a general-purpose wikitext parser. It's designed to be used by other Lua modules and shouldn't be called directly by templates.
Usage
First, require WikitextParser and get some wikitext to parse. For example:
local parser = require( 'Module:WikitextParser' )
local title = mw.title.getCurrentTitle()
local wikitext = title:getContent()
Then, use and combine the available methods. For example:
local sections = parser.getSections( wikitext )
for sectionTitle, sectionContent in pairs( sections ) do
local sectionFiles = parser.getFileLinks( sectionContent )
-- Do stuff
end
Methods
getLead
getLead( wikitext )
Returns the lead section from the given wikitext. The lead section is defined as everything before the first section title. If there's no lead section, an empty string will be returned.
getSections
getSections( wikitext )
Returns a table with the section titles as keys and the section contents as values. This method doesn't get the lead section (use getLead for that).
getSection
getSection( wikitext, sectionTitle )
Returns the content of the section with the given section title, including subsections. If you don't want subsections, use getSections instead. If the given section title appears more than once, only the first will be returned. If the section is not found, nil will be returned.
getSectionTag
getSectionTag( wikitext, tagName )
Returns the contents of the <section> tag with the given tag name (see Help:Labeled section transclusion). If the tag is not found, nil will be returned.
getLists
getLists( wikitext )
Returns a table with each value being a list (ordered or unordered).
getParagraphs
getParagraphs( wikitext )
Returns a table with each value being a paragraph. Paragraphs are defined as block-level elements that are not lists, templates, files, categories, tables or section titles.
getTemplates
getTemplates( wikitext )
Returns a table with each value being a template.
getTemplate
getTemplate( wikitext, templateName )
Returns the template with the given template name.
getTemplateName
getTemplateName( templateWikitext )
Returns the name of the given template. If the given wikitext is not recognized as that of a template, nil will be returned.
getTemplateParameters
getTemplateParameters( templateWikitext )
Returns a table with the parameter names as keys and the parameter values as values. For unnamed parameters, the keys are numerical. If the given wikitext is not recognized as that of a template, nil will be returned.
getTags
getTags( wikitext )
Returns a table with each value being a tag and its contents (like <div>, <gallery>, <ref>, <noinclude>). Tags inside tags will be ignored. If you're interested in getting them, run this method again for each of the returned tags.
getTagName
getTagName( tagWikitext )
Returns the name of the tag in the given wikitext. For example 'div', 'span', 'gallery', 'ref', etc.
getTagAttribute
getTagAttribute( tagWikitext, attribute )
Returns the value of an attribute in the given tag. For example the id of a div or the name of a reference.
getGalleries
getGalleries( wikitext )
Returns a table with each value being a gallery.
getReferences
getReferences( wikitext )
Returns a table with each value being a reference. This includes self-closing references (like <ref name="foo" />) as well as full references.
getTables
getTables( wikitext )
Returns a table with each value being a wiki table.
getTableAttribute
getTableAttribute( tableWikitext, attribute )
Returns the value of an attribute in the given wiki table. For example the id or the class.
getTable
getTable( wikitext, id )
Returns the wiki table with the given id. If not found, nil will be returned.
getTableData
getTableData( tableWikitext )
Returns a Lua table representing the data of the given wiki table.
getLinks
getLinks( wikitext )
Returns a Lua table with each value being a wiki link. For external links, use getExternalLinks instead.
getFileLinks
getFileLinks( wikitext )
Returns a Lua table with each value being a file link.
getFileName
getFileName( fileWikitext )
Returns the name of the given file. If the given wikitext is not recognized as that of a file, nil will be returned.
getCategories
getCategories( wikitext )
Returns a Lua table with each value being a category link.
getExternalLinks
getExternalLinks( wikitext )
Returns a Lua table with each value being an external link. For internal links, use getLinks instead.
See also
- Module:Excerpt - Main caller of this module
- mw:WikitextParser.js - Similar parser written in JavaScript, for use in gadgets, user scripts and other tools
-- Module:WikitextParser is a general-purpose wikitext parser
-- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser
-- Authors: User:Sophivorus, User:Certes & others
-- License: CC-BY-SA-4.0
local WikitextParser = {}

-- NOTE: the helpers below are declared BEFORE WikitextParser.getTags on purpose.
-- A `local function` is only in scope after its declaration, so a helper that
-- is declared after its first use resolves to a (nil) global at call time.

-- Helper function to escape a string for use in Lua patterns
-- @param str String to escape
-- @return Copy of str with every pattern magic character prefixed by '%'
local function escapeString( str )
	-- The parentheses drop gsub's second return value (the substitution count)
	return ( string.gsub( str, '[%^%$%(%)%.%[%]%*%+%-%?%%]', '%%%0' ) )
end

-- Helper function to remove a string from a text
-- @param text Text to remove the string from
-- @param str Literal string to remove (every occurrence is removed)
-- @return The text without the string
local function removeString( text, str )
	local pattern = escapeString( str )
	if #pattern > 9999 then -- strings longer than 10000 bytes can't be put into regexes
		-- Fall back to matching only the head and the tail of the string
		pattern = escapeString( mw.ustring.sub( str, 1, 999 ) ) .. '.-' .. escapeString( mw.ustring.sub( str, -999 ) )
	end
	return ( string.gsub( text, pattern, '' ) )
end

-- Helper function to convert a selector into a lookup map
-- @param selector One of:
--   nil: no filtering
--   number: a single position; a negative number blacklists that position
--   string: comma-separated list of numbers, min-max ranges or names,
--           for example '1,3-5' or 'div,span'; a leading '-' makes it a blacklist
--   table: keys are positions or names; if ANY value is false the table is a blacklist
-- @return Map from positions/names to true, for example { [1] = true, [3] = true, div = true },
--         or nil when no selector was given
-- @return Boolean indicating whether the selector should be treated as a blacklist or not
local function parseSelector( selector )
	if not selector then return nil, false end
	local map = {}
	local blacklist = false
	if type( selector ) == 'number' then
		if selector < 0 then
			selector = -selector
			blacklist = true
		end
		map[ selector ] = true
	elseif type( selector ) == 'string' then
		if string.sub( selector, 1, 1 ) == '-' then
			selector = string.sub( selector, 2 )
			blacklist = true
		end
		local ranges = mw.text.split( selector, ',' ) -- split ranges: '1,3-5' to {'1','3-5'}
		for _, range in ipairs( ranges ) do
			range = mw.text.trim( range )
			local min, max = mw.ustring.match( range, '^(%d+)%s*[-–—]%s*(%d+)$' ) -- '3-5' to min=3 max=5
			if not max then min, max = string.match( range, '^((%d+))$' ) end -- '1' to min=1 max=1
			if max then
				-- The captures are strings: convert them so the keys are numbers
				for i = tonumber( min ), tonumber( max ) do map[ i ] = true end
			else
				map[ range ] = true -- if we reach this point, the string had the form 'a,b,c' rather than '1,2,3'
			end
		end
	-- List has the form { [1] = false, [2] = true, ['c'] = false }
	-- Convert it to { [1] = true, [2] = true, ['c'] = true }
	-- But if ANY value is set to false, treat the list as a blacklist
	elseif type( selector ) == 'table' then
		for key, flag in pairs( selector ) do
			if flag == false then blacklist = true end
			map[ key ] = true
		end
	end
	return map, blacklist
end

-- Helper function to determine if a given element is filtered or not by a selector
-- @todo Should probably merge with parseSelector
-- @param selector Selector as accepted by parseSelector
-- @param count Position of the element (1 for the first element found, 2 for the second, ...)
-- @param value Name of the element, for example 'div'
-- @return true if the element is selected, false otherwise
local function isSelected( selector, count, value )
	local map, blacklist = parseSelector( selector )
	if not map then
		return true -- no selector: everything is selected
	end
	local listed = ( map[ count ] or map[ value ] ) and true or false
	if blacklist then
		return not listed
	end
	return listed
end

-- Get the requested tags from the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @param selector Tags to return, by name or by position, for example 'div', 'div,span,gallery', 2 or '1,3-5'.
--        Omit to return all tags.
-- @return Sequence of strings containing the wikitext of the requested tags.
-- @return Original wikitext minus the tags that were found but NOT requested.
function WikitextParser.getTags( wikitext, selector )
	local tags = {}
	local original = wikitext -- scan an unmodified copy, because wikitext is edited while looping
	local count = 0
	for tagStart, tagOpen in string.gmatch( original, '()(<[^/].->)' ) do
		-- The name ends at the first whitespace, '/' or '>' so '<br/>' gives 'br'
		local tagName = string.match( tagOpen, '^< ?([^%s/>]*)' )
		local tagText

		-- If we're in a self-closing tag, like <ref name="foo" /> or <br/> or <hr>
		if string.match( tagOpen, '<.-/>' ) or tagName == 'br' or tagName == 'hr' then
			tagText = tagOpen

		-- If we're in a tag that may contain others like it, like <div> or <span>
		elseif tagName == 'div' or tagName == 'span' then
			local tagEnd
			local position = tagStart + string.len( tagOpen ) - 1
			local depth = 1
			while depth > 0 do
				tagEnd = string.match( original, '</ ?' .. tagName .. ' ?>()', position )
				if tagEnd then
					tagEnd = tagEnd - 1
				else
					break -- unclosed tag
				end
				-- Look for another opening tag of the same kind before the closing tag just found
				position = string.match( original, '()< ?' .. tagName .. '[%s>]', position + 1 )
				if not position then
					position = tagEnd + 1
				end
				if position > tagEnd then
					depth = depth - 1
				else
					depth = depth + 1
				end
			end
			-- An unclosed <div> or <span> extends to the end of the text
			tagText = string.sub( original, tagStart, tagEnd or -1 )

		-- Else we're in tag that shouldn't contain others like it, like <math> or <strong>
		elseif tagName ~= '' then
			local tagEnd = string.match( original, '</ ?' .. escapeString( tagName ) .. ' ?>()', tagStart )
			if tagEnd then
				tagText = string.sub( original, tagStart, tagEnd - 1 )
			end
			-- If no closing tag is found, assume we matched something that isn't
			-- really a tag, like <!-- a comment --> or '1 < 2 > 0', and skip it
		end

		if tagText then
			count = count + 1
			if isSelected( selector, count, tagName ) then
				table.insert( tags, tagText )
			else
				wikitext = removeString( wikitext, tagText )
			end
		end
	end
	return tags, wikitext
end
return WikitextParser