-- Jump to content
--
-- Module:Make emoji zwj table
--
-- From Wikipedia, the free encyclopedia
-- This is an old revision of this page, as edited by Trappist the monk (talk | contribs) at 16:32, 9 October 2022 (create;). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
-- (diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)

--[[

This module creates an associative table of emoji code points that may follow a zero-width joiner character (U+200D).

The module reads a copy of the Unicode Emoji ZWJ Sequences for UTS (typically emoji-zwj-sequences.txt found in
https://unicode.org/Public/emoji/VV.V/ where VV.V is the Unicode version number).  The copy of the unicode data
file is held inside html comments in the module's /doc page.  From that file, the module extracts pairs of
<zwj> <emoji code point>. The module saves each unique code point, transformed as necessary, to build a new version
of emoji_t for use in Module:Citation/CS1/Configuration.

The module takes one positional parameter:
	{{#invoke:make emoji zwj table|main|<url>}}

<url> is the url that matches the Unicode data file.  Alas, Lua modules cannot read external data files so <url>
is merely used to document where the data may be found.

Use of this module is documented on its /doc page

]]

require('Module:No globals');

-- Names that main() appends to the comment on each generated line of emoji_t.
-- Keys are decimal forms of the hex values in U+xxxx (the hex form is shown beside each entry); values are the
-- names to print.  A code point that has no entry here gets an empty name in the output.
local emoji_names_t = {
	[9760] = 'skull and crossbones',											-- U+2620
	[9792] = 'female sign',														-- U+2640
	[9794] = 'male sign',														-- U+2642
	[9877] = 'staff of aesculapius',											-- U+2695
	[9878] = 'scales',															-- U+2696
	[9992] = 'airplane',														-- U+2708
	[10084] = 'heavy black heart',												-- U+2764
	[127752] = 'rainbow',														-- U+1F308
	[127806] = 'ear of rice',													-- U+1F33E
	[127859] = 'cooking',														-- U+1F373
	[127891] = 'graduation cap',												-- U+1F393
	[127908] = 'microphone',													-- U+1F3A4
	[127912] = 'artist palette',												-- U+1F3A8
	[127979] = 'school',														-- U+1F3EB
	[127981] = 'factory',														-- U+1F3ED
	[128102] = 'boy',															-- U+1F466
	[128103] = 'girl',															-- U+1F467
	[128104] = 'man',															-- U+1F468
	[128105] = 'woman',															-- U+1F469
	[128139] = 'kiss mark',														-- U+1F48B
	[128187] = 'personal computer',												-- U+1F4BB
	[128188] = 'briefcase',														-- U+1F4BC; Unicode name is BRIEFCASE (one word)
	[128295] = 'wrench',														-- U+1F527
	[128300] = 'microscope',													-- U+1F52C
	[128488] = 'left speech bubble',											-- U+1F5E8
	[128640] = 'rocket',														-- U+1F680
	[128658] = 'fire engine',													-- U+1F692
	[129309] = 'handshake',														-- U+1F91D
	[129455] = 'probing cane',													-- U+1F9AF
	[129456] = 'emoji component red hair',										-- U+1F9B0
	[129457] = 'emoji component curly hair',									-- U+1F9B1
	[129458] = 'emoji component bald',											-- U+1F9B2
	[129459] = 'emoji component white hair',									-- U+1F9B3
	[129466] = 'safety vest',													-- U+1F9BA
	[129468] = 'motorized wheelchair',											-- U+1F9BC
	[129469] = 'manual wheelchair',												-- U+1F9BD
	[129489] = 'adult',															-- U+1F9D1
	}


--[[--------------------------< M A I N >----------------------------------------------------------------------

Entry point for {{#invoke:make emoji zwj table|main|<url>}}.

Reads the unparsed wikitext of the current page (mw.title.getCurrentTitle()), finds every <code point> that
immediately follows a zero-width joiner (the text '200D <hex digits>'), and returns a <pre>-wrapped Lua table
constructor, local emoji_t = {...}, that holds one [<decimal code point>] = true line for each unique code point,
in ascending numerical order.  Each line carries a comment with the hex code point, its html entity, and the name
from emoji_names_t when there is one.

frame.args[1] (<url>) is copied into the header comment along with the '# Version:' and '# Date:' values found in
the page text.  Any of these three that is missing is replaced with 'unknown' so that the table is still rendered.

Returns an error message instead of the table when the content of the current page cannot be read.

]]

local function main (frame)
	local title_obj = mw.title.getCurrentTitle();
	local content = title_obj:getContent();										-- unparsed wikitext of the current page; nil when there is nothing to read

	if not content then															-- nothing to scan so say so instead of failing on a nil index
		return table.concat ({'<span class="error">make emoji zwj table: unable to read the content of ', title_obj.prefixedText, '</span>'});
	end

	local this_wiki = table.concat ({':', mw.language.getContentLanguage():getCode(), ':'});
	local tabs_15 = string.rep ('\t', 15);										-- for six-digit keys
	local tabs_16 = string.rep ('\t', 16);										-- for keys that have fewer than six digits
	local file_date = content:match ('# *Date: *(%d%d%d%d%-%d%d%-%d%d)') or 'unknown';	-- file date of the Unicode source
	local file_version = content:match ('# *Version: *([%d%.]+)') or 'unknown';	-- version of the Unicode source

	local url = mw.text.trim (frame.args[1] or '');								-- positional parameters are not trimmed; a stray newline would break the header comment
	if '' == url then
		url = 'unknown';
	end

	local seen_t = {};															-- set of decimal code points that we have already collected
	local code_points_t = {};													-- sequence of unique code points that follow U+200D; each member is {dec = <number>, hex = <text>}

	for code_point in content:gmatch ('200D (%x+)') do							-- find each <zwj> <code point> pair
		local code_point_dec = tonumber (code_point, 16);						-- convert hex code point to decimal for output table key

		if not seen_t[code_point_dec] then										-- keyed on the number so that '1f468' and '1F468' are the same code point
			seen_t[code_point_dec] = true;
			table.insert (code_points_t, {dec = code_point_dec, hex = code_point});
		end
	end

	table.sort (code_points_t, function (a, b) return a.dec < b.dec; end);		-- ascending numerical sort on decimal keys

	local out_t = {																-- final output goes here; begins with a header for this version of the table
		'<pre>-- list of emoji that use a zwj character (U+200D) to combine with another emoji',
		table.concat ({'-- from: ', url, '; version: ', file_version, '; ', file_date}),
		table.concat ({'-- table created by: ', this_wiki, title_obj.nsText, ':', title_obj.baseText}),
		table.concat ({'local emoji_t = {', tabs_16, '-- indexes are decimal forms of the hex values in U+xxxx'}),
		};

	for _, code_point_t in ipairs (code_points_t) do							-- build a line for each code point
		table.insert (out_t, table.concat ({
			'\t[',																-- open key markup
			code_point_t.dec,													-- <code point> in decimal
			'] = true,',														-- close key and assign it the value 'true'
			(100000 <= code_point_t.dec) and tabs_15 or tabs_16,				-- insert a bunch of tabs between the k/v pair and an associated comment
			'-- U+',															-- start the comment; prefix for the hex <code point>
			code_point_t.hex,													-- add the <code point> as it was first written in the source
			' &#x',																-- hex html entity prefix for <code point>
			code_point_t.hex,
			'; ',																-- finish the html entity
			emoji_names_t[code_point_t.dec] or '',								-- if we have a name for this code point, add the name; empty string else
			}));
	end

	table.insert (out_t, '\t}</pre>');											-- close the table constructor and the <pre> tag
	return frame:preprocess (table.concat (out_t, '\n'));						-- make a big string and done
end


--[[--------------------------< E X P O R T S >----------------------------------------------------------------
]]

local exports_t = {};															-- table handed back to whatever loads this module
exports_t.main = main;															-- the only exported function
return exports_t;