-- Jump to content
--
-- Module:Sandbox/Erutuon
--
-- From Wikipedia, the free encyclopedia
-- This is an old revision of this page, as edited by Erutuon (talk | contribs) at 20:52, 3 July 2018 (shorter). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

local p = {}

local Unicode_data = require 'Module:Unicode data/sandbox'

-- Raises an error whose message is built with string.format.
--
-- Two call forms are accepted:
--   errorf(level, format, ...)  -- level is a number, as for error()
--   errorf(format, ...)         -- the error is blamed on errorf's caller
--
-- The level is counted relative to the function that calls errorf, so 1 is
-- added to skip errorf's own stack frame. This function never returns.
local function errorf(level, ...)
	-- type() returns a string; comparing it with the bare identifier `number`
	-- (an undefined global, i.e. nil) would never be true.
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Formats the arguments with string.format and hands the resulting message
-- to mw.log. Added to the mw table so it can be called as mw.logf(...).
mw.logf = function (...)
	local message = string.format(...)
	return mw.log(message)
end

-- Methods shared by every output buffer created with Output().
-- "join" is table.concat itself, so buffer:join(sep) concatenates the
-- collected strings.
local output_mt = { join = table.concat }
output_mt.__index = output_mt

-- Appends one string to the buffer and keeps the element count in "n".
function output_mt.insert(self, str)
	local position = self.n + 1
	self.n = position
	self[position] = str
end

-- also in [[Module:Unicode data/documentation functions]]
-- Appends the result of string.format(...) to the buffer.
function output_mt.insert_format(self, ...)
	local formatted = string.format(...)
	self:insert(formatted)
end

-- Creates an empty output buffer.
local function Output()
	local buffer = { n = 0 }
	return setmetatable(buffer, output_mt)
end


-- A character class (in pattern syntax) written as UTF-8 byte escapes.
-- The code points given in the comments are the decoded values of the
-- escaped byte sequences.
local Latn_pattern = '['
	.. '\n\32-\127'                       -- newline, U+0020-U+007F
	.. '\194\160-\194\172'                -- U+00A0-U+00AC
	.. '\195\128-\195\191'                -- U+00C0-U+00FF
	.. '\196\128-\197\191'                -- U+0100-U+017F
	.. '\198\128-\201\143'                -- U+0180-U+024F
	.. '\225\184\128-\225\187\191'        -- U+1E00-U+1EFF
	.. '\226\177\160-\226\177\191'        -- U+2C60-U+2C7F
	.. '\234\156\160-\234\159\191'        -- U+A720-U+A7FF
	.. '\234\172\176-\234\173\175'        -- U+AB30-U+AB6F
	.. '\239\172\128-\239\172\134'        -- U+FB00-U+FB06
	.. '\239\188\129-\239\188\188'        -- U+FF01-U+FF3C
	.. '–'                                -- en dash
	.. '—'                                -- em dash
	.. '«' .. '»'                         -- guillemets
	.. ']';

local get_codepoint = mw.ustring.codepoint

-- Returns a string containing every character from the first character of
-- "start" to the first character of "ending", inclusive, in code point
-- order. Returns nil when the ending code point is lower than the starting
-- one.
local function expand_range(start, ending)
	local first = get_codepoint(start)
	local last = get_codepoint(ending)
	if last < first then
		return nil
	end

	local chars = {}
	for codepoint = first, last do
		-- Position in the list is the offset from the first code point.
		chars[codepoint - first + 1] = mw.ustring.char(codepoint)
	end
	return table.concat(chars)
end

local fun = require "Module:Fun"
local m_table = require "Module:Table"

-- Metatable for counter tables, which is also its own metatable so that
-- calling script_to_count_mt() produces a new counter.
local script_to_count_mt = {}

-- Reading a missing key stores 0 under that key and returns 0, so callers
-- can increment entries without initialising them first.
function script_to_count_mt.__index(counter, key)
	counter[key] = 0
	return 0
end

-- Calling a table that has this metatable returns a new empty table whose
-- metatable is the table that was called.
function script_to_count_mt.__call(called)
	return setmetatable({}, called)
end

setmetatable(script_to_count_mt, script_to_count_mt)

-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint
-- each time it is called with an optional state and another value.
-- Looks up the script of every generated codepoint and returns a
-- comma-separated list of "script (count)" entries, ordered from the most
-- frequent script to the least frequent.
local function show_scripts(iterator, state, value)
	local tally = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		tally[script] = tally[script] + 1
	end

	-- Orders script keys by descending count.
	local function by_descending_count(script1, script2)
		return tally[script1] > tally[script2]
	end

	local function describe(count, script)
		return ("%s (%d)"):format(script, count)
	end

	local entries = fun.mapIter(
		describe,
		m_table.sortedPairs(tally, by_descending_count))
	return table.concat(entries, ", ")
end

-- Consumes a codepoint iterator (same calling convention as a generic "for")
-- and groups the codepoints by the script reported by
-- Unicode_data.lookup_script. Returns a table mapping each script to a set
-- of codepoints (codepoint -> true).
local function get_chars_in_scripts(iterator, state, value)
	local script_to_char_set = {}
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		local char_set = script_to_char_set[script]
		if not char_set then
			-- First codepoint seen for this script: start a new set.
			char_set = {}
			script_to_char_set[script] = char_set
		end
		char_set[codepoint] = true
	end

	return script_to_char_set
end

-- Renders a script -> codepoint-set map (as built by get_chars_in_scripts)
-- as text: one entry per script, each produced from "format" (default
-- "%s: %s") with the script and its nowiki-escaped characters, joined by
-- "separator" (default newline). Scripts and codepoints are visited through
-- m_table.sortedPairs.
local function print_char_set_map(script_to_char_set, format, separator)
	if not format then
		format = "%s: %s"
	end
	if not separator then
		separator = "\n"
	end

	local function codepoint_to_char(_, codepoint)
		return mw.ustring.char(codepoint)
	end

	local function describe_script(char_set, script)
		local chars = fun.mapIter(
			codepoint_to_char,
			m_table.sortedPairs(char_set))
		local escaped = mw.text.nowiki(table.concat(chars))
		return (format):format(script, escaped)
	end

	local entries = fun.mapIter(
		describe_script,
		m_table.sortedPairs(script_to_char_set))
	return table.concat(entries, separator)
end

-- Expands every character range in Latn_pattern into the individual
-- characters and returns them in a wrapping <div>, followed by the list of
-- scripts (with counts) that those characters belong to.
function p.show(frame)
	-- Strip the square brackets that enclose the character class.
	local class_body = Latn_pattern:gsub('%[(.-)%]', '%1')

	-- Find two UTF-8-encoded characters separated by hyphen-minus and replace
	-- them with the full run of characters (left as-is if expand_range
	-- returns nil).
	local expanded_pattern = class_body:gsub(
		'([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)',
		expand_range)

	-- Remove initial '\n ' to avoid creating unwanted pre element.
	local trimmed = expanded_pattern:gsub('^%s*', '')
	local script_summary = show_scripts(mw.ustring.gcodepoint(expanded_pattern))

	return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
		:format(trimmed, script_summary)
end

-- Reads a Unicode block name from args[arg] and returns the block info
-- table that Unicode_data.get_block_info provides for it.
--
-- args: argument table (e.g. frame.args)
-- arg:  key of the parameter that holds the block name
--
-- Raises an error (via errorf) if the parameter is missing or if no block
-- info is returned for the given name.
local function get_block_info_from_arg(args, arg)
	-- Use the requested parameter rather than always reading parameter 1.
	local block_name = args[arg]
		or errorf("Parameter %s is required", tostring(arg))
	
	local block_info = Unicode_data.get_block_info(block_name)
		or errorf("The block '%s' could not be found", block_name)
	
	return block_info
end

-- Interprets args[arg] with Module:Yesno. When the parameter is absent (or
-- otherwise falsy) it is returned unchanged and Module:Yesno is not loaded.
local function get_boolean_from_arg(args, arg)
	local value = args[arg]
	if not value then
		return value
	end
	local yesno = require "Module:Yesno"
	-- Parentheses keep the result to a single value.
	return (yesno(value))
end

-- Lists the scripts (with counts) of the codepoints in one Unicode block.
-- Parameter 1 is the block name; if parameter 2 is truthy according to
-- Module:Yesno, the result is prefixed with block_info[3] and a colon.
function p.scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	local first, last = block_info[1], block_info[2]
	local script_list = show_scripts(fun.range(first, last))

	if not show_block_name then
		return script_list
	end
	return ("%s: %s"):format(block_info[3], script_list)
end

-- Wraps a block name in a wikilink. Names that contain a space are linked
-- directly; names without one are linked to "<name> (Unicode block)" and
-- displayed as the bare name.
local function link_block_name(block_name)
	local has_space = block_name:find(' ', 1, true) ~= nil
	if not has_space then
		return ("[[%s (Unicode block)|%s]]"):format(block_name, block_name)
	end
	return ("[[%s]]"):format(block_name)
end

-- Builds a wikitable with one row per Unicode block, listing the scripts
-- found in the block, the number of codepoints per script and (in a title
-- attribute) a sample of the characters.
--
-- frame.args[1]: hexadecimal codepoint; blocks starting below it are skipped
--                (default 0).
-- frame.args[2]: hexadecimal codepoint; processing stops at the first block
--                that starts above it (default 0x4000).
--
-- Returns the wikitext of the table.
function p.scripts_in_blocks(frame)
	local output = Output()
	local start = frame.args[1] and tonumber(frame.args[1], 16) or 0
	local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000
	
	local script_data = mw.loadData "Module:Unicode data/scripts"
	local singles = script_data.singles
	local ranges = script_data.ranges
	
	-- Removes every own key of the table; the methods live in the metatable's
	-- __index table and are therefore unaffected.
	local function clear (self)
		for _, key in ipairs(m_table.keysToList(self, false)) do
			self[key] = nil
		end
	end
	
	-- Per-block map from script code to number of codepoints.
	local counts = {}
	setmetatable(counts, {
		__index = {
			increment = function(self, script_code, amount)
				self[script_code] = (self[script_code] or 0) + (amount or 1)
			end,
			clear = clear,
		}
	})
	-- Per-block map from script code to a list of sample codepoints
	-- (with the list length kept in the field "n").
	local codepoints_per_script = {}
	setmetatable(codepoints_per_script, {
		__index = {
			add = function(self, script_code, codepoint)
				self[script_code] = self[script_code] or { n = 0 }
				-- Stop collecting once the list is longer than 0x20 entries,
				-- and never collect control characters
				-- (U+0000-U+001F and U+0080-U+009F).
				if self[script_code].n <= 0x20
						and not (codepoint <= 0x9F and (codepoint >= 0x80
						or codepoint <= 0x1F)) then
					if self[script_code].n == 0x20 then
						-- The sample is full: finish it with "...".
						local period = ('.'):byte()
						for _ = 1, 3 do
							self[script_code].n = self[script_code].n + 1
							self[script_code][self[script_code].n] = period
						end
					else
						if script_code == "Zinh" then -- probably combining character
							-- Put U+25CC before the character as a base.
							self[script_code].n = self[script_code].n + 1
							self[script_code][self[script_code].n] = 0x25CC
						end
						self[script_code].n = self[script_code].n + 1
						self[script_code][self[script_code].n] = codepoint
					end
				end
			end,
			clear = clear,
		}
	})
	
	output:insert [[
{| class="wikitable"
|+ Scripts in each Unicode block
! block !! codepoints !! scripts
]]
	
	-- Each block entry is used as { first codepoint, last codepoint, name }.
	-- ipairs guarantees ascending index order, which the early "break" below
	-- relies on; pairs makes no promise about traversal order.
	-- NOTE(review): assumes the blocks data is a sequence sorted by
	-- codepoint - confirm against Module:Unicode data/blocks.
	for _, block in ipairs(mw.loadData "Module:Unicode data/blocks") do
		local codepoint = block[1]
		if codepoint > ending then break end
		
		if codepoint >= start then
			while codepoint <= block[2] do
				local script = singles[codepoint]
				local count
				if script then -- Codepoint is in "singles" map.
					counts:increment(script)
					codepoints_per_script:add(script, codepoint)
					codepoint = codepoint + 1
					count = 1 -- for potential future use
				else
					-- NOTE(review): "index" is presumably the position in
					-- "ranges" where the search ended - confirm against
					-- Unicode_data.binary_range_search.
					local range, index = Unicode_data.binary_range_search(codepoint, ranges)
					if range then -- Codepoint is in "ranges" array.
						count = 0
						script = range[3]
						-- Consume the rest of the range, but do not run past
						-- the end of the block.
						while codepoint <= range[2] and codepoint <= block[2] do
							count = count + 1
							codepoints_per_script:add(script, codepoint)
							codepoint = codepoint + 1
						end
						counts:increment(script, count)
					else -- Codepoint doesn't have data; it's Zzzz.
						-- Get range immediately above codepoint.
						while ranges[index][2] < codepoint do
							index = index + 1
						end
						
						count = 0
						script = "Zzzz"
						local range = ranges[index]
						-- Count codepoints until the next range, the end of
						-- the block or a codepoint listed in "singles".
						while codepoint < range[1] and codepoint <= block[2]
								and not singles[codepoint] do
							count = count + 1
							codepoint = codepoint + 1
						end
						counts:increment(script, count)
					end
				end
			end
			
			-- sortedPairs yields an iterator, so it is consumed with
			-- fun.mapIter, as in the other functions of this module.
			-- NOTE(review): confirm against Module:Fun that fun.map does not
			-- accept an iterator function.
			output:insert_format([[
|-
| %s
| U+%04X&ndash;U+%04X
| %s
]], link_block_name(block[3]), block[1], block[2],
				table.concat(
					fun.mapIter(
						function (count, script)
							return ('<abbr title="%s">%s</abbr> (<span title="%s">%d</span>)')
								:format(
									script_data.aliases[script], script,
									codepoints_per_script[script]
										and mw.text.nowiki(mw.ustring.char(
											unpack(codepoints_per_script[script])))
										or "",
									count)
						end,
						m_table.sortedPairs(
							counts,
							function (script1, script2)
								return counts[script1] > counts[script2]
							end)),
					", "))
		end
		
		-- mw.logObject(codepoints_per_script, block[3])
		-- Reset the per-block accumulators before the next block.
		counts:clear()
		codepoints_per_script:clear()
	end
	output:insert "|}"
	
	return output:join()
end

-- Lists the characters of one Unicode block grouped by script.
-- Parameter 1 is the block name; if parameter 2 is truthy according to
-- Module:Yesno, the result is prefixed with block_info[3] and a colon.
function p.chars_in_scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	local first, last = block_info[1], block_info[2]
	local char_sets = get_chars_in_scripts(fun.range(first, last))
	local script_char_set_map = print_char_set_map(char_sets)

	if not show_block_name then
		return script_char_set_map
	end
	return ("%s: %s"):format(block_info[3], script_char_set_map)
end

-- Scans the wikitext of a page for {{lang|code|...}} and {{lang-code|...}}
-- templates and returns the language codes found, joined by ", ".
--
-- frame.args[1]: title of the page to scan (default "English language").
--
-- Returns nothing (after logging a message) if no title object can be made
-- for the page name or if the page content is unavailable.
function p.search_for_language_codes(frame)
	local page_name = frame.args[1] or "English language"
	
	local success, title_object = pcall(mw.title.new, page_name)
	if not (success and title_object) then
		mw.logf("Could not make title object for '%s'.", page_name)
		return
	end
	
	local content = title_object:getContent()
	-- getContent yields nil when there is no page to read; without this
	-- check the gmatch call below would raise an error.
	if not content then
		mw.logf("Could not get content of '%s'.", page_name)
		return
	end
	
	-- Used as a set, so that each code is reported once.
	local language_codes = {}
	for lang_template in content:gmatch '{{lang[^}]+' do
		local template_name = lang_template:match('{{([^|}]+)')
		local language_code
		if template_name == 'lang' then
			-- {{lang|code|...}}: the code is the first parameter.
			language_code = lang_template:match '{{lang|([^|}]+)'
		elseif template_name:find '^lang-' then
			-- {{lang-code|...}}: the code is part of the template name.
			language_code = lang_template:match '{{lang-([^|}]+)'
		end
		if language_code then
			language_codes[language_code] = true
		end
	end
	
	return table.concat(m_table.keysToList(language_codes), ', ')
end

-- A previous draft, in [[Module:Lang/sandbox]]:
-- https://en.wikipedia.org/w/index.php?oldid=812819217
--
-- Splits an IETF language tag into its subtags.
--
-- tag: the language tag, e.g. "zh-Hant-TW".
--
-- Returns nil if "tag" is not a non-empty string. Otherwise returns a table
-- with the fields described inside the function.
function p.parse_IETF(tag)
	if not tag or tag == "" or type(tag) ~= "string" then
		return nil
	end
	
	-- This contains the special fields "matched_count" and "error".
	-- "matched_count" tracks the number of subtags, "error" indicates why the
	-- tag is invalid (if applicable).
	-- All other fields are subtags, and they appear in the tag in the following
	-- order:
	-- "language", "script", "region", "variant", "private_use", "invalid"
	-- "invalid" is the portion of the tag after the last valid subtag (minus a
	-- hyphen).
	local parsed_subtags = { matched_count = 0 }
	if not tag:find '^[A-Za-z0-9-]+$' then
		parsed_subtags.error = "invalid characters"
		parsed_subtags.invalid = tag
		return parsed_subtags
	end
	
	local subtags = mw.text.split(tag, "-")
	
	-- An array of patterns for each subtag, and a "type" field for the name
	-- of the subtag.
	-- The patterns are checked in order, and any of the subtags can be skipped.
	-- So, for example, the "language" subtag must precede the "script"
	-- subtag, but a tag may contain a "language" subtag, no "script" subtag
	-- and then a "region" subtag.
	-- A subtag type is used at most once unless it is marked "repeatable".
	-- If the full list of subtags has been iterated over, the remaining subtags
	-- must match the pattern for a private-use subtag, or the tag is invalid.
	local subtag_info = { -- can be put in data module
		{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case
		-- include extlang?
		{ "%a%a%a%a", type = "script" }, -- Ssss
		{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD
		{
			"%d%d%d%d", -- 4 digits
			"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
			type = "variant",
			-- A tag may carry several variant subtags; as before, each one
			-- is counted and the last one is stored.
			repeatable = true
		}
	}
	
	local index = 1
	local last_matched_subtag_i = 0
	for subtag_i, subtag in ipairs(subtags) do
		-- Named subtag_type so the built-in type() is not shadowed.
		local subtag_type
		while subtag_info[index] and not subtag_type do
			local info = subtag_info[index]
			-- Check each pattern for the subtag type at "index" in "subtag_info".
			for _, pattern in ipairs(info) do
				if subtag:find("^" .. pattern .. "$") then
					subtag_type = info.type
					break
				end
			end
			-- Go to the next item in subtag_info, both when nothing matched
			-- and when a non-repeatable type matched. Without the latter,
			-- the region in "en-US" would be matched as a second language
			-- subtag and overwrite "en".
			if not (subtag_type and info.repeatable) then
				index = index + 1
			end
		end
		
		if not subtag_type then
			-- The subtag fits none of the remaining types; it and everything
			-- after it is handled below.
			break
		end
		
		parsed_subtags[subtag_type] = subtag
		last_matched_subtag_i = subtag_i
		parsed_subtags.matched_count = parsed_subtags.matched_count + 1
	end
	
	if #subtags > parsed_subtags.matched_count then
		-- Not all subtags were matched. The unmatched tail end of the tag
		-- (after the subtag at the index last_matched_subtag_i) is a
		-- private-use subtag if it starts with "x". Otherwise, the tag is
		-- invalid.
		local suffix = table.concat(subtags, "-", last_matched_subtag_i + 1)
		if suffix:sub(1, 1) == "x" then
			parsed_subtags.private_use = suffix
			parsed_subtags.matched_count = parsed_subtags.matched_count + 1
		else
			parsed_subtags.invalid = suffix
			parsed_subtags.error = "invalid subtag"
		end
	end
	
	-- A missing language subtag takes precedence over any earlier error.
	if not parsed_subtags.language then
		parsed_subtags.error = "no language"
	end
	
	return parsed_subtags
end

return p