Jump to content

Module:Ancient Greek

Permanently protected module
From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Erutuon (talk | contribs) at 23:02, 21 October 2017 (new version based on current versions of wikt:Module:grc-translit and wikt:Module:grc-utilities, hopefully faster). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

local p = {}

local m_table = require "Module:TableTools"
local checkType = require "libraryUtil".checkType

local ustring = mw.ustring
local U = ustring.char
local len = ustring.len
local sub = ustring.sub
local find = ustring.find
local gsub = ustring.gsub
local decompose = ustring.toNFD
local lower = ustring.lower
local upper = ustring.upper

-- Combining characters used in decomposed (NFD) Greek text and in the
-- Latin transliteration. Each is a single code point built with
-- mw.ustring.char.
local macron = U(0x304)				-- U+0304 combining macron
local breve = U(0x306)				-- U+0306 combining breve
local rough = U(0x314)				-- U+0314 combining reversed comma above (rough breathing)
local smooth = U(0x313)				-- U+0313 combining comma above (smooth breathing)
local diaeresis = U(0x308)			-- U+0308 combining diaeresis
local acute = U(0x301)				-- U+0301 combining acute accent
local grave = U(0x300)				-- U+0300 combining grave accent
local circumflex = U(0x342)			-- U+0342 combining Greek perispomeni (Greek circumflex)
local Latin_circumflex = U(0x302)	-- U+0302 combining circumflex accent (used in the output)
local subscript = U(0x345)			-- U+0345 combining Greek ypogegrammeni (iota subscript)
-- Pattern fragments matching a two-letter sequence: a vowel followed by
-- iota or by upsilon. is_diphthong anchors them with ^ and $.
local i_diphthong = "[ΑΕΗΟΥΩαεηουω][Ιι]"
local u_diphthong = "[ΑΕΗΟΩαεηοω][Υυ]"

-- Builds a lookup table ("set") from an array: every array item becomes
-- a key whose value is true. Traversal stops at the first nil item.
local function list_to_set(array)
	local members = {}
	for _, member in ipairs(array) do
		members[member] = true
	end
	return members
end

-- Set of the combining characters that tokenize attaches to the
-- preceding letter instead of treating as a token of their own.
local is_diacritic = list_to_set({
	macron,
	breve,
	rough,
	smooth,
	diaeresis,
	acute,
	grave,
	circumflex,
	subscript,
})

-- Returns a type checker bound to funcName: the returned function takes
-- the remaining checkType arguments and forwards them with funcName
-- placed first.
local function _check(funcName)
	local function bound_check(argIndex, arg, expectType, nilOk)
		return checkType(funcName, argIndex, arg, expectType, nilOk)
	end
	return bound_check
end

-- Transliteration table: maps a single lowercase Greek letter or a single
-- combining diacritic to its Latin replacement. p.transliterate passes it
-- to gsub as the replacement table (characters without an entry are left
-- as they are); p.old_transliterate iterates over it with pairs.
local tt = {
	-- Vowels (eta and omega get a combining macron to mark them as long)
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e" .. macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
	["ω"] = "o" .. macron,

	-- Consonants (both medial and final sigma become "s")
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",
	
	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",
	
	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- (they have no entry here, so they pass through to the output).
	-- Breve and the breathings are deleted; p.transliterate adds the "h"
	-- for a rough breathing separately.
	[breve] = '',
	[smooth] = '',
	[rough] = '',
	-- The Greek circumflex becomes the Latin combining circumflex.
	[circumflex] = Latin_circumflex,
	-- The iota subscript is written out as a full letter.
	[subscript] = 'i',
}

-- Decides whether the two-character string `chars` should be treated as
-- a diphthong. It must be a vowel followed by iota or upsilon, and
-- `next_chars` (the text right after it) must not start with a diaeresis,
-- optionally preceded by a macron or breve. Returns nil when `chars` is
-- not such a pair, otherwise a boolean.
local function is_diphthong(chars, next_chars)
	local is_vowel_pair = find(chars, '^' .. i_diphthong .. '$')
		or find(chars, '^' .. u_diphthong .. '$')
	if not is_vowel_pair then
		return nil
	end
	
	local diaeresis_pattern = "^[" .. macron .. breve .. "]?" .. diaeresis
	return not find(next_chars, diaeresis_pattern)
end

-- Appends `chars` to list[index] (creating the entry if it does not exist
-- yet) and returns `text` with its first len(chars) characters removed.
-- Raises an error if `chars` is nil or false.
local function add(list, index, chars, text)
	if not chars then
		error("The function add cannot act on a nil character.")
	end
	
	local existing = list[index]
	list[index] = existing and (existing .. chars) or chars
	
	local consumed = len(chars)
	return sub(text, consumed + 1)
end

--[=[
		This breaks a word into meaningful "tokens", which are
		individual letters or diphthongs with their diacritics.
		In this module it is called only by p.transliterate.
		
		text:		string, expected to be in decomposed (NFD) form so that
					diacritics are separate combining characters.
		Returns:	an array (keys 1..n, no holes) of token strings.
		
		A diacritic that occurs before any letter starts the first token
		instead of being stored at index 0, where it would not be part of
		the returned array.
--]=]
local function tokenize(text)
	local tokens = {}
	local i = 0
	while len(text) > 0 do
		local char = sub(text, 1, 1)
		if is_diacritic[char] then
			-- Attach the diacritic to the current token and remove it from
			-- the text. If there is no current token yet, open the first one.
			if i == 0 then
				i = 1
			end
			text = add(tokens, i, char, text)
		else
			local chars = sub(text, 1, 2)
			local next_chars = sub(text, 3, 4)
			-- Every letter or diphthong starts a new token.
			i = i + 1
			--[[	See if the next two characters form a diphthong and if so,
					add them to the new token. Remove them from the text.
					If there's a diaeresis, it will be immediately after
					the second of the two characters, or after a macron or breve.	]]
			if is_diphthong(chars, next_chars) then
				text = add(tokens, i, chars, text)
			else
				-- Add the single character to the new token. Remove it from the text.
				text = add(tokens, i, char, text)
			end
		end
	end
	
	-- Indices were assigned consecutively from 1, so the table is already
	-- a proper array and needs no compression.
	return tokens
end

--[=[
	Transliterates Ancient Greek text into Latin characters.
	
	text:		string of Greek text; it is decomposed (NFD) before processing.
	lang, sc:	not used by this function; kept so the signature stays
				compatible with existing callers.
	Returns:	the transliteration as a string.
--]=]
function p.transliterate(text, lang, sc)
	text = decompose(text)
	--[[
	if text == '῾' then
		return 'h'
	end
	--]]
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	--]]
	text = gsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	text = gsub(text, "·", ";")
	
	local tokens = tokenize(text)
	
	-- Now read the tokens. ipairs is required here: the output must follow
	-- the token order and the rules below look at neighbouring tokens,
	-- while pairs gives no ordering guarantee.
	local output = {}
	for i, token in ipairs(tokens) do
		-- Substitute each character in the token for its transliteration.
		-- Characters without an entry in tt are kept unchanged.
		local translit = gsub(lower(token), '.', tt)
		
		local next_token = tokens[i + 1]
		
		if token == 'γ' and next_token and find(next_token, '[κγχξ]') then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif find(token, '^[αΑ].*' .. subscript .. '$') then
			-- add macron to ᾳ
			translit = gsub(translit, '([aA])', '%1' .. macron)
		end
		
		-- tt deletes the rough breathing, so the <h> is added here:
		-- after rho, before anything else.
		if find(token, rough) then
			if find(token, '[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if find(translit, macron .. diaeresis .. "?" .. Latin_circumflex) then
			translit = gsub(translit, macron, '')
		end
		
		-- Capitalize first character of transliteration if the token
		-- contained an uppercase letter.
		if token ~= lower(token) then
			translit = gsub(translit, "^.", upper)
		end
		
		output[i] = translit
	end
	
	return table.concat(output)
end

--[=[
	Older transliteration routine based on plain substitution.
	
	text:		string of Greek text, or nil.
	Returns:	out, err
		out:	the transliterated text; "" if text is nil or empty; the
				unchanged text if it contains no Greek letter.
		err:	an HTML error message in the two failure cases, otherwise nil.
	
	The message is kept in a local and returned as a second value. It must
	not be assigned to the global name `error`: that would overwrite Lua's
	built-in error function for the rest of the module.
--]=]
function p.old_transliterate(text)
	local out, err
	if text == "" or text == nil then
		out = ""
		err = '<span style="color: red; font-size: 85%;">Please place Ancient Greek text in the first parameter of {{[[Template:grc-transl|grc-transl]]}}.</span>'
	elseif find(decompose(text), "[ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψω]") == nil then
		out = text
		err = '<span style="color: red; font-size: 85%;"><code>' .. text .. '</code> is not Ancient Greek; {{[[Template:grc-transl|grc-transl]]}} can only transliterate Ancient Greek text.</span>'
	else
		-- Context-dependent replacements first, while the letters are still Greek.
		text = gsub(text, "γ([γκξχ])", "n%1")
		text = gsub(text, "ρρ", "rrh")

		-- Every key of tt is a single character and no replacement is
		-- itself a key, so the order of these substitutions does not matter.
		for regex, repl in pairs(tt) do
			text = gsub(text, regex, repl)
		end

		-- Move an "h" that follows the first vowel of a diphthong in front of it.
		text = gsub(text, "([aáàāâeéèēêiíìīîoóòōôuúùūû])h([iíìîuúùû])", "h%1%2")
		text = gsub(
			text,
			"([AÁÀĀÂEÉÈĒÊIÍÌĪÎOÓÒŌÔUÚÙŪÛ])h([iíìîuúùû])",
			function(c, d) return "H" .. lower(c) .. d end
		)
		out = text
	end
	return out, err
end

--[=[
	Template entry point: transliterates the first argument and wraps the
	result in an italic <span>.
	
	frame:		the frame object passed by #invoke. The text is taken from
				frame.args[1] or, failing that, from the first argument of
				the parent (calling template) frame.
	Returns:	the HTML string with the transliteration, or a red error
				message if no text was supplied.
	
	The result does not depend on the global `error`: that name is Lua's
	built-in function and is always truthy, so testing it would make the
	success branch unreachable.
--]=]
function p.translit(frame)
	-- The parent frame may be absent, e.g. when the module is not called
	-- through a template.
	local current_frame = mw.getCurrentFrame()
	local parent = current_frame and current_frame:getParent()
	local parent_args = parent and parent.args or {}
	
	local text = frame.args[1] or parent_args[1]
	if text == nil or text == "" then
		return '<span style="color: red; font-size: 85%;">Please place Ancient Greek text in the first parameter of {{[[Template:grc-transl|grc-transl]]}}.</span>'
	end
	
	local transliteration = p.transliterate(text)
	return '<span title="Ancient Greek transliteration" class="Unicode" style="white-space:normal; text-decoration: none"><i>' .. transliteration .. '</i></span>'
end


return p