Jump to content

Module:X-SAMPA to IPA

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Erutuon (talk | contribs) at 00:45, 21 February 2017 (use descender and diacritic data to add descender-appropriate diacritics to symbols with descenders). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

-- Export table: the functions attached to p are what this module returns.
local p = {}

-- Local aliases for functions of the mw.ustring library (Scribunto's
-- string library that works on UTF-8 characters rather than bytes).
local U = mw.ustring.char -- builds a string from Unicode code points
-- NOTE(review): gsub is not referenced anywhere in the visible part of this file.
local gsub = mw.ustring.gsub
local sub = mw.ustring.sub
local find = mw.ustring.find
local length = mw.ustring.len

-- X-SAMPA -> IPA lookup table used by _XSAMPAtoIPA.
--
-- Keys are X-SAMPA sequences. In the double-quoted Lua string literals below,
-- backslashes and double quotes are escaped with a backslash:
-- \\ = \, \" = ". Apostrophes need no escaping inside double quotes.
-- _XSAMPAtoIPA only tries prefixes of up to 4 characters, so a key longer
-- than 4 characters would never be matched.
--
-- Each value is a table with:
--   [1]        the IPA replacement string;
--   descender  on a base symbol: true, marking it as a symbol with a descender;
--              on a diacritic: a string holding the alternative form that
--              _XSAMPAtoIPA emits instead of [1] when the nearest preceding
--              non-diacritic symbol is flagged descender = true;
--   diacritic  true for entries that _XSAMPAtoIPA skips over when it searches
--              backwards for that preceding symbol.
local data = {
	["a"] = { "a" },
	["b"] = { "b" },
	["b\\"] = { "ⱱ" }, -- not in official X-SAMPA; from http://www.kneequickie.com/kq/Z-SAMPA and used by Wiktionary
	["b_<"] = { "ɓ" },
	["c"] = { "c" },
	["d"] = { "d" },
	["d`"] = { "ɖ", descender = true },
	["d_<"] = { "ɗ" },
	["d`_<"] = { "ᶑ", descender = true }, -- not in official X-SAMPA; Wikipedia-specific
	["e"] = { "e" },
	["f"] = { "f" },
	["g"] = { "ɡ", descender = true  },
	["g_<"] = { "ɠ", descender = true },
	["h"] = { "h" },
	["h\\"] = { "ɦ" },
	["i"] = { "i" },
	["j"] = { "j", descender = true  },
	["j\\"] = { "ʝ", descender = true  },
	["k"] = { "k" },
	["l"] = { "l" },
	["l`"] = { "ɭ", descender = true  },
	["l\\"] = { "ɺ" },
	["m"] = { "m" },
	["n"] = { "n" },
	["n`"] = { "ɳ", descender = true  },
	["o"] = { "o" },
	["p"] = { "p", descender = true  },
	["p\\"] = { "ɸ", descender = true  },
	["q"] = { "q", descender = true  },
	["r"] = { "r" },
	["r`"] = { "ɽ", descender = true  },
	["r\\"] = { "ɹ" },
	["r\\`"] = { "ɻ", descender = true  },
	["s"] = { "s" },
	["s`"] = { "ʂ", descender = true  },
	["s\\"] = { "ɕ" },
	["t"] = { "t" },
	["t`"] = { "ʈ" }, -- NOTE(review): unlike d` and the other retroflex entries, this has no descender flag; confirm that is intended
	["u"] = { "u" },
	["v"] = { "v" },
	["v\\"] = { "ʋ" },
	["w"] = { "w" },
	["x"] = { "x" },
	["x\\"] = { "ɧ", descender = true  },
	["y"] = { "y", descender = true  },
	["z"] = { "z" },
	["z`"] = { "ʐ", descender = true  },
	["z\\"] = { "ʑ" },
	["A"] = { "ɑ" },
	["B"] = { "β", descender = true  },
	["B\\"] = { "ʙ" },
	["C"] = { "ç", descender = true  },
	["D"] = { "ð" },
	["E"] = { "ɛ" },
	["F"] = { "ɱ", descender = true  },
	["G"] = { "ɣ", descender = true  },
	["G\\"] = { "ɢ" },
	["G\\_<"] = { "ʛ" },
	["H"] = { "ɥ", descender = true  },
	["H\\"] = { "ʜ" },
	["I"] = { "ɪ" },
	["I\\"] = { "ɪ̈" },
	["J"] = { "ɲ", descender = true  },
	["J\\"] = { "ɟ" },
	["J\\_<"] = { "ʄ", descender = true  },
	["K"] = { "ɬ" },
	["K\\"] = { "ɮ", descender = true  },
	["L"] = { "ʎ" },
	["L\\"] = { "ʟ" },
	["M"] = { "ɯ" },
	["M\\"] = { "ɰ", descender = true  },
	["N"] = { "ŋ", descender = true  },
	["N\\"] = { "ɴ" },
	["O"] = { "ɔ" },
	["O\\"] = { "ʘ" },
	["P"] = { "ʋ" },
	["Q"] = { "ɒ" },
	["R"] = { "ʁ" },
	["R\\"] = { "ʀ" },
	["S"] = { "ʃ", descender = true  },
	["T"] = { "θ" },
	["U"] = { "ʊ" },
	["U\\"] = { "ʊ̈" },
	["V"] = { "ʌ" },
	["W"] = { "ʍ" },
	["X"] = { "χ", descender = true  },
	["X\\"] = { "ħ" },
	["Y"] = { "ʏ" },
	["Z"] = { "ʒ", descender = true  },
	["."] = { "." },
	["\""] = { "ˈ" },
	["%"] = { "ˌ" },
	["%\\"] = { "ᴙ" }, -- not in official X-SAMPA; from http://www.kneequickie.com/kq/Z-SAMPA and used by Wiktionary
	["'"] = { "ʲ", diacritic = true },
	[":"] = { "ː", diacritic = true },
	[":\\"] = { "ˑ", diacritic = true },
	["@"] = { "ə" },
	["@\\"] = { "ɘ" },
	["{"] = { "æ" },
	["}"] = { "ʉ" },
	["1"] = { "ɨ" },
	["2"] = { "ø" },
	["3"] = { "ɜ" },
	["3\\"] = { "ɞ" },
	["4"] = { "ɾ" },
	["5"] = { "ɫ" },
	["6"] = { "ɐ" },
	["7"] = { "ɤ" },
	["8"] = { "ɵ" },
	["9"] = { "œ" },
	["&"] = { "ɶ" },
	["?"] = { "ʔ" },
	["?\\"] = { "ʕ" },
	["<\\"] = { "ʢ" },
	[">\\"] = { "ʡ" },
	["^"] = { "ꜛ" },
	["!"] = { "ꜜ" },
	["!!"] = { "‼" }, -- not in official X-SAMPA
	["!\\"] = { "ǃ" },
	["|"] = { "|", descender = true  },
	["|\\"] = { "ǀ", descender = true  },
	["||"] = { "‖", descender = true  },
	["|\\|\\"] = { "ǁ", descender = true  },
	["=\\"] = { "ǂ", descender = true  },
	["-\\"] = { "‿", diacritic = true }, -- linking mark, liaison
	["__"] = { U(0x361) }, -- coarticulated; not in official X-SAMPA; used by Wiktionary
	["_:"] = { U(0x348) }, -- fortis, strong articulation; not in official X-SAMPA; used by Wiktionary
	["_\""] = { U(0x308), diacritic = true },
	["_+"] = { U(0x31F), descender = "˖", diacritic = true }, -- advanced
	["_-"] = { U(0x320), descender = "˗", diacritic = true }, -- retracted
	["_/"] = { U(0x30C), diacritic = true }, -- rising tone
	["_0"] = { U(0x325), descender = U(0x30A), diacritic = true }, -- voiceless
	["="] = { U(0x329), descender = U(0x30D), diacritic = true }, -- syllabic
	["_="] = { U(0x329), descender = U(0x30D), diacritic = true }, -- syllabic
	["_%\\"] = { U(0x1DFD) }, -- strident: not in official X-SAMPA; from http://www.kneequickie.com/kq/Z-SAMPA and used by Wiktionary
	["_>"] = { "ʼ", diacritic = true }, -- ejective
	["_?\\"] = { "ˤ", diacritic = true }, -- pharyngealized
	["_\\"] = { U(0x302), diacritic = true }, -- falling tone
	["_^"] = { U(0x32F), descender = U(0x311), diacritic = true }, -- non-syllabic
	["_}"] = { U(0x31A), diacritic = true }, -- no audible release
	["`"] = { U(0x2DE), diacritic = true }, -- r-coloring (colouring), rhotacization
	["~"] = { U(0x303), diacritic = true }, -- nasalization
	["_A"] = { U(0x318), diacritic = true }, -- advanced tongue root
	["_a"] = { U(0x33A), diacritic = true }, -- apical
	["_B"] = { U(0x30F), diacritic = true }, -- extra-low tone
	["_B_L"] = { U(0x1DC5), diacritic = true }, -- low rising tone
	["_c"] = { U(0x31C), diacritic = true }, -- less rounded
	["_d"] = { U(0x32A), diacritic = true }, -- dental
	["_e"] = { U(0x334), diacritic = true }, -- velarized or pharyngealized (dark)
	["<F>"] = { "↘" }, -- downstep
	["_F"] = { U(0x302), diacritic = true }, -- falling tone
	["_G"] = { "ˠ", diacritic = true }, -- velarized
	["_H"] = { U(0x301), diacritic = true }, -- high tone
	["_H_T"] = { U(0x1DC4), diacritic = true }, -- high rising tone
	["_h"] = { "ʰ", diacritic = true }, -- aspiration
	["_j"] = { "ʲ", diacritic = true }, -- palatalization
	["_k"] = { U(0x330), diacritic = true }, -- creaky voice, laryngealization, vocal fry
	["_L"] = { U(0x300), diacritic = true }, -- low tone
	["_l"] = { "ˡ", diacritic = true }, -- lateral release
	["_M"] = { U(0x304), diacritic = true }, -- mid tone
	["_m"] = { U(0x33B), diacritic = true }, -- laminal
	["_N"] = { U(0x33C), diacritic = true }, -- linguolabial
	["_n"] = { "ⁿ", diacritic = true }, -- nasal release
	["_O"] = { U(0x339), diacritic = true }, -- more rounded
	["_o"] = { U(0x31E), descender = "˕", diacritic = true }, -- lowered
	["_q"] = { U(0x319), diacritic = true }, -- retracted tongue root
	["<R>"] = { "↗" }, -- global rise
	["_R"] = { U(0x30C), diacritic = true }, -- rising tone
	["_R_F"] = { U(0x1DC8), diacritic = true }, -- rising falling tone
	["_r"] = { U(0x31D), diacritic = true }, -- raised
	["_T"] = { U(0x30B), diacritic = true }, -- extra-high tone
	["_t"] = { U(0x324), diacritic = true }, -- breathy voice, murmured voice, murmur, whispery voice
	["_v"] = { U(0x32C), diacritic = true }, -- voiced
	["_w"] = { "ʷ", diacritic = true }, -- labialized
	["_X"] = { U(0x306), diacritic = true }, -- extra-short
	["_x"] = { U(0x33D), diacritic = true }, -- mid-centralized
}

-- Converts a string of X-SAMPA into IPA using the `data` table.
--
-- The input is consumed left to right. At each position the longest prefix
-- (4, then 3, 2, 1 characters) that is a key of `data` is replaced by its IPA
-- value; a single character with no entry is copied through unchanged.
--
-- Some diacritics carry an alternative form in their `descender` field. That
-- form is used instead of the default when the nearest preceding
-- non-diacritic symbol is flagged `descender = true`. If there is no such
-- preceding symbol (the diacritic starts the text, or only diacritics come
-- before it), the default form is used.
--
-- @param text  string of X-SAMPA
-- @return      string of IPA
local function _XSAMPAtoIPA(text)
	local output = {}
	-- characteristics[n] holds the descender/diacritic flags of output[n].
	local characteristics = {}
	
	while length(text) > 0 do
		-- Candidate prefixes, longest first. Near the end of the text the
		-- longer candidates are simply the whole remaining text.
		local substrings = { sub(text, 1, 4), sub(text, 1, 3), sub(text, 1, 2), sub(text, 1, 1) }
		
		for i, substring in ipairs(substrings) do
			local IPA, descender, diacritic
			local result = data[substring]
			
			if result then
				IPA = result[1]
				descender = result.descender
				diacritic = result.diacritic
				if type(descender) == "string" then
					-- Walk back over any preceding diacritics to reach the
					-- base symbol. The `index > 0` bound keeps the scan from
					-- running off the front of the list when no base exists.
					local index = #characteristics
					while index > 0 and characteristics[index].diacritic do
						index = index - 1
					end
					local base = characteristics[index]
					if base and base.descender then
						IPA = descender
					end
				end
			elseif not substrings[i + 1] then
				-- Shortest candidate and still no match: pass it through.
				IPA = substring
			end
			
			if IPA then
				-- Candidate i is (at most) 5 - i characters long, so the
				-- unconsumed remainder starts at character 6 - i.
				text = sub(text, 6 - i)
				table.insert(output, IPA)
				table.insert(characteristics, { descender = descender, diacritic = diacritic } )
				break
			end
		end
	end
	
	return table.concat(output)
end

-- Entry point: converts X-SAMPA to IPA.
--
-- @param frame  either a frame object or a plain string of X-SAMPA. For a
--               frame, the text is the first positional argument of the
--               parent frame if present, otherwise the first positional
--               argument of the frame itself.
-- @return       string of IPA
-- Raises "No text provided" when no text can be found.
function p.X2IPA(frame)
	local text
	
	if type(frame) == "table" then
		-- getParent may be absent (plain table) or return nil (no parent
		-- frame), so guard both before reading the parent's arguments.
		local parent = frame.getParent and frame:getParent()
		text = parent and parent.args[1] or frame.args and frame.args[1]
	else
		text = frame
	end
	
	if type(text) ~= "string" then
		error("No text provided")
	end
	
	return _XSAMPAtoIPA(text)
end

-- Wraps text in an HTML span element carrying the class "IPA".
local function _IPAspan(text)
	local opening = "<span class=\"IPA\">"
	local closing = "</span>"
	return opening .. text .. closing
end

-- Builds one usage example: the template call written out inside <code>
-- tags, followed by "\n| " and the converted IPA wrapped in an IPA span.
--
-- @param frame  either a frame object or a plain string of X-SAMPA. For a
--               frame, the text is the first positional argument of the
--               parent frame if present, otherwise the first positional
--               argument of the frame itself.
-- @return       string of wikitext
-- Raises "No text provided" when no text can be found.
function p.example(frame)
	local text
	
	if type(frame) == "table" then
		-- getParent may be absent (plain table) or return nil (no parent
		-- frame), so guard both before reading the parent's arguments.
		local parent = frame.getParent and frame:getParent()
		text = parent and parent.args[1] or frame.args and frame.args[1]
	elseif type(frame) == "string" then
		text = frame
	end
	
	if not text then
		error("No text provided")
	end
	
	-- &#123; &#124; &#125; are the character references for { | }, so the
	-- template call is displayed literally.
	local output = { " <code>&#123;&#123;[[mw:Manual:Substitution|subst:]][[Template:x2i|x2i]]&#124;" }
	
	-- Text containing "=" is shown with an explicit "1=" in front of it.
	if find(text, "=") then
		table.insert(output, "1=")
	end
	table.insert(output, text)
	
	table.insert(output, "&#125;&#125;</code>")
	
	-- presumably starts the next cell of a wikitext table row; confirm against the calling template
	table.insert(output, "\n| ")
	-- text is already resolved to a string, so convert it directly.
	local IPA = _IPAspan(_XSAMPAtoIPA(text))
	table.insert(output, IPA)
	
	return table.concat(output)
end

return p