Module:Ko-translit/sandbox

local p = {}
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local m_data = require('Module:Ko-translit/data')

--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:
	ᄀ (U+1100)
	ᆨ (U+11A8)
	ㄱ (U+3131)
2. When dealing with decomposed Hangul,
	a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
	b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
		For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]
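
--[[
A minimal sketch of point 2 above (illustration only, not used by the module):
after decomposition with mw.ustring.toNFD, a precomposed syllable such as 간
becomes the jamo sequence ᄀ + ᅡ + ᆫ, so patterns are written against that form:
	local decomposed = mw.ustring.toNFD("간")
	find(decomposed, "ᄀ[ᅡ]")           -- matches: initial ᄀ followed by vowel ᅡ
	find(decomposed, "ᄀ[ᅡ][^ᆨ-ᇂ]")     -- no match: 간 has a final consonant (ᆫ)
	find(mw.ustring.toNFD("가"), "ᄀ[ᅡ]$") -- matches: 가 has no final consonant
--]]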

-- apply every pattern → replacement pair in a table to the text
local function gsub_iterate(text, tbl)
	for before, after in pairs(tbl) do
		text = gsub(text, before, after)
	end
	return text
end
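
-- Usage sketch (illustrative only): gsub_iterate applies every before → after pair in a
-- replacement table, e.g. gsub_iterate("foo bar", { foo = "baz", bar = "qux" }) --> "baz qux".
-- pairs() iterates in unspecified order, so the rule tables in Module:Ko-translit/data are
-- assumed to be order-independent.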

local function remove_links_and_markup(text)
	-- these either are unnecessary or interfere with assimilation

	-- remove bold/italic
	-- bold/italic could be allowed when it does not interfere with assimilation, but determining when that is safe adds complication for little practical gain
	text = gsub(text, "'''", "")
	text = gsub(text, "''", "")
	-- remove HTML tags (except br)
	text = gsub(text, "<[Bb][Rr] */?>", "&#10;")
	text = gsub(text, "</?[A-Za-z][^>]->", "")
	text = gsub(text, "&#10;", "<br>")
	-- remove wikilinks
	text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1")
	text = gsub(gsub(text, "%[%[", ""), "%]%]", "")
	-- remove refs
	-- text = gsub(text, "<ref.-</ref>", "")
	text = mw.text.killMarkers(text)
	-- remove templates
	text = gsub(text, "{{.-}}", "")

	return text
end
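
-- Illustrative example (not executed): remove_links_and_markup("[[대한민국|한국]]''은''<br>")
-- would yield "한국은<br>": the piped link keeps its display text, italic markup is dropped,
-- and <br> survives the tag removal via the &#10; placeholder round trip.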

local function disallow_invalid_input(text)
	-- very first step
	-- Hangul status: precomposed (한)

	-- input must contain Hangul
	if text == nil or text == "" or find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") == nil then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if find(text, "'\"`UNIQ--") or find(text, "-QINU`\"'") then
		error("Input cannot contain references")
	end

	-- if input contains Hangul not supported by RR and MR, change text to "N/A" and skip everything
	if find(text, "[ᄓ-ᅠᅶ-ᆧᇃ-ᇿ〮〯ㅤ-ㆎꥠ-꥿ힰ-퟿]") then
		text = "N/A"
		return text
	end

	-- replace escaped special chars with HTML-encoding placeholders
	text = gsub_iterate(text, m_data.escaped_to_html_enc)

	-- various validations of input
	if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then
		error("Do not input conjoining Hangul jamo directly")
	elseif find(text, "`%*") then
		error("Use *` instead of `*")
	elseif find(text, "@%*") then
		error("Use *@ instead of @*")
	elseif find(text, "%^[^가-힣]") then
		error("^ must be immediately followed by Hangul syllabic block")
	elseif find(text, "[^%*0-9A-Za-z]`") or find(text, "[^0-9A-Za-z]%*`") or find(text, "`[^가-깋다-딯바-빟자-짛]") then
		error("Found invalid sequence containing `")
	elseif find(text, "[^%*ㄹ가-힣]@") or find(text, "[^가-힣]%*@") or find(text, "%*@[^가-깋다-딯바-빟자-짛]") or find(text, "ㄹ@[^가-깋다-딯바-빟사-싷자-짛]") or find(text, "@[^가-깋다-딯라-맇바-빟사-싷아어에엔엘여요으은을음읍의이인일임입자-짛하-힣]") then
		error("Found invalid sequence containing @")
	elseif find(text, "[^가-힣]%$") or find(text, "%$[^야-얳여-옣요-욯유-윶윸-윻이-잍잏]") then
		error("Found invalid sequence containing $")
	elseif find(text, "%%$") then
		error("Remove final %")
	elseif find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	elseif find(text, "^[%$%*@_`]") or find(text, "^%%[^_가-힣]") or find(text, "[ _]%*") or find(text, "%*[ %*%-_]") or find(text, "%-%*") or find(text, "[﷐-﷒]") or find(text, "[%$%*@%^`]$") then
		error("Invalid input")
	end

	return text
end

-- verify that hangul input is valid
-- checked right after removing links and markup (before decomposing Hangul)
-- Hangul status: precomposed (한)
local function check_invalid_input(text)
	if find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	elseif find(text, "^[%$%*@_`]") or find(text, "[ _]%*") or find(text, "%*[ %*%-_]") or find(text, "%-%*") or find(text, "[%$%*@%^_`]$") then
		error("Invalid input")
	end

	return text
end

-- verify that @ and $ markers are validly placed with respect to the decomposed consonants
-- checked after decomposing Hangul
-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
local function verify_decomposed_consonants(text)
	if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*?﷐?@﷐?[ᄀᄃᄇᄉᄌ]") or find(text, "ᆰ%*?﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆲ﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]") or find(text, "ᆺ%*@[ᄀᄇ]") or find(text, "ᆺ%*?﷐?@﷐?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]") or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]﷐?@﷐?ᄅ") or find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]﷐?@﷐?ᄋ") or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]﷐?@﷐?ᄒ") then
		error("Found invalid sequence containing @")
	elseif find(text, "[ᅡ-ᅵ]﷐?%$") then
		error("Found invalid sequence containing $")
	end

	return text
end

-- processing people names
local function parse_name(text)
	-- Hangul status: precomposed (한)

	local hanja_readings_final_L = "갈걸결골괄굴궐귤글길날녈놜눌닐달돌랄렬률말멸몰물밀발벌별불살설솔술슬실알얼열올왈울월율을일절졸줄즐질찰철촬출칠탈팔필할헐혈홀활훌휼흘힐"
	local hanja_readings_init_DSJ = "다단달담답당대댁덕도독돈돌동두둔득등사삭산살삼삽상새색생서석선설섬섭성세소속손솔송쇄쇠수숙순술숭쉬슬습승시식신실심십자작잔잠잡장재쟁저적전절점접정제조족존졸종좌죄주죽준줄중즉즐즙증지직진질짐집징"

	-- note: internally uses 3 noncharacters
	-- ﷐ (U+FDD0): mostly for given name in RR
	-- ﷑ (U+FDD1): marks beginning of name
	-- ﷒ (U+FDD2): marks end of name

	-- change % to U+FDD1 and U+FDD2 (end of string also terminates name mode)
	text = gsub(text, "%%([^%%]*)%%", "﷑%1﷒")
	text = gsub(text, "%%([^%%]*)$", "﷑%1﷒")
	-- disallow invalid input for name
	if find(text, "﷑﷒") then
		error("Name cannot be empty")
	elseif find(text, "﷑[^﷑﷒]*[^가-힣_ ][^﷑﷒]*﷒") then
		error("Invalid character in name")
	elseif find(text, "﷑ ") then
		error("Name cannot begin with space")
	elseif find(text, " ﷒") then
		error("Name cannot end with space")
	elseif find(text, "﷑[^﷒]*[ _][^﷒]*[ _][^﷒]*﷒") then
		error("No more than two components in name")
	elseif find(text, "﷑[가-힣]_") then
		error("No _ after one-syllable surname")
	elseif find(text, "﷑[^﷒]*[" .. hanja_readings_final_L .. "]@[" .. hanja_readings_init_DSJ .. "][^﷒]*﷒") then
		error("Contains unnecessary @ in name") -- see below
	end
	-- separate surname and given name
	-- if input contains _ or space, separate there
	text = gsub(text, "﷑([가-힣%$@]+)_﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑_([가-힣%$@]+)﷒", "﷑_^%1﷒") -- for mononym
	text = gsub(text, "﷑([가-힣%$@]+)[ _]([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")
	-- otherwise, separate after first syllabic block
	text = gsub(text, "﷑([가-힣])﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑([가-힣])([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")
	-- check invalid input after separating surname and given name
	if find(text, "﷑[^﷒]*_%^[%$@][^﷒]*﷒") then
		error("No @ or $ between surname and given name")
	end
	-- tensification of ㄹ + {ㄷ, ㅅ, ㅈ} (needed for MR; e.g. 홍길동 [홍길똥], 을지문덕 [을찌문덕])
	-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
	for i = 1, mw.ustring.len(text) do
		text = gsub_iterate(text, m_data.name_rieul_dsj_tensification) -- rule table expected in Module:Ko-translit/data
	end
	-- now apply tensification
	for i = 1, mw.ustring.len(text) do
		text = gsub(text, "﷑([^﷒]*)([" .. hanja_readings_final_L .. "])([" .. hanja_readings_init_DSJ .. "])([^﷒]*)﷒", "﷑%1%2@%3%4﷒")
	end
	-- insert U+FDD0 in given name (needed for RR; e.g. 한복남 Han Boknam, not Han Bongnam)
	for i = 1, mw.ustring.len(text) do
		text = gsub(text, "﷑([^﷒]*)_%^([^﷒]*)([가-힣%$@])([가-힣%$@])([^﷒]*)﷒", "﷑%1_^%2%3﷐%4%5﷒")
	end
	-- remove _ which was needed for surname-only string and mononym
	text = gsub(text, "_﷒", "﷒")
	text = gsub(text, "﷑_%^", "﷑^")
	-- remove U+FDD1 and U+FDD2
	text = gsub(text, "[﷑﷒]", "")

	return text
end
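
-- Illustrative usage of the name syntax (expected results hedged, not verified here):
-- input such as "%홍길동" marks a personal name; parse_name splits it into a one-syllable
-- surname and a given name, inserts @ for the ㄹ + ㄷ tensification and ﷐ between the
-- given-name syllables, so that the converters below should produce roughly
-- "Hong Gildong" (RR) and "Hong Kiltong" (MR).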

--[[
pre-processing exceptions that apply to both RR and MR
IMPORTANT: Before adding an exception, be sure to check if it can ALWAYS be applied in ALL contexts.
	Good example: 싫증 → 실@증
	Bad example: 문자 → 문@자 (affects words like 방문자 (pronounced [방문자], not [방문짜]))
Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
--]]
local function parse_exceptions(text)
	return gsub_iterate(text, m_data.exceptions)
end
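
-- The entries of m_data.exceptions are assumed to be before → after pairs of Lua patterns
-- over decomposed jamo, applied via gsub_iterate; conceptually (shown precomposed only for
-- readability) an entry corresponds to a rewrite like 싫증 → 실@증, the example in the note above.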

-- processing misc characters that contain hangul
-- Hangul status: precomposed (한)
local function parse_enclosed_hangul(text)
	-- actually not very necessary, but these are also classified as Hangul chars in Unicode
	-- no distinction is made between parenthesized and circled chars
	return gsub_iterate(text, m_data.enclosed_hangul)
end

-- Convert to Revised Romanization
function p.rr(frame)
	local get_args = require('Module:Arguments').getArgs
	local args = get_args(frame)
	return p._rr(args)
end
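
-- Usage sketch: from wikitext this would typically be invoked as
-- {{#invoke:Ko-translit/sandbox|rr|한국}}, or from another module as
-- require('Module:Ko-translit/sandbox')._rr({'한국'}); the expected result would be
-- "hanguk" under standard Revised Romanization (illustrative, not verified against this code).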

function p._rr(args)
	local text = args[1]
	text = disallow_invalid_input(text)
	if text == "N/A" then
		return text
	end
	text = parse_name(text)
	text = remove_links_and_markup(text)
	text = check_invalid_input(text)
	text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
	text = parse_enclosed_hangul(text)
	text = gsub(text, "[가-힣]", mw.ustring.toNFD) -- decompose hangul
	text = verify_decomposed_consonants(text)
	text = parse_exceptions(text)

	text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
	-- $ for ㄴ-addition
	text = gsub(text, "([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2") -- 색연필 [생년필], 물엿 [물렫]
	text = gsub(text, "%$", "")
	-- for null-init consonant ㅇ (연음)
	text = gsub_iterate(text, m_data.null_init_ieung)

	-- for ㅎ
	text = gsub_iterate(text, m_data.process_hieut_rr)

	-- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
	text = gsub(text, "ᆰᄀ", "ᆯᄀ")
	-- neutralization of syl-final consonants
	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants)

	-- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
	-- other irregularities documented are automatically handled
	text = gsub_iterate(text, m_data.at_irregularities_rr)

	-- consonant assimilations
	text = gsub_iterate(text, m_data.consonant_assimilations_rr)

	-- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.drop_y)

	-- replace hangul vowels with romanized text
	text = gsub_iterate(text, m_data.vowels_rr)

	-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;")
	-- replace single consonants with romanized text
	text = gsub_iterate(text, m_data.single_consonants_rr)
	
	-- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam)
	text = gsub(text, "﷐", "")
	-- ^ for capitalization
	text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
	text = gsub(text, "%^", "")
	-- final error checking
	if find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") then
		error("Result contains Hangul; debugging required")
	end
	-- Convert html encodings back to ASCII
	text = gsub_iterate(text, m_data.html_enc_to_ascii)
	-- if result is nothing (e.g. when input is just ㅇ)
	if text == "" then
		text = "—"
	end

	return text
end

-- Convert to McCune–Reischauer
function p.mr(frame)
	local get_args = require('Module:Arguments').getArgs
	local args = get_args(frame)
	return p._mr(args)
end
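
-- Usage sketch: analogous to p.rr above, e.g. {{#invoke:Ko-translit/sandbox|mr|한국}};
-- the expected result would be roughly "han'guk" under standard McCune–Reischauer
-- (illustrative, not verified against this code).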

function p._mr(args)
	local text = args[1]
	text = disallow_invalid_input(text)
	if text == "N/A" then
		return text
	end
	text = parse_name(text)
	text = gsub(text, "﷐", "") -- remove U+FDD0 (only needed for RR; not needed for MR)
	text = remove_links_and_markup(text)
	text = check_invalid_input(text)
	text = parse_enclosed_hangul(text)
	text = gsub(text, "[가-힣]", mw.ustring.toNFD) -- decompose hangul
	text = verify_decomposed_consonants(text)
	text = parse_exceptions(text)

	text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희)
	text = gsub(text, "(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2") -- 여덟 + particle (tensification does not occur)
	-- $ for ㄴ-addition
	text = gsub(text, "([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2") -- 색연필 [생년필], 물엿 [물렫]
	text = gsub(text, "%$", "")
	-- for null-init consonant ㅇ (연음)
	text = gsub_iterate(text, m_data.null_init_ieung)
	-- for ㅎ
	-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness
	-- (syl-final ㅎ is for aspiration anyway)
	text = gsub_iterate(text, m_data.process_hieut_mr)
	-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
	-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
	text = gsub(text, "([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2")
	-- ㄺㄱ [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
	text = gsub(text, "ᆰᄀ", "ᆯ@ᄀ")
	-- @ for written 사이시옷 + ㄱ/ㅂ (should be done before neutralization of syl-final consonants)
	text = gsub(text, "ᆺ@ᄀ", "ᄁ")
	text = gsub(text, "ᆺ@ᄇ", "ᄈ")
	
	-- neutralization of syl-final consonants
	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants)
	
	-- @ for tensification, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
	-- other irregularities documented are automatically handled
	text = gsub_iterate(text, m_data.at_irregularities_mr)
	
	-- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants
	-- * is for additional hyphen in romanization only (voicing is retained after hyphen)
	text = gsub(text, "ᆫᄀ", "ᆫ'`ᄀ") -- n'g
	text = gsub(text, "([ᅡ-ᅵᆫᆯᆷᆼ])([ᄀᄃᄇᄌ])", "%1`%2")
	text = gsub(text, "([ᅡ-ᅵᆫᆯᆷᆼ])%*([ᄀᄃᄇᄌ])", "%1-`%2")
	text = gsub(text, "ᆯ%*ᄅ", "ᆯ-l") -- ㄹ-ㄹ should probably be l-l rather than l-r
	text = gsub(text, "%*", "-")
	text = gsub(text, "@", "")
	-- consonant assimilations
	text = gsub(text, "[ᆨᆼ][ᄂᄅ]", "ᆼᄂ")
	text = gsub(text, "ᆨᄆ", "ᆼᄆ")
	text = gsub(text, "ᆫᄅ", "ᆯᄅ")
	text = gsub(text, "ᆮ[ᄂᄅ]", "ᆫᄂ")
	text = gsub(text, "ᆮᄆ", "ᆫᄆ")
	text = gsub(text, "ᆯᄂ", "ᆯᄅ")
	text = gsub(text, "[ᆷᆸ][ᄂᄅ]", "ᆷᄂ")
	text = gsub(text, "ᆸᄆ", "ᆷᄆ")
	-- no {kkk, ttt, ppp, sss/ts/tss, ttch}
	text = gsub(text, "ᆨᄁ", "ᄁ")
	text = gsub(text, "ᆮᄄ", "ᄄ")
	text = gsub(text, "ᆸᄈ", "ᄈ")
	text = gsub(text, "ᆮ[ᄉᄊ]", "ᄊ")
	text = gsub(text, "ᆮᄍ", "ᄍ")
	-- other misc conversions
	text = gsub(text, "ᆯᄅ", "ᆯl")
	text = gsub(text, "ᆯᄒ", "rᄒ")
	text = gsub(text, "ᄉ[ᅱ]", "shᅱ")
	-- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.drop_y)

	-- replace hangul vowels with romanized text
	text = gsub_iterate(text, m_data.vowels_mr)

	-- ㅏ에 (aë) and ㅗ에 (oë)
	text = gsub(text, "([ao])ᄋe", "%1ë")
	-- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;")
	-- replace single consonants with romanized text
	text = gsub_iterate(text, m_data.single_consonants_mr)

	-- replace ' with &#39; when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup)
	text = gsub(text, "([hkpt])''", "%1&#39;'")
	text = gsub(text, "([hkpt])'$", "%1&#39;")
	-- ^ for capitalization
	text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
	text = gsub(text, "%^", "")
	-- final error checking
	if find(text, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") then
		error("Result contains Hangul; debugging required")
	end
	-- Convert html encodings back to ASCII
	text = gsub_iterate(text, m_data.html_enc_to_ascii)
	-- if result is nothing (e.g. when input is just ㅇ)
	if text == "" then
		text = "—"
	end

	return text
end

-- Removing special chars (except for escaped ones)
function p.clean_hangul(frame)
	local get_args = require('Module:Arguments').getArgs
	local args = get_args(frame)
	return p._clean_hangul(args)
end

function p._clean_hangul(args)
	local hangul = args[1]

	-- input must contain Hangul
	if hangul == nil or hangul == "" or find(hangul, "[ᄀ-ᇿ〮〯ㄱ-ㆎ㈀-㈞㉠-㉾ꥠ-꥿가-힣ힰ-퟿]") == nil then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if find(hangul, "'\"`UNIQ--") or find(hangul, "-QINU`\"'") then
		error("Input cannot contain references")
	end
	-- Replacing escaped special chars with placeholders
	local cleaned = hangul
	cleaned = gsub_iterate(cleaned, m_data.escaped_to_html_enc)
	-- Removing non-escaped special chars
	cleaned = gsub(cleaned, "[%$%%%*@%^_`]", "")
	-- Convert html encodings back to ASCII
	cleaned = gsub_iterate(cleaned, m_data.html_enc_to_ascii)
	-- Unstripping test
	cleaned = mw.text.unstrip(cleaned)

	return cleaned
end
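
-- Usage sketch (illustrative): p._clean_hangul({'색$연필'}) would be expected to return
-- "색연필", since non-escaped marker characters ($ % * @ ^ _ `) are stripped while the
-- Hangul is kept; escaped markers survive via the HTML-encoding placeholder round trip.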

return p