Jump to content

Module:Ko-translit/sandbox

From Wikipedia, the free encyclopedia
local p = {}
local get_args = require('Module:Arguments').getArgs
local m_data = require('Module:Ko-translit/data/sandbox')
local m_clean = require('Module:Ko-translit/clean')
local m_utils = require('Module:Ko-utils')
local find = mw.ustring.find
local gsub = mw.ustring.gsub

--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:
	ᄀ (U+1100)
	ᆨ (U+11A8)
	ㄱ (U+3131)
2. When dealing with decomposed Hangul,
	a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
	b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
		For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]

-- _ replaced with additional space that appears in romanization only
-- Takes the text to process and returns the processed string only:
-- gsub also returns the number of replacements, which is dropped here so it
-- cannot leak into a caller's argument list or multiple assignment.
local function roman_only_space(text)
	local spaced = gsub(text, "_", " ")
	return spaced
end

-- * replaced with additional hyphen that appears in romanization only
-- Takes the text to process and returns the processed string only:
-- gsub also returns the number of replacements, which is dropped here so it
-- cannot leak into a caller's argument list or multiple assignment.
local function roman_only_hyphen(text)
	local hyphenated = gsub(text, "%*", "-") -- * is a pattern magic character, hence %*
	return hyphenated
end

-- Applies the same gsub pass again and again until a pass replaces nothing.
-- Each pattern used with this helper spans a whole name, so a single pass can
-- only handle one position per name; repeated passes are needed to reach all
-- of them. The pass count stays capped at the length of the text, but the loop
-- now stops at the first pass that changes nothing (every later pass would be
-- an identical no-op), instead of always rescanning the text that many times.
local function gsub_until_stable(text, pattern, repl)
	for _ = 1, mw.ustring.len(text) do
		local hits
		text, hits = gsub(text, pattern, repl)
		if hits == 0 then
			break
		end
	end
	return text
end

-- Processes people names marked with % in the input.
-- Validates the name syntax (raising an error on invalid input), marks the
-- start of surname and given name with ^, inserts @ where ㄹ + {ㄷ, ㅅ, ㅈ}
-- tensification applies, and inserts U+FDD0 between the syllables of a given
-- name. Returns the text with the name markers removed.
local function parse_names(text)
	-- Note: internally uses 3 non-characters
	-- ﷐ (U+FDD0): used to interrupt and prevent euphonic changes between syllables; mostly for given name in RR
	-- ﷑ (U+FDD1): marks beginning of name
	-- ﷒ (U+FDD2): marks end of name

	local hanja_readings_final_L = "갈걸결골괄굴궐귤글길날녈놜눌닐달돌랄렬률말멸몰물밀발벌별불살설솔술슬실알얼열올왈울월율을일절졸줄즐질찰철촬출칠탈팔필할헐혈홀활훌휼흘힐"
	local hanja_readings_init_DSJ = "다단달담답당대댁덕도독돈돌동두둔득등사삭산살삼삽상새색생서석선설섬섭성세소속손솔송쇄쇠수숙순술숭쉬슬습승시식신실심십자작잔잠잡장재쟁저적전절점접정제조족존졸종좌죄주죽준줄중즉즐즙증지직진질짐집징"

	-- Change % to U+FDD1 and U+FDD2
	text = gsub(text, "%%([^%%]*)%%", "﷑%1﷒") -- When name is surrounded by %
	text = gsub(text, "%%([^%%]*)$", "﷑%1﷒") -- When no end %; end of string also terminates name mode

	-- Validate person name syntax
	if find(text, "﷑_?﷒") then
		error("Name cannot be empty")
	elseif find(text, "﷑[^﷑﷒]*[^가-힣_ ][^﷑﷒]*﷒") then
		-- Anything other than precomposed Hangul, _ and space is rejected here
		error("Invalid character in name")
	elseif find(text, "﷑ ") then
		error("Name cannot begin with space")
	elseif find(text, " ﷒") then
		error("Name cannot end with space")
	elseif find(text, "﷑[^﷒]*[ _][^﷒]*[ _][^﷒]*﷒") then
		error("No more than two components in name")
	elseif find(text, "﷑[가-힣]_") then
		error("No _ after one-syllable surname")
	elseif find(text, "﷑[^﷒]*[" .. hanja_readings_final_L .. "]@[" .. hanja_readings_init_DSJ .. "][^﷒]*﷒") then
		-- The tensification step below inserts this @ automatically.
		-- NOTE(review): the "Invalid character in name" check above already rejects
		-- any @ inside a name, so this branch looks unreachable - confirm which of
		-- the two checks reflects the intended behavior.
		error("Contains unnecessary @ in name")
	end

	-- Separate surname and given name
	-- If input contains _ or space, separate there
	text = gsub(text, "﷑([가-힣%$@]+)_﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑_([가-힣%$@]+)﷒", "﷑_^%1﷒") -- for mononym
	text = gsub(text, "﷑([가-힣%$@]+)[ _]([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")
	-- Otherwise, separate after first syllabic block
	text = gsub(text, "﷑([가-힣])﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑([가-힣])([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")

	-- Check invalid input after separating surname and given name
	-- NOTE(review): like the @ check above, this needs a $ or @ inside the name,
	-- which the earlier validation appears to reject already - confirm.
	if find(text, "﷑[^﷒]*_%^[%$@][^﷒]*﷒") then
		error("No @ or $ between surname and given name")
	end

	-- Tensification of ㄹ + {ㄷ, ㅅ, ㅈ} (needed for MR; e.g. 홍길동 [홍길똥], 을지문덕 [을찌문덕])
	-- Does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
	text = gsub_until_stable(text, "﷑([^﷒]*)([달돌살설솔술슬실절졸줄즐질])%2([^﷒]*)﷒", "﷑%1%2﷐%2%3﷒")
	-- Now apply tensification
	text = gsub_until_stable(text, "﷑([^﷒]*)([" .. hanja_readings_final_L .. "])([" .. hanja_readings_init_DSJ .. "])([^﷒]*)﷒", "﷑%1%2@%3%4﷒")

	-- Insert U+FDD0 in given name
	-- (needed for RR; e.g. 한복남 Han Boknam, not Han Bongnam)
	-- Will be removed after romanization is complete
	text = gsub_until_stable(text, "﷑([^﷒]*)_%^([^﷒]*)([가-힣%$@])([가-힣%$@])([^﷒]*)﷒", "﷑%1_^%2%3﷐%4%5﷒")

	-- Remove _ which was needed for surname-only string and mononym
	text = gsub(text, "_﷒", "﷒")
	text = gsub(text, "﷑_%^", "﷑^")

	text = gsub(text, "[﷑﷒]", "") -- Remove U+FDD1 and U+FDD2

	return text
end

-- Revised Romanization: entry point taking a frame
-- Extracts the arguments from the frame with get_args and hands them to p._rr
function p.rr(frame)
	local args = get_args(frame)
	return p._rr(args)
end

-- Converts args[1] to Revised Romanization and returns the result.
-- The steps are order-sensitive: clean and validate, parse names, decompose to
-- jamo, respell phonemically, romanize, then tidy up the output.
function p._rr(args)
	local str = args[1]

	-- Step 1: preprocessing (input cleaning, parsing people names, validation)
	str = m_clean.escaped_to_html(str) -- escaped special chars become placeholders
	m_clean.validate_input(str)
	str = parse_names(str) -- people names are marked with %
	str = m_clean.remove_links_and_markup(str)
	m_clean.validate_composed(str)
	str = gsub(str, "`", "") -- ` only matters for MR (voicing), so drop it
	str = m_clean.unwrap_enclosed_hangul(str)
	str = m_utils.decompose_hangul(str) -- Hangul syllables -> jamo
	m_clean.validate_decomposed(str)
	-- The two romanization-only markers touch different characters, so their order is irrelevant
	str = roman_only_space(str) -- _ -> space
	str = roman_only_hyphen(str) -- * -> hyphen

	-- Step 2: respell according to standard Korean pronunciation rules
	str = m_data.respell(str, "rr")

	-- Step 3: romanize the phonemic spelling
	str = m_data.romanize_rr(str)

	-- Step 4: postprocessing
	str = gsub(str, "﷐", "") -- U+FDD0 (name mode separator) is no longer needed
	-- A letter preceded by ^ is uppercased, then every remaining ^ is dropped
	str = gsub(str, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
	str = gsub(str, "%^", "")
	str = m_clean.final_processing(str)

	return str
end

-- McCune–Reischauer: entry point taking a frame
-- Extracts the arguments from the frame with get_args and hands them to p._mr
function p.mr(frame)
	local args = get_args(frame)
	return p._mr(args)
end

-- Converts args[1] to McCune–Reischauer and returns the result.
-- The steps are order-sensitive: clean and validate, parse names, decompose to
-- jamo, respell phonemically, romanize, then tidy up the output.
function p._mr(args)
	local text = args[1]

	-- Preprocessing (input cleaning, parsing people names, validation)
	text = m_clean.escaped_to_html(text) -- replacing escaped special chars with placeholders
	m_clean.validate_input(text)
	text = parse_names(text)
	text = gsub(text, "﷐", "") -- remove U+FDD0 (for RR only)
	text = m_clean.remove_links_and_markup(text)
	m_clean.validate_composed(text)
	text = m_clean.unwrap_enclosed_hangul(text)
	text = m_utils.decompose_hangul(text) -- decompose Hangul to jamo
	m_clean.validate_decomposed(text)
	text = roman_only_space(text) -- _ for additional space in romanization only

	-- Convert input to phonemic spelling
	text = m_data.respell(text, "mr")

	-- Romanize the phonemic spelling
	text = m_data.romanize_mr(text)

	-- Postprocessing
	text = roman_only_hyphen(text) -- * for additional hyphen in romanization only
	-- Replace ' with the HTML entity &#39; when it is followed by another ' or
	-- sits at the end of the string (to avoid possible clashes with bold/italic
	-- markup). The replacement has to be the entity: writing a literal
	-- apostrophe back would leave the text unchanged.
	text = gsub(text, "([hkpt])''", "%1&#39;'")
	text = gsub(text, "([hkpt])'$", "%1&#39;")
	-- ^ for capitalization: uppercase the marked letter, then drop every ^
	text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
	text = gsub(text, "%^", "")
	text = m_clean.final_processing(text)

	return text
end

-- Special char removal (escaped ones are kept): entry point taking a frame
-- Extracts the arguments from the frame with get_args and hands them to p._clean_hangul
function p.clean_hangul(frame)
	local args = get_args(frame)
	return p._clean_hangul(args)
end

-- Strips the non-escaped special chars ($ % * @ ^ _ `) from args[1].
-- Raises an error when the input has no Hangul, contains references, or has a
-- special char placed inside a single syllabic block.
function p._clean_hangul(args)
	local str = args[1]

	-- Character class shared by the validation patterns and the removal step
	local symbol = "[%$%%%*@%^_`]"
	-- Symbol between a leading consonant and a vowel
	local split_before_vowel = "[ᄀ-ᅟꥠ-ꥼ]" .. symbol .. "[ᅠ-ᆧힰ-ퟆ]"
	-- Symbol between a vowel (or a syllable without final consonant) and a final consonant
	local split_before_final = "[ᅠ-ᆧ가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히ힰ-ퟆ]" .. symbol .. "[ᆨ-ᇿퟋ-ퟻ]"

	-- Input validation, checked in this order
	if not m_utils.contains_hangul(str) then
		error("Input must contain Hangul")
	end
	if m_utils.contains_reference(str) then
		error("Input cannot contain references")
	end
	if find(str, split_before_vowel) or find(str, split_before_final) then
		-- a symbol must not split a single syllabic block
		error("Do not insert symbol within single syllabic block")
	end

	-- Removal of special chars
	str = m_clean.escaped_to_html(str) -- escaped special chars become placeholders first
	str = gsub(str, symbol, "") -- whatever special chars remain were not escaped
	str = m_clean.html_to_ascii(str) -- HTML encodings go back to ASCII
	str = mw.text.unstrip(str)
	return str
end

return p