-- Module:Ko-translit/data
-- This module is rated as pre-alpha. It is unfinished, and may or may not be in active development. It should not be used from article namespace pages. Modules remain pre-alpha until the original editor (or someone who takes one over if it is abandoned for some time) is satisfied with the basic structure.
-- Data module for Module:Ko-translit.
-- Export table. Every field assigned below is a list of {pattern, replacement} string pairs;
-- the table itself is returned on the last line of this module.
-- NOTE(review): the pairs look like gsub arguments applied in listed order by Module:Ko-translit — confirm against the caller.
local p = {}
--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:
ᄀ (U+1100)
ᆨ (U+11A8)
ㄱ (U+3131)
2. When dealing with decomposed Hangul,
a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]
--[[
pre-processing that applies to both RR and MR
IMPORTANT: Before adding a replacement, be sure to check if it can ALWAYS be applied in ALL contexts.
Good example: 싫증 → 실@증
Bad example: 문자 → 문@자 (affects words like 방문자 (pronounced [방문자], not [방문짜]))

Each entry is {pattern, replacement}. Patterns use captures and bracket classes over decomposed
Hangul jamo; replacements refer to captures with %1/%2.
Several rules insert an "@" marker into the text rather than producing final output; other tables
in this module (the "@" rule tables) contain the patterns that consume or strip that marker.
Entry order matters within this table: specific word-level exceptions are listed before the
generic fallback for the same final consonant (e.g. the ㄺㄱ exceptions precede the catch-all
ㄺㄱ rule, and the palatalization rules precede the generic 연음 rules for the same finals).
--]]
p.preprocessing = {
-- _ for additional space in romanization only
{"_", " "},
-- for linguistic contexts
-- (a compatibility jamo written directly before an initial consonant is turned into the corresponding final consonant)
{"ㄴ([ᄀ-ᄒ])", "ᆫ%1"}, -- -ㄴ다
{"ㄹ([ᄀ-ᄒ])", "ᆯ%1"}, -- -ㄹ까, -ㄹ래
{"ㄹ@([ᄀᄃᄇᄉᄌ])", "ᆯ@%1"}, -- -ㄹ지
{"ㅁ([ᄀ-ᄒ])", "ᆷ%1"},
{"ㅂ([ᄀ-ᄒ])", "ᆸ%1"}, -- -ㅂ니다, -ㅂ시다
-- ㄴ-addition always occurs before 윷 and 잎
{"([ᆨ-ᇂ])ᄋ(ᅲᆾ)", "%1ᄂ%2"},
{"([ᆨ-ᇂ])ᄋ(ᅵᇁ)", "%1ᄂ%2"},
-- 곧이어 [고디어]
-- (two entries: one for a following non-final character, one for end of input; see the editing note on matching a syllable without a final consonant)
{"(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ][^ᆨ-ᇂ])", "%1ᄃ%2"},
{"(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ])$", "%1ᄃ%2"},
-- 싫증 [실쯩]
{"(ᄉ[ᅵ])ᆶ(ᄌ[ᅳ]ᆼ)", "%1ᆯ@%2"},
-- 여덟 + particle (tensification does not occur)
{"(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2"},
-- cases where ㄺㄱ is pronounced [ㄱㄲ]
-- not including very rarely used words such as 삼시욹, 안찱, 우줅거리다, etc.
{"([ᄃᄉᄐ]ᅡ)ᆰᄀ", "%1ᆨᄀ"}, -- 닭, 삵, 수탉/암탉
{"([ᄉᄒ]ᅳ)ᆰᄀ", "%1ᆨᄀ"}, -- 기슭, 흙
{"(ᄎ[ᅵ])ᆰᄀ", "%1ᆨᄀ"}, -- 칡
-- otherwise, ㄺㄱ is pronounced [ㄹㄲ] (usually verb/adjective stem ending in ㄺ + ending/suffix beginning with ㄱ (맑고 [말꼬], 긁개 [글깨]))
{"ᆰᄀ", "ᆯ@ᄀ"},
-- palatalization and ㅈ + -히-
-- Most groups from here to the 넓-/밟- rules repeat the same shapes so that the vowel is only
-- matched when it carries no final consonant or one of a few allowed finals:
-- vowel + ᆻ, vowel + one of ᆫᆯᆷᆸ, vowel + any non-final character, vowel at end of input.
{"ᆮᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄌ%1"}, -- 해돋이 [해도지]
{"ᆮᄋ(ᅵ[^ᆨ-ᇂ])", "ᄌ%1"},
{"ᆮᄋ(ᅵ)$", "ᄌ%1"},
{"[ᆮᆽ]ᄒ(ᅧᆻ)", "ᄎ%1"}, -- 굳히다 [구치다], 꽂히다 [꼬치다]
{"[ᆮᆽ]ᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1"},
{"[ᆮᆽ]ᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1"},
{"[ᆮᆽ]ᄒ([ᅧᅵ])$", "ᄎ%1"},
{"ᆴᄋ(ᅧᆻ)", "ᆯᄎ%1"}, -- 훑이다 [훌치다]
{"ᆴᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄎ%1"},
{"ᆴᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄎ%1"},
{"ᆴᄋ([ᅧᅵ])$", "ᆯᄎ%1"},
{"ᇀᄋ(ᅧᆻ)", "ᄎ%1"}, -- 붙이다 [부치다]
{"ᇀᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1"},
{"ᇀᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1"},
{"ᇀᄋ([ᅧᅵ])$", "ᄎ%1"},
-- {ㄵ, ㄺ, ㄼ} + -히-
{"ᆬᄒ(ᅧᆻ)", "ᆫᄎ%1"}, -- 앉히다 [안치다]
{"ᆬᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆫᄎ%1"},
{"ᆬᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆫᄎ%1"},
{"ᆬᄒ([ᅧᅵ])$", "ᆫᄎ%1"},
{"ᆰᄒ(ᅧᆻ)", "ᆯᄏ%1"}, -- 밝히다 [발키다]
{"ᆰᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄏ%1"},
{"ᆰᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄏ%1"},
{"ᆰᄒ([ᅧᅵ])$", "ᆯᄏ%1"},
{"ᆲᄒ(ᅧᆻ)", "ᆯᄑ%1"}, -- 넓히다 [널피다], 밟히다 [발피다]
{"ᆲᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄑ%1"},
{"ᆲᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄑ%1"},
{"ᆲᄒ([ᅧᅵ])$", "ᆯᄑ%1"},
-- cases where 넓- is pronounced [넙] before consonant
{"(ᄂ[ᅥ])ᆲ([ᄁᄄ-ᄈᄊᄍ-ᄒ])", "%1ᆸ%2"},
{"(ᄂ[ᅥ])ᆲ(ᄃ[ᅡ]ᄃ[ᅳ]ᆷ)", "%1ᆸ%2"}, -- 넓다듬이
{"(ᄂ[ᅥ])ᆲ(ᄃ[ᅮ]ᆼ)", "%1ᆸ%2"}, -- 넓둥글다
{"(ᄂ[ᅥ])ᆲ(ᄉ[ᅡ]ᆯᄆ[ᅮ]ᆫ)", "%1ᆸ%2"}, -- 넓살문
{"(ᄂ[ᅥ])ᆲ(ᄌ[ᅥᅮ]ᆨ)", "%1ᆸ%2"}, -- 넓적-, 넓죽-
-- 밟- is [밥] before consonant (except null-init consonant ㅇ)
-- (the first pattern matches any following character other than ᄋ; the second handles end of input)
{"(ᄇ[ᅡ])ᆲ([^ᄋ])", "%1ᆸ%2"},
{"(ᄇ[ᅡ])ᆲ$", "%1ᆸ"},
-- ㄵ, ㄼ, ㄾ cause tensification of following consonant
-- do not add ㄻ; does not always cause tensification (굶기다 [굼기다], 삶조차 [삼조차])
{"([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2"},
-- automatic 절음 법칙
-- (the first four entries are exceptions handled directly; the remaining entries insert @ between
-- a complex or non-plain final consonant and a following ㅇ-initial syllable, excluding the
-- syllables listed in each trailing comment)
{"(ᄋ[ᅥ])ᆹᄋ(ᅢ[ᆫᆯᆷᆸᆻ])", "%1ᆸᄉ%2"}, -- except 없애다 [업쌔다]
{"(ᄋ[ᅥ])ᆹᄋ(ᅢ[^ᆨ-ᇂ])", "%1ᆸᄉ%2"},
{"(ᄋ[ᅥ])ᆹᄋ(ᅢ)$", "%1ᆸᄉ%2"},
{"(ᄆ[ᅡᅥ])ᆺᄋ(ᅵᆻ)", "%1ᄉ%2"}, -- except 맛있다 and 멋있다 which are usually pronounced [마싣따] and [머싣따] respectively
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅡᅥᅧ][ᆨ-ᆺᆼ-ᇂ])", "%1@%2"}, -- except 아, 았, 어, 었, 여, 였
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅦ][ᆨ-ᆪᆬ-ᆮᆰ-ᇂ])", "%1@%2"}, -- except 에, 엔, 엘
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅭᅴ][ᆨ-ᇂ])", "%1@%2"}, -- except 요, 의 (w/o final consonant)
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅳᅵ][ᆨ-ᆪᆬ-ᆮᆰ-ᆶᆹ-ᇂ])", "%1@%2"}, -- except 으, 은, 을, 음, 읍, 이, 인, 일, 임, 입
{"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅢ-ᅤᅨ-ᅬᅮ-ᅲ])", "%1@%2"},
-- $ for ㄴ-addition
-- (a literal $ between a final consonant and ㅇ + y-vowel/ㅣ becomes ㄴ; every other $ is then removed)
{"([ᆨ-ᇂ])%$ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2"}, -- 색연필 [생년필], 물엿 [물렫]
{"%$", ""},
-- for null-init consonant ㅇ (연음)
-- (the final consonant moves into the empty initial slot; clusters keep their first member as the final)
{"ᆨᄋ", "ᄀ"},
{"ᆩᄋ", "ᄁ"},
{"ᆪᄋ", "ᆨᄉ"},
{"ᆬᄋ", "ᆫᄌ"},
{"ᆮᄋ", "ᄃ"},
{"[ᆯᆶ]ᄋ", "ᄅ"},
{"ᆰᄋ", "ᆯᄀ"},
{"ᆱᄋ", "ᆯᄆ"},
{"ᆲᄋ", "ᆯᄇ"},
{"ᆳᄋ", "ᆯᄉ"},
{"ᆴᄋ", "ᆯᄐ"},
{"ᆵᄋ", "ᆯᄑ"},
{"ᆸᄋ", "ᄇ"},
{"ᆹᄋ", "ᆸᄉ"},
{"ᆺᄋ", "ᄉ"},
{"ᆻᄋ", "ᄊ"},
{"ᆽᄋ", "ᄌ"},
{"ᆾᄋ", "ᄎ"},
{"ᆿᄋ", "ᄏ"},
{"ᇀᄋ", "ᄐ"},
{"ᇁᄋ", "ᄑ"},
{"ᇂᄋ", "ᄋ"}, -- silent; 좋아 [조아]
-- convert ㅎ combinations
-- (final ㄶ/ㅀ/ㅎ + plain ㄱ/ㄷ/ㅂ/ㅈ becomes the aspirated initial, keeping ㄴ/ㄹ as the final where present)
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness (syl-final ㅎ is for aspiration anyway)
{"ᆭᄀ", "ᆫᄏ"},
{"ᆭᄃ", "ᆫᄐ"},
{"ᆭᄇ", "ᆫᄑ"},
{"ᆭᄌ", "ᆫᄎ"},
{"ᆶᄀ", "ᆯᄏ"},
{"ᆶᄃ", "ᆯᄐ"},
{"ᆶᄇ", "ᆯᄑ"},
{"ᆶᄌ", "ᆯᄎ"},
{"ᇂᄀ", "ᄏ"},
{"ᇂᄃ", "ᄐ"},
{"ᇂᄇ", "ᄑ"},
{"ᇂᄌ", "ᄎ"}
}
-- MR only: rules intended to run while syllable-final consonants are still un-neutralized
p.before_neutralizing_syl_final_consonants_mr = {
	-- finals ㄵ/ㄶ, ㄼ/ㄾ/ㅀ and ㅎ turn a following ㅅ into ㅆ (the final is reduced to ㄴ, ㄹ or dropped)
	{ "[ᆬᆭ]ᄉ", "ᆫᄊ" },
	{ "[ᆲᆴᆶ]ᄉ", "ᆯᄊ" },
	{ "ᇂᄉ", "ᄊ" },
	-- written 사이시옷 (final ㅅ) + @ marker + ㄱ/ㅂ collapses into the tense consonant
	{ "ᆺ@ᄀ", "ᄁ" },
	{ "ᆺ@ᄇ", "ᄈ" },
}
-- Neutralization: every syllable-final consonant (or cluster) is reduced to one of ㄱ, ㄴ, ㄷ, ㄹ, ㅁ, ㅂ
p.neutralize_syl_final_consonants = {
	{ "[ᆩᆪᆰᆿ]", "ᆨ" }, -- ㄲ, ㄳ, ㄺ, ㅋ → ㄱ
	{ "[ᆬᆭ]", "ᆫ" }, -- ㄵ, ㄶ → ㄴ
	{ "[ᆺᆻᆽᆾᇀᇂ]", "ᆮ" }, -- ㅅ, ㅆ, ㅈ, ㅊ, ㅌ, ㅎ → ㄷ
	{ "[ᆲᆳᆴᆶ]", "ᆯ" }, -- ㄼ, ㄽ, ㄾ, ㅀ → ㄹ
	{ "ᆱ", "ᆷ" }, -- ㄻ → ㅁ
	{ "[ᆵᆹᇁ]", "ᆸ" }, -- ㄿ, ㅄ, ㅍ → ㅂ
}
-- Resolves the @ marker in two situations:
--   * 절음 법칙: final ㄱ/ㄷ/ㄹ/ㅂ + @ + null initial ㅇ → the final moves into the initial slot
--   * ㄴ + @ + ㄹ pronounced [ㄴㄴ]
-- other irregularities documented are automatically handled
p.at_irregularities = {
	{ "ᆨ@ᄋ", "ᄀ" },
	{ "ᆮ@ᄋ", "ᄃ" }, -- 웃어른 [우더른]
	{ "ᆯ@ᄋ", "ᄅ" },
	{ "ᆸ@ᄋ", "ᄇ" },
	{ "ᆫ@ᄅ", "ᆫᄂ" }, -- 음운론 [으문논]
}
-- RR only: final ㄱ/ㄷ/ㅂ + @ + ㅎ is merged into the aspirated initial (romanized k/t/p);
-- the last entry then deletes every @ marker that is still left
p.at_irregularities_additional_rr = {
	{ "ᆨ@ᄒ", "ᄏ" },
	{ "ᆮ@ᄒ", "ᄐ" },
	{ "ᆸ@ᄒ", "ᄑ" },
	{ "@", "" },
}
-- MR only: marks ㄱ, ㄷ, ㅂ, ㅈ that become voiced by prefixing them with a backtick (`)
-- voicing context here = after a vowel or after final ㄴ, ㄹ, ㅁ, ㅇ
p.gdbj_mr = {
	-- ㄴ + ㄱ gets an apostrophe so the result reads n'g
	{ "ᆫᄀ", "ᆫ'`ᄀ" },
	{ "([ᅡ-ᅵᆫᆯᆷᆼ])([ᄀᄃᄇᄌ])", "%1`%2" },
	-- * is for additional hyphen in romanization only (voicing is retained after hyphen)
	{ "([ᅡ-ᅵᆫᆯᆷᆼ])%*([ᄀᄃᄇᄌ])", "%1-`%2" },
}
-- Assimilation between a (neutralized) final consonant and the following initial consonant
p.consonant_assimilations = {
	{ "[ᆨᆼ][ᄂᄅ]", "ᆼᄂ" }, -- ㄱ/ㅇ + ㄴ/ㄹ → ㅇㄴ
	{ "ᆨᄆ", "ᆼᄆ" }, -- ㄱ + ㅁ → ㅇㅁ
	{ "ᆫᄅ", "ᆯᄅ" }, -- ㄴ + ㄹ → ㄹㄹ
	{ "ᆮ[ᄂᄅ]", "ᆫᄂ" }, -- ㄷ + ㄴ/ㄹ → ㄴㄴ
	{ "ᆮᄆ", "ᆫᄆ" }, -- ㄷ + ㅁ → ㄴㅁ
	{ "ᆯᄂ", "ᆯᄅ" }, -- ㄹ + ㄴ → ㄹㄹ
	{ "[ᆷᆸ][ᄂᄅ]", "ᆷᄂ" }, -- ㅁ/ㅂ + ㄴ/ㄹ → ㅁㄴ
	{ "ᆸᄆ", "ᆷᄆ" }, -- ㅂ + ㅁ → ㅁㅁ
}
-- MR only: extra assimilations on top of the shared ones
p.consonant_assimilations_additional_mr = {
	-- a final that would merely double the following tense consonant is dropped,
	-- so no {kkk, ttt, ppp, sss/ts/tss, ttch} appears in the output
	{ "ᆨᄁ", "ᄁ" },
	{ "ᆮᄄ", "ᄄ" },
	{ "ᆸᄈ", "ᄈ" },
	{ "ᆮ[ᄉᄊ]", "ᄊ" },
	{ "ᆮᄍ", "ᄍ" },
	-- other misc conversions; these already emit Latin letters for part of the match
	{ "ᆯᄅ", "ᆯl" }, -- the initial ㄹ after final ㄹ is written l
	{ "ᆯᄒ", "rᄒ" }, -- final ㄹ before ㅎ is written r
	{ "ᄉ[ᅱ]", "shᅱ" }, -- ㅅ before ㅟ is written sh
}
-- After ㅈ, ㅉ, ㅊ a y-glide vowel is replaced by its plain counterpart (the y is dropped)
p.drop_y = {
	{ "([ᄌ-ᄎ])ᅣ", "%1ᅡ" }, -- ㅑ → ㅏ
	{ "([ᄌ-ᄎ])ᅤ", "%1ᅢ" }, -- ㅒ → ㅐ
	{ "([ᄌ-ᄎ])ᅧ", "%1ᅥ" }, -- ㅕ → ㅓ
	{ "([ᄌ-ᄎ])ᅨ", "%1ᅦ" }, -- ㅖ → ㅔ
	{ "([ᄌ-ᄎ])ᅭ", "%1ᅩ" }, -- ㅛ → ㅗ
	{ "([ᄌ-ᄎ])ᅲ", "%1ᅮ" }, -- ㅠ → ㅜ
}
-- RR vowel table: each class holds the conjoining jamo and its compatibility-jamo twin
p.vowels_rr = {
	{ "[ᅡㅏ]", "a" },
	{ "[ᅢㅐ]", "ae" },
	{ "[ᅣㅑ]", "ya" },
	{ "[ᅤㅒ]", "yae" },
	{ "[ᅥㅓ]", "eo" },
	{ "[ᅦㅔ]", "e" },
	{ "[ᅧㅕ]", "yeo" },
	{ "[ᅨㅖ]", "ye" },
	{ "[ᅩㅗ]", "o" },
	{ "[ᅪㅘ]", "wa" },
	{ "[ᅫㅙ]", "wae" },
	{ "[ᅬㅚ]", "oe" },
	{ "[ᅭㅛ]", "yo" },
	{ "[ᅮㅜ]", "u" },
	{ "[ᅯㅝ]", "wo" },
	{ "[ᅰㅞ]", "we" },
	{ "[ᅱㅟ]", "wi" },
	{ "[ᅲㅠ]", "yu" },
	{ "[ᅳㅡ]", "eu" },
	{ "[ᅴㅢ]", "ui" },
	{ "[ᅵㅣ]", "i" },
}
-- MR vowel table: same layout as the RR one, but ㅓ/ㅡ-based vowels use breve letters (ŏ, ŭ)
p.vowels_mr = {
	{ "[ᅡㅏ]", "a" },
	{ "[ᅢㅐ]", "ae" },
	{ "[ᅣㅑ]", "ya" },
	{ "[ᅤㅒ]", "yae" },
	{ "[ᅥㅓ]", "ŏ" },
	{ "[ᅦㅔ]", "e" },
	{ "[ᅧㅕ]", "yŏ" },
	{ "[ᅨㅖ]", "ye" },
	{ "[ᅩㅗ]", "o" },
	{ "[ᅪㅘ]", "wa" },
	{ "[ᅫㅙ]", "wae" },
	{ "[ᅬㅚ]", "oe" },
	{ "[ᅭㅛ]", "yo" },
	{ "[ᅮㅜ]", "u" },
	{ "[ᅯㅝ]", "wŏ" },
	{ "[ᅰㅞ]", "we" },
	{ "[ᅱㅟ]", "wi" },
	{ "[ᅲㅠ]", "yu" },
	{ "[ᅳㅡ]", "ŭ" },
	{ "[ᅴㅢ]", "ŭi" },
	{ "[ᅵㅣ]", "i" },
}
-- RR consonant table
-- classes mix initial jamo, final jamo and compatibility jamo wherever they share a spelling;
-- compatibility-only clusters (ㄳ, ㄵ, ...) are spelled out letter by letter
p.single_consonants_rr = {
	{ "[ᄀㄱ]", "g" },
	{ "[ᄁㄲ]", "kk" },
	{ "ㄳ", "ks" },
	{ "[ᄂᆫㄴ]", "n" },
	{ "ㄵ", "nj" },
	{ "ㄶ", "nh" },
	{ "[ᄃㄷ]", "d" },
	{ "[ᄄㄸ]", "tt" },
	{ "[ᄅㄹ]", "r" },
	{ "ᆯ", "l" }, -- final ㄹ
	{ "ㄺ", "lg" },
	{ "ㄻ", "lm" },
	{ "ㄼ", "lb" },
	{ "ㄽ", "ls" },
	{ "ㄾ", "lt" },
	{ "ㄿ", "lp" },
	{ "ㅀ", "lh" },
	{ "[ᄆᆷㅁ]", "m" },
	{ "[ᄇㅂ]", "b" },
	{ "[ᄈㅃ]", "pp" },
	{ "ㅄ", "ps" },
	{ "[ᄉㅅ]", "s" },
	{ "[ᄊㅆ]", "ss" },
	{ "[ᄋㅇ]", "" }, -- null initial: nothing is written
	{ "ᆼ", "ng" }, -- final ㅇ
	{ "[ᄌㅈ]", "j" },
	{ "[ᄍㅉ]", "jj" },
	{ "[ᄎㅊ]", "ch" },
	{ "[ᄏᆨㅋ]", "k" }, -- includes final ㄱ
	{ "[ᄐᆮㅌ]", "t" }, -- includes final ㄷ
	{ "[ᄑᆸㅍ]", "p" }, -- includes final ㅂ
	{ "[ᄒㅎ]", "h" },
}
-- MR consonant table
p.single_consonants_mr = {
	-- consonants carrying the backtick voicing marker get their voiced spelling first
	{ "`ᄀ", "g" },
	{ "`ᄃ", "d" },
	{ "`ᄇ", "b" },
	{ "`ᄌ", "j" },
	-- any backtick still present is removed
	{ "`", "" },
	-- unmarked consonants
	{ "[ᄀᆨㄱ]", "k" },
	{ "[ᄁㄲ]", "kk" },
	{ "ㄳ", "ks" },
	{ "[ᄂᆫㄴ]", "n" },
	{ "ㄵ", "nj" },
	{ "ㄶ", "nh" },
	{ "[ᄃᆮㄷ]", "t" },
	{ "[ᄄㄸ]", "tt" },
	{ "[ᄅㄹ]", "r" },
	{ "ᆯ", "l" }, -- final ㄹ
	{ "ㄺ", "lg" },
	{ "ㄻ", "lm" },
	{ "ㄼ", "lb" },
	{ "ㄽ", "ls" },
	{ "ㄾ", "lt'" },
	{ "ㄿ", "lp'" },
	{ "ㅀ", "rh" },
	{ "[ᄆᆷㅁ]", "m" },
	{ "[ᄇᆸㅂ]", "p" },
	{ "[ᄈㅃ]", "pp" },
	{ "ㅄ", "ps" },
	{ "[ᄉㅅ]", "s" },
	{ "[ᄊㅆ]", "ss" },
	{ "[ᄋㅇ]", "" }, -- null initial: nothing is written
	{ "ᆼ", "ng" }, -- final ㅇ
	{ "[ᄌㅈ]", "ch" },
	{ "[ᄍㅉ]", "tch" },
	{ "[ᄎㅊ]", "ch'" },
	{ "[ᄏㅋ]", "k'" },
	{ "[ᄐㅌ]", "t'" },
	{ "[ᄑㅍ]", "p'" },
	{ "[ᄒㅎ]", "h" },
}
-- Enclosed Hangul (parenthesized / circled forms) → plain Hangul wrapped in ( )
-- not strictly needed, but Unicode classifies these as Hangul characters too
-- parenthesized and circled variants of the same letter share one entry
-- needs to be executed before decomposing Hangul
p.enclosed_hangul = {
	-- consonant letters, replaced by the letter's name
	{ "[㈀㉠]", "(기역)" },
	{ "[㈁㉡]", "(니은)" },
	{ "[㈂㉢]", "(디귿)" },
	{ "[㈃㉣]", "(리을)" },
	{ "[㈄㉤]", "(미음)" },
	{ "[㈅㉥]", "(비읍)" },
	{ "[㈆㉦]", "(시옷)" },
	{ "[㈇㉧]", "(이응)" },
	{ "[㈈㉨]", "(지읒)" },
	{ "[㈉㉩]", "(치읓)" },
	{ "[㈊㉪]", "(키읔)" },
	{ "[㈋㉫]", "(티읕)" },
	{ "[㈌㉬]", "(피읖)" },
	{ "[㈍㉭]", "(히읗)" },
	-- syllables
	{ "[㈎㉮]", "(가)" },
	{ "[㈏㉯]", "(나)" },
	{ "[㈐㉰]", "(다)" },
	{ "[㈑㉱]", "(라)" },
	{ "[㈒㉲]", "(마)" },
	{ "[㈓㉳]", "(바)" },
	{ "[㈔㉴]", "(사)" },
	{ "[㈕㉵]", "(아)" },
	{ "[㈖㉶]", "(자)" },
	{ "[㈗㉷]", "(차)" },
	{ "[㈘㉸]", "(카)" },
	{ "[㈙㉹]", "(타)" },
	{ "[㈚㉺]", "(파)" },
	{ "[㈛㉻]", "(하)" },
	-- words and abbreviations that exist in only one enclosed form
	{ "㈜", "(주)" },
	{ "㈝", "(오전)" },
	{ "㈞", "(오후)" },
	{ "㉼", "(참고)" },
	{ "㉽", "(주의)" },
	{ "㉾", "(우)" },
}
-- Escape handling for the marker characters used in the input ($ % * @ ^ _ `).
-- A backslash-escaped marker is swapped for its HTML numeric character reference so that the
-- rule tables in this module, which give bare $, @, *, _ and ` a special meaning, leave it alone.
-- NOTE(review): % and ^ are not used as markers anywhere in this data module; presumably
-- Module:Ko-translit assigns them a meaning — confirm there.
-- The placeholder must differ from the character it stands for, otherwise the escape is a no-op;
-- the references contain only "&", "#", digits and ";", none of which are pattern magic characters.
p.escaped_to_html_enc = {
	{"\\%$", "&#36;"},
	{"\\%%", "&#37;"},
	{"\\%*", "&#42;"},
	{"\\@", "&#64;"},
	{"\\%^", "&#94;"},
	{"\\_", "&#95;"},
	{"\\`", "&#96;"}
}
-- converting the HTML numeric character references back to the plain characters
-- (inverse of escaped_to_html_enc; the patterns are the references themselves, matched literally)
p.html_enc_to_ascii = {
	{"&#36;", "$"},
	{"&#37;", "%%"}, -- "%%" in a gsub replacement yields one literal %
	{"&#42;", "*"},
	{"&#64;", "@"},
	{"&#94;", "^"},
	{"&#95;", "_"},
	{"&#96;", "`"}
}
-- module result: the table holding every rule list defined above
return p