Module:Ko-translit/data
Appearance
This module is rated as pre-alpha. It is unfinished, and may or may not be in active development. It should not be used from article namespace pages. Modules remain pre-alpha until the original editor (or someone who takes one over if it is abandoned for some time) is satisfied with the basic structure.
Data module for Module:Ko-translit.
-- Module table: every lookup table defined below is attached to it as a field,
-- and it is the value returned at the end of the file.
local p = {}
-- Converting backslash-escaped special characters to HTML numeric character
-- references so that they are preserved.
-- Keys are written as Lua patterns: "\\" is one literal backslash, and the
-- magic characters $ % * ^ are escaped with "%". Values are replacement text.
-- The values must NOT be the raw characters themselves: a raw "$", "%", "*",
-- "@", "^", "_" or "`" would be indistinguishable from the unescaped special
-- symbols this table exists to protect.
p.escaped_to_html_enc = {
["\\%$"] = "&#36;",
["\\%%"] = "&#37;",
["\\%*"] = "&#42;",
["\\@"] = "&#64;",
["\\%^"] = "&#94;",
["\\_"] = "&#95;",
["\\`"] = "&#96;"
}
-- Converting the HTML character references back to the plain characters.
-- Keys are the exact references produced by p.escaped_to_html_enc; none of
-- "&", "#", digits or ";" is magic in a Lua pattern, so they match literally.
-- (Bare "$" / "^" keys would act as end/start anchors and a bare "%" key is a
-- malformed pattern, which is why the references are required here.)
-- Values are replacement strings, hence "%%" to produce a single literal "%".
p.html_enc_to_ascii = {
["&#36;"] = "$",
["&#37;"] = "%%",
["&#42;"] = "*",
["&#64;"] = "@",
["&#94;"] = "^",
["&#95;"] = "_",
["&#96;"] = "`"
}
--[[
pre-processing exceptions that apply to both RR and MR
Each key is a pattern over decomposed Hangul; each value is its replacement,
where %1 and %2 refer back to the captures of the key.
"@" in a replacement is the special marker that the at_irregularities tables
interpret later.
IMPORTANT: Before adding an exception, be sure to check if it can ALWAYS be applied in ALL contexts.
Good example: 싫증 → 실@증
Bad example: 문자 → 문@자 (affects words like 방문자 (pronounced [방문자], not [방문짜]))
Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
NOTE(review): several rules overlap (for example the 곧이어 rule and the general
palatalization rule both match the same final + initial pair), so the outcome
depends on the order in which rules are applied. A table with string keys has
no defined traversal order under pairs() - confirm how the consumer iterates.
--]]
p.exceptions = {
-- for linguistic contexts
-- (a compatibility jamo written directly before an initial consonant is turned into the matching final)
["ㄴ([ᄀ-ᄒ])"] = "ᆫ%1", -- -ㄴ다
["ㄹ([ᄀ-ᄒ])"] = "ᆯ%1", -- -ㄹ까, -ㄹ래
["ㄹ@([ᄀᄃᄇᄉᄌ])"] = "ᆯ@%1", -- -ㄹ지
["ㅁ([ᄀ-ᄒ])"] = "ᆷ%1",
["ㅂ([ᄀ-ᄒ])"] = "ᆸ%1", -- -ㅂ니다, -ㅂ시다
-- ㄴ-addition always occurs before 윷 and 잎
["([ᆨ-ᇂ])ᄋ(ᅲᆾ)"] = "%1ᄂ%2",
["([ᆨ-ᇂ])ᄋ(ᅵᇁ)"] = "%1ᄂ%2",
-- 곧이어 [고디어]
-- (two variants: 어 followed by something that is not a final consonant, and 어 at the end of the input)
["(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ][^ᆨ-ᇂ])"] = "%1ᄃ%2",
["(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ])$"] = "%1ᄃ%2",
-- 싫증 [실쯩]
["(ᄉ[ᅵ])ᆶ(ᄌ[ᅳ]ᆼ)"] = "%1ᆯ@%2",
-- cases where ㄺㄱ is pronounced [ㄱㄲ]
-- not including very rarely used words such as 삼시욹, 안찱, 우줅거리다, etc.
["([ᄃᄉᄐ]ᅡ)ᆰᄀ"] = "%1ᆨᄀ", -- 닭, 삵, 수탉/암탉
["([ᄉᄒ]ᅳ)ᆰᄀ"] = "%1ᆨᄀ", -- 기슭, 흙
["(ᄎ[ᅵ])ᆰᄀ"] = "%1ᆨᄀ", -- 칡
-- palatalization and ㅈ + -히-
-- Each rule family below is repeated for the contexts in which the vowel may appear:
-- vowel + one of the finals ᆫ/ᆯ/ᆷ/ᆸ, vowel + something outside the final range ᆨ-ᇂ,
-- vowel at the end of the input, and (for the ᅧ families) vowel ᅧ + final ᆻ.
["ᆮᄋ(ᅵ[ᆫᆯᆷᆸ])"] = "ᄌ%1", -- 해돋이 [해도지]
["ᆮᄋ(ᅵ[^ᆨ-ᇂ])"] = "ᄌ%1",
["ᆮᄋ(ᅵ)$"] = "ᄌ%1",
["[ᆮᆽ]ᄒ(ᅧᆻ)"] = "ᄎ%1", -- 굳히다 [구치다], 꽂히다 [꼬치다]
["[ᆮᆽ]ᄒ(ᅵ[ᆫᆯᆷᆸ])"] = "ᄎ%1",
["[ᆮᆽ]ᄒ([ᅧᅵ][^ᆨ-ᇂ])"] = "ᄎ%1",
["[ᆮᆽ]ᄒ([ᅧᅵ])$"] = "ᄎ%1",
["ᆴᄋ(ᅧᆻ)"] = "ᆯᄎ%1", -- 훑이다 [훌치다]
["ᆴᄋ(ᅵ[ᆫᆯᆷᆸ])"] = "ᆯᄎ%1",
["ᆴᄋ([ᅧᅵ][^ᆨ-ᇂ])"] = "ᆯᄎ%1",
["ᆴᄋ([ᅧᅵ])$"] = "ᆯᄎ%1",
["ᇀᄋ(ᅧᆻ)"] = "ᄎ%1", -- 붙이다 [부치다]
["ᇀᄋ(ᅵ[ᆫᆯᆷᆸ])"] = "ᄎ%1",
["ᇀᄋ([ᅧᅵ][^ᆨ-ᇂ])"] = "ᄎ%1",
["ᇀᄋ([ᅧᅵ])$"] = "ᄎ%1",
-- {ㄵ, ㄺ, ㄼ} + -히-
["ᆬᄒ(ᅧᆻ)"] = "ᆫᄎ%1", -- 앉히다 [안치다]
["ᆬᄒ(ᅵ[ᆫᆯᆷᆸ])"] = "ᆫᄎ%1",
["ᆬᄒ([ᅧᅵ][^ᆨ-ᇂ])"] = "ᆫᄎ%1",
["ᆬᄒ([ᅧᅵ])$"] = "ᆫᄎ%1",
["ᆰᄒ(ᅧᆻ)"] = "ᆯᄏ%1", -- 밝히다 [발키다]
["ᆰᄒ(ᅵ[ᆫᆯᆷᆸ])"] = "ᆯᄏ%1",
["ᆰᄒ([ᅧᅵ][^ᆨ-ᇂ])"] = "ᆯᄏ%1",
["ᆰᄒ([ᅧᅵ])$"] = "ᆯᄏ%1",
["ᆲᄒ(ᅧᆻ)"] = "ᆯᄑ%1", -- 넓히다 [널피다], 밟히다 [발피다]
["ᆲᄒ(ᅵ[ᆫᆯᆷᆸ])"] = "ᆯᄑ%1",
["ᆲᄒ([ᅧᅵ][^ᆨ-ᇂ])"] = "ᆯᄑ%1",
["ᆲᄒ([ᅧᅵ])$"] = "ᆯᄑ%1",
-- cases where 넓- is pronounced [넙] before consonant
["(ᄂ[ᅥ])ᆲ([ᄁᄄ-ᄈᄊᄍ-ᄒ])"] = "%1ᆸ%2",
["(ᄂ[ᅥ])ᆲ(ᄃ[ᅡ]ᄃ[ᅳ]ᆷ)"] = "%1ᆸ%2", -- 넓다듬이
["(ᄂ[ᅥ])ᆲ(ᄃ[ᅮ]ᆼ)"] = "%1ᆸ%2", -- 넓둥글다
["(ᄂ[ᅥ])ᆲ(ᄉ[ᅡ]ᆯᄆ[ᅮ]ᆫ)"] = "%1ᆸ%2", -- 넓살문
["(ᄂ[ᅥ])ᆲ(ᄌ[ᅥᅮ]ᆨ)"] = "%1ᆸ%2", -- 넓적-, 넓죽-
-- 밟- is [밥] before consonant (except null-init consonant ㅇ)
["(ᄇ[ᅡ])ᆲ([^ᄋ])"] = "%1ᆸ%2",
["(ᄇ[ᅡ])ᆲ$"] = "%1ᆸ",
-- automatic 절음 법칙
-- (the generic rules at the end of this group insert "@" between a final and a following ᄋ-initial syllable)
["(ᄋ[ᅥ])ᆹᄋ(ᅢ[ᆫᆯᆷᆸᆻ])"] = "%1ᆸᄉ%2", -- except 없애다 [업쌔다]
["(ᄋ[ᅥ])ᆹᄋ(ᅢ[^ᆨ-ᇂ])"] = "%1ᆸᄉ%2",
["(ᄋ[ᅥ])ᆹᄋ(ᅢ)$"] = "%1ᆸᄉ%2",
["(ᄆ[ᅡᅥ])ᆺᄋ(ᅵᆻ)"] = "%1ᄉ%2", -- except 맛있다 and 멋있다 which are usually pronounced [마싣따] and [머싣따] respectively
["([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅡᅥᅧ][ᆨ-ᆺᆼ-ᇂ])"] = "%1@%2", -- except 아, 았, 어, 었, 여, 였
["([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅦ][ᆨ-ᆪᆬ-ᆮᆰ-ᇂ])"] = "%1@%2", -- except 에, 엔, 엘
["([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅭᅴ][ᆨ-ᇂ])"] = "%1@%2", -- except 요, 의 (w/o final consonant)
["([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅳᅵ][ᆨ-ᆪᆬ-ᆮᆰ-ᆶᆹ-ᇂ])"] = "%1@%2", -- except 으, 은, 을, 음, 읍, 이, 인, 일, 임, 입
["([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅢ-ᅤᅨ-ᅬᅮ-ᅲ])"] = "%1@%2",
-- _ for additional space in romanization only
["_"] = " "
}
-- Unwrapping enclosed Hangul symbols into plain text wrapped in parentheses.
-- Not strictly necessary, but Unicode classifies these symbols as Hangul too.
-- The parenthesized and the circled variant of a symbol share one entry, i.e.
-- no distinction is made between them.
p.enclosed_hangul = {}
do
	local unwrapped = p.enclosed_hangul

	-- { parenthesized symbol, circled symbol, text placed inside "(...)" }
	local paired_symbols = {
		{ "㈀", "㉠", "기역" },
		{ "㈁", "㉡", "니은" },
		{ "㈂", "㉢", "디귿" },
		{ "㈃", "㉣", "리을" },
		{ "㈄", "㉤", "미음" },
		{ "㈅", "㉥", "비읍" },
		{ "㈆", "㉦", "시옷" },
		{ "㈇", "㉧", "이응" },
		{ "㈈", "㉨", "지읒" },
		{ "㈉", "㉩", "치읓" },
		{ "㈊", "㉪", "키읔" },
		{ "㈋", "㉫", "티읕" },
		{ "㈌", "㉬", "피읖" },
		{ "㈍", "㉭", "히읗" },
		{ "㈎", "㉮", "가" },
		{ "㈏", "㉯", "나" },
		{ "㈐", "㉰", "다" },
		{ "㈑", "㉱", "라" },
		{ "㈒", "㉲", "마" },
		{ "㈓", "㉳", "바" },
		{ "㈔", "㉴", "사" },
		{ "㈕", "㉵", "아" },
		{ "㈖", "㉶", "자" },
		{ "㈗", "㉷", "차" },
		{ "㈘", "㉸", "카" },
		{ "㈙", "㉹", "타" },
		{ "㈚", "㉺", "파" },
		{ "㈛", "㉻", "하" },
	}
	for i = 1, #paired_symbols do
		local row = paired_symbols[i]
		-- key is a two-member character class matching either variant
		unwrapped["[" .. row[1] .. row[2] .. "]"] = "(" .. row[3] .. ")"
	end

	-- symbols listed on their own: { symbol, text placed inside "(...)" }
	local single_symbols = {
		{ "㈜", "주" },
		{ "㈝", "오전" },
		{ "㈞", "오후" },
		{ "㉼", "참고" },
		{ "㉽", "주의" },
		{ "㉾", "우" },
	}
	for i = 1, #single_symbols do
		local row = single_symbols[i]
		unwrapped[row[1]] = "(" .. row[2] .. ")"
	end
end
-- for null-init consonant ㅇ (연음, i.e. linking) (applies to both rr and mr)
-- Keys match a final consonant followed by the initial ᄋ.
-- A single final is carried over as the next initial; for a cluster the
-- replacement keeps the first member as a final and uses the second as the
-- initial (e.g. ᆪᄋ becomes ᆨ + ᄉ).
p.null_init_ieung = {
["ᆨᄋ"] = "ᄀ",
["ᆩᄋ"] = "ᄁ",
["ᆪᄋ"] = "ᆨᄉ",
["ᆬᄋ"] = "ᆫᄌ",
["ᆮᄋ"] = "ᄃ",
["[ᆯᆶ]ᄋ"] = "ᄅ",
["ᆰᄋ"] = "ᆯᄀ",
["ᆱᄋ"] = "ᆯᄆ",
["ᆲᄋ"] = "ᆯᄇ",
["ᆳᄋ"] = "ᆯᄉ",
["ᆴᄋ"] = "ᆯᄐ",
["ᆵᄋ"] = "ᆯᄑ",
["ᆸᄋ"] = "ᄇ",
["ᆹᄋ"] = "ᆸᄉ",
["ᆺᄋ"] = "ᄉ",
["ᆻᄋ"] = "ᄊ",
["ᆽᄋ"] = "ᄌ",
["ᆾᄋ"] = "ᄎ",
["ᆿᄋ"] = "ᄏ",
["ᇀᄋ"] = "ᄐ",
["ᇁᄋ"] = "ᄑ",
["ᇂᄋ"] = "ᄋ" -- silent; 좋아 [조아]
}
-- Vowels to romanized text for rr.
-- Every key is a two-member character class, so that the conjoining jamo and
-- the compatibility jamo of the same vowel share one romanization.
p.rr_vowels = {}
do
	-- { conjoining jamo, compatibility jamo, rr romanization }
	local vowel_rows = {
		{ "ᅡ", "ㅏ", "a" },
		{ "ᅢ", "ㅐ", "ae" },
		{ "ᅣ", "ㅑ", "ya" },
		{ "ᅤ", "ㅒ", "yae" },
		{ "ᅥ", "ㅓ", "eo" },
		{ "ᅦ", "ㅔ", "e" },
		{ "ᅧ", "ㅕ", "yeo" },
		{ "ᅨ", "ㅖ", "ye" },
		{ "ᅩ", "ㅗ", "o" },
		{ "ᅪ", "ㅘ", "wa" },
		{ "ᅫ", "ㅙ", "wae" },
		{ "ᅬ", "ㅚ", "oe" },
		{ "ᅭ", "ㅛ", "yo" },
		{ "ᅮ", "ㅜ", "u" },
		{ "ᅯ", "ㅝ", "wo" },
		{ "ᅰ", "ㅞ", "we" },
		{ "ᅱ", "ㅟ", "wi" },
		{ "ᅲ", "ㅠ", "yu" },
		{ "ᅳ", "ㅡ", "eu" },
		{ "ᅴ", "ㅢ", "ui" },
		{ "ᅵ", "ㅣ", "i" },
	}
	for i = 1, #vowel_rows do
		local row = vowel_rows[i]
		p.rr_vowels["[" .. row[1] .. row[2] .. "]"] = row[3]
	end
end
-- Vowels to romanized text for mr.
-- Every key is a two-member character class, so that the conjoining jamo and
-- the compatibility jamo of the same vowel share one romanization.
p.mr_vowels = {}
do
	-- { conjoining jamo, compatibility jamo, mr romanization }
	local vowel_rows = {
		{ "ᅡ", "ㅏ", "a" },
		{ "ᅢ", "ㅐ", "ae" },
		{ "ᅣ", "ㅑ", "ya" },
		{ "ᅤ", "ㅒ", "yae" },
		{ "ᅥ", "ㅓ", "ŏ" },
		{ "ᅦ", "ㅔ", "e" },
		{ "ᅧ", "ㅕ", "yŏ" },
		{ "ᅨ", "ㅖ", "ye" },
		{ "ᅩ", "ㅗ", "o" },
		{ "ᅪ", "ㅘ", "wa" },
		{ "ᅫ", "ㅙ", "wae" },
		{ "ᅬ", "ㅚ", "oe" },
		{ "ᅭ", "ㅛ", "yo" },
		{ "ᅮ", "ㅜ", "u" },
		{ "ᅯ", "ㅝ", "wŏ" },
		{ "ᅰ", "ㅞ", "we" },
		{ "ᅱ", "ㅟ", "wi" },
		{ "ᅲ", "ㅠ", "yu" },
		{ "ᅳ", "ㅡ", "ŭ" },
		{ "ᅴ", "ㅢ", "ŭi" },
		{ "ᅵ", "ㅣ", "i" },
	}
	for i = 1, #vowel_rows do
		local row = vowel_rows[i]
		p.mr_vowels["[" .. row[1] .. row[2] .. "]"] = row[3]
	end
end
-- single consonants to romanized text for rr
-- Keys are single characters or character classes that group the conjoining
-- jamo of a consonant with its compatibility jamo (ㄱ, ㄴ, ...).
-- The finals ᆨ / ᆮ / ᆸ sit in the k / t / p classes, ᆫ and ᆷ in the n / m
-- classes, and ᆯ and ᆼ have entries of their own (l, ng).
-- Compatibility cluster jamo (ㄳ, ㄵ, ...) are spelled out letter by letter.
p.rr_single_consonants = {
["[ᄀㄱ]"] = "g",
["[ᄁㄲ]"] = "kk",
["ㄳ"] = "ks",
["[ᄂᆫㄴ]"] = "n",
["ㄵ"] = "nj",
["ㄶ"] = "nh",
["[ᄃㄷ]"] = "d",
["[ᄄㄸ]"] = "tt",
["[ᄅㄹ]"] = "r",
["ᆯ"] = "l",
["ㄺ"] = "lg",
["ㄻ"] = "lm",
["ㄼ"] = "lb",
["ㄽ"] = "ls",
["ㄾ"] = "lt",
["ㄿ"] = "lp",
["ㅀ"] = "lh",
["[ᄆᆷㅁ]"] = "m",
["[ᄇㅂ]"] = "b",
["[ᄈㅃ]"] = "pp",
["ㅄ"] = "ps",
["[ᄉㅅ]"] = "s",
["[ᄊㅆ]"] = "ss",
["[ᄋㅇ]"] = "", -- null initial: produces no letter
["ᆼ"] = "ng",
["[ᄌㅈ]"] = "j",
["[ᄍㅉ]"] = "jj",
["[ᄎㅊ]"] = "ch",
["[ᄏᆨㅋ]"] = "k",
["[ᄐᆮㅌ]"] = "t",
["[ᄑᆸㅍ]"] = "p",
["[ᄒㅎ]"] = "h"
}
-- single consonants to romanized text for mr
-- A backtick directly before ᄀ / ᄃ / ᄇ / ᄌ selects the voiced letters
-- g / d / b / j instead of the default k / t / p / ch; a backtick anywhere else
-- is removed by the last entry.
-- NOTE(review): the backtick entries overlap with the plain entries for the
-- same consonants and with the bare "`" entry, so they only work if applied
-- first; a table with string keys has no defined traversal order under
-- pairs() - confirm how the consumer orders these rules.
p.mr_single_consonants = {
["`ᄀ"] = "g",
["`ᄃ"] = "d",
["`ᄇ"] = "b",
["`ᄌ"] = "j",
["[ᄀᆨㄱ]"] = "k",
["[ᄁㄲ]"] = "kk",
["ㄳ"] = "ks",
["[ᄂᆫㄴ]"] = "n",
["ㄵ"] = "nj",
["ㄶ"] = "nh",
["[ᄃᆮㄷ]"] = "t",
["[ᄄㄸ]"] = "tt",
["[ᄅㄹ]"] = "r",
["ᆯ"] = "l",
["ㄺ"] = "lg",
["ㄻ"] = "lm",
["ㄼ"] = "lb",
["ㄽ"] = "ls",
["ㄾ"] = "lt'",
["ㄿ"] = "lp'",
["ㅀ"] = "rh",
["[ᄆᆷㅁ]"] = "m",
["[ᄇᆸㅂ]"] = "p",
["[ᄈㅃ]"] = "pp",
["ㅄ"] = "ps",
["[ᄉㅅ]"] = "s",
["[ᄊㅆ]"] = "ss",
["[ᄋㅇ]"] = "", -- null initial: produces no letter
["ᆼ"] = "ng",
["[ᄌㅈ]"] = "ch",
["[ᄍㅉ]"] = "tch",
["[ᄎㅊ]"] = "ch'",
["[ᄏㅋ]"] = "k'",
["[ᄐㅌ]"] = "t'",
["[ᄑㅍ]"] = "p'",
["[ᄒㅎ]"] = "h",
["`"] = "" -- leftover backticks are dropped
}
-- Drop y after {ㅈ, ㅉ, ㅊ} (applies to rr and mr).
-- Each rule captures one initial from the range ᄌ-ᄎ and swaps the following
-- y-vowel for its plain counterpart, keeping the captured initial via %1.
p.drop_y = {}
do
	local initial_capture = "([ᄌ-ᄎ])"
	-- { y-vowel, same vowel without the y }
	local vowel_pairs = {
		{ "ᅣ", "ᅡ" },
		{ "ᅤ", "ᅢ" },
		{ "ᅧ", "ᅥ" },
		{ "ᅨ", "ᅦ" },
		{ "ᅭ", "ᅩ" },
		{ "ᅲ", "ᅮ" },
	}
	for i = 1, #vowel_pairs do
		local pair = vowel_pairs[i]
		p.drop_y[initial_capture .. pair[1]] = "%1" .. pair[2]
	end
end
-- convert ㅎ combinations (used for rr)
-- A final containing ㅎ (ᆭ, ᆶ or ᇂ) merges with a following ᄀ / ᄃ / ᄇ / ᄌ
-- into the aspirated initial ᄏ / ᄐ / ᄑ / ᄎ; the ᆫ or ᆯ of a cluster stays
-- behind as the final. Before ᄉ the final ᇂ is simply dropped.
-- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ doesn't actually exist, but added for completeness
-- (syl-final ㅎ is for aspiration anyway)
p.process_hieut_rr = {
["ᆭᄀ"] = "ᆫᄏ",
["ᆭᄃ"] = "ᆫᄐ",
["ᆭᄇ"] = "ᆫᄑ",
["ᆭᄌ"] = "ᆫᄎ",
["ᆶᄀ"] = "ᆯᄏ",
["ᆶᄃ"] = "ᆯᄐ",
["ᆶᄇ"] = "ᆯᄑ",
["ᆶᄌ"] = "ᆯᄎ",
["ᇂᄀ"] = "ᄏ",
["ᇂᄃ"] = "ᄐ",
["ᇂᄇ"] = "ᄑ",
["ᇂᄉ"] = "ᄉ",
["ᇂᄌ"] = "ᄎ"
}
-- convert ㅎ combinations (used for mr)
-- Same aspiration rules as p.process_hieut_rr, except that a following ᄉ is
-- turned into the tense ᄊ: after ᇂ, after the finals ᆬ / ᆭ (leaving ᆫ) and
-- after the finals ᆲ / ᆴ / ᆶ (leaving ᆯ).
p.process_hieut_mr = {
["ᆭᄀ"] = "ᆫᄏ",
["ᆭᄃ"] = "ᆫᄐ",
["ᆭᄇ"] = "ᆫᄑ",
["[ᆬᆭ]ᄉ"] = "ᆫᄊ",
["ᆭᄌ"] = "ᆫᄎ",
["ᆶᄀ"] = "ᆯᄏ",
["ᆶᄃ"] = "ᆯᄐ",
["ᆶᄇ"] = "ᆯᄑ",
["[ᆲᆴᆶ]ᄉ"] = "ᆯᄊ",
["ᆶᄌ"] = "ᆯᄎ",
["ᇂᄀ"] = "ᄏ",
["ᇂᄃ"] = "ᄐ",
["ᇂᄇ"] = "ᄑ",
["ᇂᄉ"] = "ᄊ",
["ᇂᄌ"] = "ᄎ"
}
-- neutralization of syl-final consonants (both rr and mr)
-- Each key is a character class (or single jamo) of final consonants; every
-- member is replaced by the one representative final given as the value.
p.neutralize_syl_final_consonants = {
["[ᆩᆪᆰᆿ]"] = "ᆨ",
["[ᆬᆭ]"] = "ᆫ",
["[ᆺᆻᆽᆾᇀᇂ]"] = "ᆮ",
["[ᆲᆳᆴᆶ]"] = "ᆯ",
["ᆱ"] = "ᆷ",
["[ᆵᆹᇁ]"] = "ᆸ"
}
-- handle irregularities associated with special symbol "@" (both rr and mr)
-- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
-- other irregularities documented are automatically handled
-- Keys match a final, the "@" marker and the following initial; the value is
-- what replaces all three. The last entry removes any "@" that is left.
-- NOTE(review): the bare "@" entry overlaps with every other key here, so it
-- must be applied after them; confirm how the consumer orders these rules.
p.at_irregularities_rr = {
["ᆨ@ᄒ"] = "ᄏ",
["ᆮ@ᄒ"] = "ᄐ",
["ᆸ@ᄒ"] = "ᄑ",
["ᆨ@ᄋ"] = "ᄀ",
["ᆮ@ᄋ"] = "ᄃ", -- 웃어른 [우더른]
["ᆯ@ᄋ"] = "ᄅ",
["ᆸ@ᄋ"] = "ᄇ",
["ᆫ@ᄅ"] = "ᆫᄂ", -- 음운론 [으문논]
["@"] = ""
}
-- @ for tensification, 절음 법칙, ㄴㄹ pronounced [ㄴㄴ]
-- other irregularities documented are automatically handled
-- The first rule turns ᄉ into the tense ᄊ when "@" follows a vowel (ᅡ-ᅵ) or
-- one of the finals ᆫ / ᆷ / ᆼ; %1 keeps that preceding jamo.
-- NOTE(review): unlike p.at_irregularities_rr, this table has no entry that
-- strips a leftover "@" - presumably handled elsewhere; verify in the consumer.
p.at_irregularities_mr = {
["([ᅡ-ᅵᆫᆷᆼ])@ᄉ"] = "%1ᄊ",
["ᆨ@ᄋ"] = "ᄀ",
["ᆮ@ᄋ"] = "ᄃ", -- 웃어른 [우더른]
["ᆯ@ᄋ"] = "ᄅ",
["ᆸ@ᄋ"] = "ᄇ",
["ᆫ@ᄅ"] = "ᆫᄂ" -- 음운론 [으문논]
}
-- consonant assimilation between a syllable-final consonant and the following
-- initial (table used for rr, going by its name)
-- Keys match final + initial; values give the pair as it is to be romanized.
-- In the last rule the replacement ends in the Latin letter "l" rather than a
-- jamo - presumably so that ㄹㄹ comes out as "ll", since ᄅ alone is "r" in
-- p.rr_single_consonants.
-- NOTE(review): "ᆫᄅ" produces "ᆯᄅ", which the last rule matches again, so the
-- result depends on rule order; confirm how the consumer orders these rules.
p.consonant_assimilations_rr = {
["[ᆨᆼ][ᄂᄅ]"] = "ᆼᄂ",
["ᆨᄆ"] = "ᆼᄆ",
["ᆫᄅ"] = "ᆯᄅ",
["ᆮ[ᄂᄅ]"] = "ᆫᄂ",
["ᆮᄆ"] = "ᆫᄆ",
["ᆯᄂ"] = "ᆯᄅ",
["[ᆷᆸ][ᄂᄅ]"] = "ᆷᄂ",
["ᆸᄆ"] = "ᆷᄆ",
["ᆯᄅ"] = "ᆯl"
}
-- Rules keyed on a doubled syllable that ends in ㄹ and begins with ㄷ, ㅅ or ㅈ
-- (달달, 돌돌, 살살, ...); going by the table name, presumably related to
-- ㄹ + {ㄷ, ㅅ, ㅈ} tensification in names.
-- NOTE(review): as written here every replacement reproduces its match
-- unchanged. Also, under standard Lua pattern rules "[^]*" is not a complete
-- character class (a "]" directly after "[^" is taken literally), so the key
-- does not form the two captures that "%1" and "%2" refer to. Characters were
-- presumably lost from these strings (e.g. unprintable delimiter characters) -
-- verify against the upstream module before relying on this table.
p.name_rieul_dsj_tensification = {
["([^]*)달달([^]*)"] = "%1달달%2",
["([^]*)돌돌([^]*)"] = "%1돌돌%2",
["([^]*)살살([^]*)"] = "%1살살%2",
["([^]*)설설([^]*)"] = "%1설설%2",
["([^]*)솔솔([^]*)"] = "%1솔솔%2",
["([^]*)술술([^]*)"] = "%1술술%2",
["([^]*)슬슬([^]*)"] = "%1슬슬%2",
["([^]*)실실([^]*)"] = "%1실실%2",
["([^]*)절절([^]*)"] = "%1절절%2",
["([^]*)졸졸([^]*)"] = "%1졸졸%2",
["([^]*)줄줄([^]*)"] = "%1줄줄%2",
["([^]*)즐즐([^]*)"] = "%1즐즐%2",
["([^]*)질질([^]*)"] = "%1질질%2"
}
-- Hand the populated table of lookup tables back to whoever loads this data module.
return p