-- Modul:Lang/data

local lang_obj = mw.language.getContentLanguage();								-- language object for this wiki's content language
local this_wiki_lang_tag = lang_obj:getCode();									-- this wiki's language tag, read through the documented accessor


--[[--------------------------< L A N G _ N A M E _ T A B L E >------------------------------------------------

primary table of tables that decode:
	lang -> language tags and names
	lang_dep -> deprecated language tags and names
	script -> ISO 15924 script tags
	region -> ISO 3166 region tags
	variant -> iana registered variant tags
	suppressed -> map of scripts tags and their associated language tags
	
all of these data come from separate modules that are derived from the IANA language-subtag-registry file

key_to_lower() avoids the metatable trap and sets all keys in the subtables to lowercase. Many language codes
have multiple associated names; Module:lang is only concerned with the first name so key_to_lower() only fetches
the first name.

]]

--[[
Loads the data module named by <module> and returns a new table whose keys are the lowercased keys of the
selected source table.

<src_type> selects what is read and what is kept:
	'var_sup'	-- module fetched with require(); each value is kept whole
	'lang'		-- source.active is used when present; first name of each entry is kept
	'lang_dep'	-- source.deprecated is used when present; first name of each entry is kept
	anything else (including nil) -- the module's top-level table is used; first name of each entry is kept

When the 'lang' / 'lang_dep' subtable is absent the module's top-level table is used instead.
]]
local function key_to_lower (module, src_type)
	local keep_whole_value = 'var_sup' == src_type;
	local data;

	if keep_whole_value then
		data = require (module) or mw.loadData (module);						-- require() avoids the metatable trap for variant / suppressed data
	else
		data = mw.loadData (module);
	end

	local entries = data;														-- default: iterate the module's top-level table
	if 'lang' == src_type and data.active then									-- ~/iana_languages (active)
		entries = data.active;
	elseif 'lang_dep' == src_type and data.deprecated then						-- ~/iana_languages (deprecated)
		entries = data.deprecated;
	end

	local lowered = {};
	for tag, value in pairs (entries) do
		if keep_whole_value then
			lowered[tag:lower()] = value;										-- variant and suppressed: everything is needed
		else
			lowered[tag:lower()] = value[1];									-- multiple names are ignored; only the first is kept
		end
	end
	return lowered;
end

-- Lookup tables built from the IANA-derived data modules; every key is lowercased by key_to_lower().
-- The modules are read in the same order as before: languages (active, deprecated), scripts, regions,
-- variants, suppressed scripts.
local lang_name_table_t = {};

lang_name_table_t.lang = key_to_lower ('Module:Language/data/iana languages', 'lang');
lang_name_table_t.lang_dep = key_to_lower ('Module:Language/data/iana languages', 'lang_dep');
lang_name_table_t.script = key_to_lower ('Module:Language/data/iana scripts');				-- script keys are capitalized in the source; lowered here
lang_name_table_t.region = key_to_lower ('Module:Language/data/iana regions');				-- region keys are uppercase in the source; lowered here
lang_name_table_t.variant = key_to_lower ('Module:Language/data/iana variants', 'var_sup');
lang_name_table_t.suppressed = key_to_lower ('Module:Language/data/iana suppressed scripts', 'var_sup');	-- script keys are capitalized in the source; lowered here


--[[--------------------------< I 1 8 N   M E D I A W I K I   O V E R R I D E >--------------------------------

For internationalization; not used at en.wiki

The language names taken from the IANA language-subtag-registry file are given in English. That may not be ideal.
Translating ~8,000 language names is also not ideal.  MediaWiki maintains (much) shorter lists of language names
in most languages for which there is a Wikipedia edition.  When desired, Module:Lang can use the MediaWiki 
language list for the local language.

Caveat lector: the list of MediaWiki language names for your language may not be complete or may not exist at all.
When incomplete, MediaWiki's list will 'fall back' to another language (typically English).  When that happens
add an appropriate entry to the override table below.

Caveat lector: the list of MediaWiki language names for your language may not be correct.  At en.wiki, the
MediaWiki language names do not agree with the IANA language names for these ISO 639-1 tags.  Often it is simply
spelling differences:
	bh: IANA: Bihari languages MW: Bhojpuri – the ISO 639-3 tag for Bhojpuri is bho
	bn: IANA: Bengali MW: Bangla – Bengali is the exonym, Bangla is the endonym
	dv: IANA: Dhivehi MW: Divehi
	el: IANA: Modern Greek MW: Greek
	ht: IANA: Haitian MW: Haitian Creole
	ky: IANA: Kirghiz MW: Kyrgyz
	li: IANA: Limburgan MW: Limburgish
	or: IANA: Oriya MW: Odia
	os: IANA: Ossetian MW: Ossetic
	pa: IANA: Panjabi MW: Punjabi
	ps: IANA: Pushto MW: Pashto
	to: IANA: Tonga MW: Tongan
	ug: IANA: Uighur MW: Uyghur
use the override table to override language names that are incorrect for your project

To see the list of names that MediaWiki has for your language, enter this in the Debug console:
	=mw.dumpObject (mw.language.fetchLanguageNames ('<tag>', 'all'))
(replacing <tag> with the language tag for your language)

Use of the MediaWiki language names lists is enabled when media_wiki_override_enable is set to boolean true.
	
]]

-- Switch for replacing IANA names with MediaWiki's names for this wiki's language.
-- Set to boolean true to enable; always false at en.wiki.
-- Caveat lector: MediaWiki's list of language names for your language may be incomplete or may not exist at all.
local media_wiki_override_enable = false;

if media_wiki_override_enable == true then
	local iana_names_t = lang_name_table_t.lang;								-- the main tag -> name list that gets patched in place
	for tag, mw_name in pairs (mw.language.fetchLanguageNames (this_wiki_lang_tag, 'all')) do	-- every tag/name pair known to MediaWiki
		if iana_names_t[tag] then												-- only tags already in the main list are touched
			iana_names_t[tag] = mw_name;										-- overwrite the existing name with the name from MediaWiki
		end
	end
end


--[[--------------------------< O V E R R I D E >--------------------------------------------------------------

Language codes and names in this table override the BCP47 names in Module:Language/name/data.

code indexes in this table shall always be lower case
]]

-- Per-tag name overrides exported as 'override'.  Keys are lowercase language tags (optionally with region,
-- script, variant or private-use subtags); each value is a one-element sequence holding the name to use.
-- How and when these are applied is decided by the consuming module, not here.
local override = {
-- ISO 639-1 codes
	["ab"] = {"Abkhazia"},														-- IANA name is Abkhazian; override wp_languages {"Abkhaz"}; to achieve this, use |label=
	["bh"] = {"rumpun bahasa Bihari"},												-- only ISO 639-1 collective; defined here to override improper redefinition (Bihari) in wp_languages
	["cu"] = {"Slavonik Gereja"},												-- 2nd IANA name;
	["de-at"] = {"Jerman Austria"},											-- these code-region and code-variant entries match en.wiki article names
	["de-ch"] = {"Jerman Swiss"},
	["en-au"] = {"Inggris Australia"},
	["en-ca"] = {"Inggris Kanada"},
	["en-emodeng"] = {"Inggris Modern Awal"},
	["en-gb"] = {"Inggris Britania"},
	["en-ie"] = {"Inggris Irlandia"},
	["en-nz"] = {"Inggris Selandia Baru"},
	["en-us"] = {"Inggris Amerika"},
	["en-za"] = {"Inggris Afrika Selatan"},
	["fy"] = {"Friesland Barat"},													-- IANA name is Western Frisian
	["ps"] = {"Pashtun"},														-- IANA name is Pushto
	["sr-cyrl"] = {"Serbia"},													-- override wp_languages Serbian Cyrillic; to achieve this, use |label=

-- ISO 639-2, -3 codes
	["arc"] = {"Aramaik"},														-- IANA names are: Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE);
	["ber"] = {"rumpun bahasa Berber"},												-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["bua"] = {"Buryat"},														-- IANA name Buriat; this is a macro language; these four use wp preferred transliteration;
	["bxm"] = {"Buryat Mongolia"},												-- IANA name Mongolia Buriat; these three all redirect to Buryat
	["bxr"] = {"Buryat Rusia"},												-- IANA name Russia Buriat;
	["bxu"] = {"Buryat Tiongkok"},												-- IANA name China Buriat;
	["byr"] = {"Yipma"},														-- IANA names are Baruya and Yipma
	["cel"] = {"rumpun bahasa Keltik"},												-- ISO 639-2 collective; defined here to override improper redefinition ('Proto-Celtic') in wp_languages; use cel-x-proto instead
	["egy"] = {"Mesir Kuno"},												-- IANA name is Egyptian (Ancient); distinguish from contemporary arz: Egyptian Arabic
	["frr"] = {"Friesland Utara"},												-- IANA name is Northern Frisian
	["frs"] = {"Niedersachsen Friesland Timur"},										-- IANA name is Eastern Frisian
	["gem"] = {"rumpun bahasa Jermanik"},											-- ISO 639-2 collective; defined here to override improper redefinition ('Proto-Germanic') in wp_languages; use gem-x-proto instead
	["jam"] = {"Patois Jamaika"},												-- IANA name is Jamaican Creole English
	["mhr"] = {"Meadow Mari"},													-- IANA name is Eastern Mari
	["mid"] = {"Mandaik Modern"},												-- IANA name is Mandaic
	["mla"] = {"Tamambo"},														-- wp_languages name is Medieval Latin; IANA and ISO 639-3 name is Malo
	["myn"] = {"rumpun bahasa Maya"},												-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["nah"] = {"rumpun bahasa Nahuatl"},											-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["nrf"] = {"Norman"},														-- not quite a collective - IANA name: Jèrriais; categorizes to Norman-language text
	["orv"] = {"Slavik Timur Lama"},												-- IANA name is Old Russian
	["pms"] = {"Piedmont"},													-- IANA / 639-3 name is Piemontese; to match en.wiki article title
	["pra"] = {"rumpun bahasa Prakrit"},											-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["roa"] = {"rumpun bahasa Roman"},											-- ISO 639-2 collective; defined here to override improper redefinition (Jèrriais) in wp_languages; IANA name is Romance languages
	["sal"] = {"rumpun bahasa Salisha"},											-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["sla"] = {"rumpun bahasa Slavik"},												-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["son"] = {"rumpun bahasa Songhai"},											-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["stq"] = {"Friesland Saterland"},											-- IANA name is Saterfriesisch
	["und"] = {"tidak ditentukan"},													-- capitalization to match existing category
	["wen"] = {"rumpun bahasa Sorbia"},											-- ISO 639-2 collective; defined here to override redefinition in wp_languages
	["wrg"] = {"Warrongo"},														-- IANA name is Warungu
	["xal-ru"] = {"Kalmyk"},													-- to match en.wiki article title
	
-- private use codes
	["cel-x-proto"] = {"Proto-Keltik"},											-- cel in IANA is Celtic languages
	["gem-x-proto"] = {"Proto-Jermanik"},										-- gem in IANA is Germanic languages
	["grc-x-aeolic"] = {"Yunani Aeolik"},										-- these grc-x-... codes are preferred alternates to the non-standard catchall code grc-gre
	["grc-x-attic"] = {"Yunani Attik"},
	["grc-x-biblical"] = {"Yunani Alkitab"},
	["grc-x-byzant"] = {"Yunani Bizantium"},
	["grc-x-classic"] = {"Yunani Klasik"},
	["grc-x-doric"] = {"Yunani Dorik"},
	["grc-x-hellen"] = {"Yunani Helenistik"},
	["grc-x-ionic"] = {"Yunani Ionik"},
	["grc-x-koine"] = {"Yunani Koinē"},
	["grc-x-medieval"] = {"Yunani Pertengahan"},
	["grc-x-patris"] = {"Yunani Patristik"},
	["grk-x-proto"] = {"Proto-Yunani"},											-- grk in IANA is Greek languages
	["iir-x-proto"] = {"Proto-Indo-Iran"},									-- iir in IANA is Indo-Iranian Languages
	["ira-x-proto"] = {"Proto-Iran"},										-- ira in IANA is Iranian languages
	["itc-x-proto"] = {"Proto-Italik"},											-- itc in IANA is Italic languages
	["sla-x-proto"] = {"Proto-Slavik"},											-- sla in IANA is Slavic languages
	["yuf-x-hav"] = {"Havasupai"},												-- IANA name for these three is Havasupai-Walapai-Yavapai
	["yuf-x-wal"] = {"Walapai"},
	["yuf-x-yav"] = {"Yavapai"},
	}


--[[--------------------------< A R T I C L E _ L I N K >------------------------------------------------------

for those rare occasions when article titles don't fit with the normal '<language name>-language', this table
maps language code to article title.  Use of this table should be avoided and the use of redirects preferred as
that is the long-standing method of handling article names that don't fit with the normal pattern

]]

-- language tag -> one-element sequence holding the article title to link to
local article_name = {
	lij = {'Liguria (bahasa Roman)'},											-- see Template_talk:Lang#Ligurian_dab
	xlg = {'Liguria (kuno)'},													-- see Template_talk:Lang#Ligurian_dab
};


--[=[-------------------------< R T L _ S C R I P T S >--------------------------------------------------------

ISO 15924 scripts that are written right-to-left.  Data in this table taken from [[ISO 15924#List of codes]]

last update to this list: 2017-12-24

]=]

-- Sequence of lowercase ISO 15924 script tags for scripts written right-to-left; order is unchanged.
local rtl_scripts = {
	"adlm", "arab", "aran", "armi", "avst",
	"cprt", "egyd", "egyh", "hatr", "hebr",
	"hung", "inds", "khar", "lydi", "mand",
	"mani", "mend", "merc", "mero", "narb",
	"nbat", "nkoo", "orkh", "palm", "phli",
	"phlp", "phlv", "phnx", "prti", "rohg",
	"samr", "sarb", "sogd", "sogo", "syrc",
	"syre", "syrj", "syrn", "thaa", "wole",
};


--[[--------------------------< T R A N S L I T   T I T L E S >------------------------------------------------

This is a table of tables of transliteration standards and the language codes or language scripts that apply to
those standards.  This table is used to create the tool-tip text associated with the transliterated text displayed
by some of the {{lang-??}} templates.

These tables are more-or-less copied directly from {{transl}}.  The standard 'NO_STD' is a construct to allow for
the cases when no |std= parameter value is provided.

]]

-- Tool-tip titles for transliterated text.
-- Outer keys are lowercase transliteration-standard names; inner keys are lowercase language or script tags.
-- NOTE(review): ['default'] is presumably the title used when the standard has no entry for the given
-- language / script tag -- confirm against the consuming module.
local translit_title_table = {
	['ahl'] = {
		['default'] = 'alih aksara Academy of the Hebrew Language',
		},

	['ala'] = {
		['default'] = 'alih aksara American Library Association – Library of Congress',
		},

	['ala-lc'] = {																-- same title as 'ala'
		['default'] = 'alih aksara American Library Association – Library of Congress',
		},

	['batr'] = {
		['default'] = 'Bikdash Arabic Transliteration Rules',
		},

	['bgn/pcgn'] = {
		['default'] = 'alih aksara Board on Geographic Names / Permanent Committee on Geographical Names',
		},

	['din'] = {																	-- language tags first, then the script tag
		['ar'] = 'DIN 31635 Arabic',
		['fa'] = 'DIN 31635 Arabic',
		['ku'] = 'DIN 31635 Arabic',
		['ps'] = 'DIN 31635 Arabic',
		['tg'] = 'DIN 31635 Arabic',
		['ug'] = 'DIN 31635 Arabic',
		['ur'] = 'DIN 31635 Arabic',
		['arab'] = 'DIN 31635 Arabic',

		['default'] = 'alih aksara DIN',
		},

	['eae'] = {
		['default'] = 'alih aksara Encyclopaedia Aethiopica',
		},

	['hepburn'] = {
		['default'] = 'alih aksara Hepburn',
		},

	['iast'] = {
		['default'] = 'alih aksara International Alphabet of Sanskrit',
		},

	['iso'] = {																	-- when a transliteration standard is supplied; entries are grouped by the ISO standard named in the title
		['ab'] = 'ISO 9 Cyrillic',
		['ba'] = 'ISO 9 Cyrillic',
		['be'] = 'ISO 9 Cyrillic',
		['bg'] = 'ISO 9 Cyrillic',
		['kk'] = 'ISO 9 Cyrillic',
		['ky'] = 'ISO 9 Cyrillic',
		['mn'] = 'ISO 9 Cyrillic',
		['ru'] = 'ISO 9 Cyrillic',
		['tg'] = 'ISO 9 Cyrillic',
		['uk'] = 'ISO 9 Cyrillic',
		['bua'] = 'ISO 9 Cyrillic',
		['sah'] = 'ISO 9 Cyrillic',
		['tut'] = 'ISO 9 Cyrillic',
		['xal'] = 'ISO 9 Cyrillic',
		['cyrl'] = 'ISO 9 Cyrillic',

		['ar'] = 'ISO 233 Arabic',
		['ku'] = 'ISO 233 Arabic',
		['ps'] = 'ISO 233 Arabic',
		['ug'] = 'ISO 233 Arabic',
		['ur'] = 'ISO 233 Arabic',
		['arab'] = 'ISO 233 Arabic',

		['he'] = 'ISO 259 Hebrew',
		['yi'] = 'ISO 259 Hebrew',
		['hebr'] = 'ISO 259 Hebrew',

		['el'] = 'ISO 843 Greek',
		['grc'] = 'ISO 843 Greek',

		['ja'] = 'ISO 3602 Japanese',
		['hira'] = 'ISO 3602 Japanese',
		['hrkt'] = 'ISO 3602 Japanese',
		['jpan'] = 'ISO 3602 Japanese',
		['kana'] = 'ISO 3602 Japanese',

		['zh'] = 'ISO 7098 Chinese',
		['chi'] = 'ISO 7098 Chinese',
		['pny'] = 'ISO 7098 Chinese',
		['zho'] = 'ISO 7098 Chinese',
--		['han'] = 'ISO 7098 Chinese',											-- unicode alias of Hani? doesn't belong here? should be Hani?
		['hans'] = 'ISO 7098 Chinese',
		['hant'] = 'ISO 7098 Chinese',

		['ka'] = 'ISO 9984 Georgian',
		['kat'] = 'ISO 9984 Georgian',

		['arm'] = 'ISO 9985 Armenian',
		['hy'] = 'ISO 9985 Armenian',

		['th'] = 'ISO 11940 Thai',
		['tha'] = 'ISO 11940 Thai',

		['ko'] = 'ISO 11941 Korean',
		['kor'] = 'ISO 11941 Korean',

		['bn'] = 'ISO 15919 Indic',
		['dra'] = 'ISO 15919 Indic',
		['gu'] = 'ISO 15919 Indic',
		['hi'] = 'ISO 15919 Indic',
		['inc'] = 'ISO 15919 Indic',
		['kn'] = 'ISO 15919 Indic',
		['ml'] = 'ISO 15919 Indic',
		['mr'] = 'ISO 15919 Indic',
		['sa'] = 'ISO 15919 Indic',
		['ta'] = 'ISO 15919 Indic',
		['te'] = 'ISO 15919 Indic',
		['beng'] = 'ISO 15919 Indic',
		['brah'] = 'ISO 15919 Indic',
		['deva'] = 'ISO 15919 Indic',

		['default'] = 'alih aksara ISO',
		},

	['mr'] = {																	-- here 'mr' is the standard name (McCune–Reischauer), distinct from the 'mr' language key inside 'iso'
		['default'] = 'alih aksara McCune–Reischauer',
		},

	['nihon-shiki'] = {
		['default'] = 'alih aksara Nihon-shiki',
		},

	['no_std'] = {																-- when no transliteration standard is supplied; this subtable has no 'default' entry
		['akk'] = 'alih aksara Semitik',
		['sem'] = 'alih aksara Semitik',
		['phnx'] = 'alih aksara Semitik',
		['xsux'] = 'alih aksara Kuneiform',
		},

	['pinyin'] = {
		['default'] = 'alih aksara Pinyin',
		},

	['rr'] = {
		['default'] = 'alih aksara Revised Romanization of Korean',
		},

	['rtgs'] = {
		['default'] = 'Royal Thai General System of Transcription',
		},
	
	['satts'] = {
		['default'] = 'alih aksara Standard Arabic Technical Transliteration System',
		},

	['ungegn'] = {
		['default'] = 'alih aksara United Nations Group of Experts on Geographical Names',
		},

	['wehr'] = {
		['default'] = 'alih aksara Hans Wehr',
		},
	};


--[[--------------------------< E X P O R T S >----------------------------------------------------------------

The table handed to whatever loads this data module.

NOTE: an earlier 'special_tags_table = special_tags_table' entry was dropped.  No such table is defined in this
module, so the entry read an undefined global: it always evaluated to nil (the key was therefore never present
in the returned table) and would raise an error wherever the strict library is active.  Re-add the entry once a
local special_tags_table is actually defined above.

]]

return
	{
	this_wiki_lang_tag = this_wiki_lang_tag,									-- this wiki's language tag
	this_wiki_lang_dir = lang_obj:getDir(),										-- wiki's language direction
	
	article_name = article_name,
	lang_name_table = lang_name_table_t,
	override = override,
	rtl_scripts = rtl_scripts,
	translit_title_table = translit_title_table,
	};