Module:grc-translit


This module transliterates Ancient Greek text per WT:GRC TR. It is also used to transliterate Proto-Brythonic, Gaulish, Messapic, Eteocretan, Demotic, Greek, Paeonian, Pre-Samnite, Oscan, Sicel, Thracian, Bactrian, Dacian, Galatian, Alanic, Elymian, Old Median, Ancient Macedonian, Phrygian, and Punic. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

2 of 57 tests failed. (refresh)

Text | Expected | Actual
testcases for tr function in Module:grc-translit:
Passedλόγοςlógoslógos
Passedοἷαιhoîaihoîai
velar
Passedἄγγελοςángelosángelos
Passedἔγκειμαιénkeimaiénkeimai
Passedσφίγξsphínxsphínx
Passedτυγχάνωtunkhánōtunkhánō
PassedἈγϙυλίωνAnqulíōnAnqulíōn
archaic letters
PassedϘόρῐνθοϻQórĭnthosQórĭnthos
Passedϝάναξwánaxwánax
Passedἀρκͱᾱγέταςarkhāgétasarkhāgétas
Passed*-ϳω*-jō*-jō
current problems
FailedΥἱός'''Hu'''iós'''U'''hiós
u/y
Passedταῦροςtaûrostaûros
Passedνηῦςnēûsnēûs
Passedσῦςsûssûs
Passedὗςhûshûs
Passedγυῖονguîonguîon
Passedἀναῡ̈τέωanaṻtéōanaṻtéō
Passedδαΐφρωνdaḯphrōndaḯphrōn
Passedπρηῠ́ςprēŭ́sprēŭ́s
vowel length
Passedτῶνtôntôn
Passedτοὶtoìtoì
Passedτῷtōîtōî
Passedτούτῳtoútōitoútōi
Passedσοφίᾳsophíāisophíāi
PassedΘρᾴκηThrāíkēThrāíkē
Passedπροσηύδᾱprosēúdāprosēúdā
PassedΚαῖσᾰρKaîsărKaîsăr
Passedᾰ̓γᾰ́πηăgắpēăgắpē
Passedμᾱ̆νόςmā̆nósmā̆nós
Failedὑπόγυͅον hupógūionhupógūion
breathing
Passedhoho
Passedοἱhoihoi
Passedεὕρισκεheúriskeheúriske
Passedὑϊκόςhuïkóshuïkós
Passedπυρρόςpurrhóspurrhós
Passedῥέωrhéōrhéō
Passedῤάριονrárionrárion
PassedΡ̓ᾶροςRârosRâros
Passedσάἁμονsáhamonsáhamon
Passedϝ̔έwhéwhé
Passedμύῤῥᾱmúrrhāmúrrhā
Passed**ἔῥῥευσᾰ**érhrheusă**érhrheusă
Passed**Βοῤῤᾶς**Borrâs**Borrâs
capitals
PassedὈδυσσεύςOdusseúsOdusseús
PassedΕἵλωςHeílōsHeílōs
PassedᾍδηςHāídēsHāídēs
Passedἡ Ἑλήνηhē Helḗnēhē Helḗnē
PassedΙΧΘΥΣIKHTHUSIKHTHUS
punctuation
Passedἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?
Passedτί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;tí tēnikáde aphîxai, ô Krítōn? ḕ ou prōì éti estín?tí tēnikáde aphîxai, ô Krítōn? ḕ ou prōì éti estín?
Passedτούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.
Passedπήγ(νῡμῐ)pḗg(nūmĭ)pḗg(nūmĭ)
Passedἄ(γ)γελοςá(n)gelosá(n)gelos
Passedἄγκυρ(ρ)αánkur(rh)aánkur(rh)a
HTML entities
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós

-- Table of functions exported by this module.
local export = {}

-- Modules this file depends on.
local m_grc_utils = require("Module:grc-utilities")
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

-- Helpers taken from Module:grc-utilities.
local canonicalize, tokenize = m_grc_utils.canonicalize, m_grc_utils.tokenize

-- Shortcuts into the standard table library.
local concat, insert = table.concat, table.insert

-- String helpers; all but umatch come from Module:string utilities.
local split, u = m_str_utils.split, m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower, uupper = m_str_utils.lower, m_str_utils.upper
local umatch = mw.ustring.match

-- Diacritic data shared with the other grc modules.
local diacritic, diacritics = m_grc_utils_data.diacritic, m_grc_utils_data.diacritics
local vowel = m_grc_utils_data.vowel

-- Individual Greek diacritics.
local acute, grave, circumflex = diacritics.acute, diacritics.grave, diacritics.circum
local smooth, rough = diacritics.smooth, diacritics.rough
local breve, macron = diacritics.breve, diacritics.macron
local subscript = diacritics.subscript

-- Circumflex used in the Latin-script output.
local hat = diacritics.Latin_circum

-- Matches a token that begins with α or υ and ends with an iota subscript.
local au_subscript = "^[αυ].*" .. subscript .. "$"
-- U+037E, the Greek question mark.
local question_mark = u(0x37E)
-- Letters before which γ is transliterated as "n".
local velar = "[γκξχϙ]"

-- Letters that translit_letter maps to a base Latin vowel followed by a
-- macron. The macron is left off when the letter's trailing diacritics
-- contain a breve.
local long_vowels = { -- Macron will be added.
	["η"] = "e",
	["ω"] = "o",
}

-- Replacement table used by translit_letter: looked up directly for the base
-- letter of each match, and passed to gsub as the replacement table for the
-- diacritics that trail it. Characters with no entry here are kept as they are.
local tt = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",

	-- Other letters
	["ϛ"] = "st",
	["ϝ"] = "w",
	["ͱ"] = "h",
	["ϳ"] = "j",
	["ϙ"] = "q",
	["ϻ"] = "s",
	["ϸ"] = "š",
	["ͳ"] = "s",
	--["ͷ"] = "v", Differs by dialect, so it is deliberately left unmapped.

	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- Breathings are deleted here; export.tr adds "h" for rough breathing itself.
	[smooth] = "",
	[rough] = "",
	[circumflex] = hat,
	[subscript] = "i",
}

--- Finds the next token after position `i`, stepping over bracket tokens.
-- A token counts as a bracket token if it contains any of ( ) [ ] { }.
-- @param tokens  array of token strings
-- @param i       index of the current token (0 to start from the beginning)
-- @return index of the token found (one past the end if there is none)
-- @return the token itself, or nil when the tokens are exhausted
-- @return the lowercased token, or nil
-- @return the skipped bracket tokens concatenated into one string
local function get_next_token(tokens, i)
	local first_skipped = i + 1
	local pos = i
	local token
	repeat
		pos = pos + 1
		token = tokens[pos]
	until not (token and token:match("[()[%]{}]"))
	local lowered = token and ulower(token)
	local skipped = concat(tokens, nil, first_skipped, pos - 1)
	return pos, token, lowered, skipped
end

--- Transliterates one character together with the characters that trail it.
-- @param letter  the base character
-- @param trail   the characters following it within the same match
-- @return the transliterated base followed by the transliterated trail
local function translit_letter(letter, trail)
	local base = long_vowels[letter]
	if base then
		-- η and ω get a macron unless they are explicitly marked with a breve.
		if not trail:find(breve) then
			base = base .. macron
		end
	else
		-- Anything without a mapping is passed through unchanged.
		base = tt[letter] or letter
	end
	-- Map each UTF-8 character of the trail through tt; unmapped ones stay as is.
	local marks = trail:gsub(".[\128-\191]*", tt)
	return base .. marks
end

--- Transliterates a single token letter by letter.
-- Each character and the run of non-alphanumeric characters after it are
-- handed to translit_letter. Callers in this file pass the lowercased token.
-- @param token  the token to transliterate
-- @return the transliterated string (exactly one value)
local function do_translit(token)
	-- Put iota subscript before accent marks, so that they appear on "i".
	token = ugsub(token, "([" .. acute .. grave .. circumflex .. "]+)" .. subscript, subscript .. "%1")
	-- ugsub also returns a replacement count; the parentheses drop it so that
	-- only the string is returned and the count cannot leak into a caller's
	-- argument list or multiple assignment.
	return (ugsub(token, "(.)(%W*)", translit_letter))
end

--- Strips macrons from a run of characters that also contains a Latin circumflex.
-- @param m  the matched run
-- @return `m` without macrons if it contains a circumflex, otherwise `m` unchanged
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	local without_macron = m:gsub(macron, "")
	return without_macron
end

--- Cleans up a token's transliteration, applies capitalization, and appends it
-- (followed by `suffix`) to `output`.
-- @param output            array collecting the pieces of the result
-- @param translit          transliteration of the current token
-- @param token             the current token in its original case
-- @param lower_token       the current token lowercased
-- @param next_token        the following token in its original case, or nil
-- @param next_token_lower  the following token lowercased, or nil
-- @param suffix            text to append after the transliteration
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse repeated diacritics until none are left (this shouldn't really happen).
	local duplicate = "(" .. diacritic .. ")(%W-)%1"
	while true do
		local replaced
		translit, replaced = ugsub(translit, duplicate, "%1%2")
		if replaced == 0 then
			break
		end
	end
	-- A vowel carrying a circumflex does not also keep its macron.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)

	local cased
	if token == lower_token then
		-- The token was not capitalized: leave the transliteration alone.
		cased = translit
	elseif next_token == next_token_lower then
		-- Capitalized token followed by a non-capitalized one (or by nothing):
		-- uppercase only the first character.
		cased = translit:gsub("^" .. ".[\128-\191]*", uupper)
	else
		-- The following token is capitalized as well: uppercase everything.
		cased = uupper(translit)
	end
	insert(output, cased .. suffix)
end

--- Transliterates Greek-script text into Latin script.
-- The text is canonicalized, split into tokens, and each token is
-- transliterated with one token of look-ahead for context-dependent rules
-- (γ before a velar, doubled ρ, rough breathing, capitalization).
-- @param text  the text to transliterate
-- @param lang  language code; only "xbc" changes the behaviour here (φ becomes "f")
-- @param sc    script code; not referenced in this function body
-- @return the transliterated string. No code path in this function returns nil.
function export.tr(text, lang, sc)
	-- A lone rough-breathing sign is transliterated as "h".
	if text == "῾" then
		return "h"
	end
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except any that occur in HTML entities. Use split to separate out the
		chunks between any entities. The loop below steps by 2, so only the
		odd-numbered chunks (the text between entities) are modified.
	]]
	text = split(canonicalize(text), "(&#?%w+;)")
	for i = 1, #text, 2 do
		text[i] = text[i]:gsub(";", "?"):gsub(question_mark, "?")
	end
	text = concat(text)

	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	-- This runs after the question-mark replacement above, so the new ";" is kept.
	text = text:gsub("·", ";")

	local tokens = tokenize(text)

	-- Read the tokens one at a time, always holding the following token as
	-- look-ahead. `suffix` is the bracket text skipped between the two tokens;
	-- any brackets before the first token start off the output.
	local next_i, next_token, next_token_lower, suffix = get_next_token(tokens, 0)
	local output = {suffix}
	while next_token do
		-- is_rough starts out nil; it is only set to true in the ρ loop below.
		local i, token, lower_token, is_rough = next_i, next_token, next_token_lower
		local translit = do_translit(lower_token)
		next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)

		-- γ before a velar should be <n>
		if lower_token:find("γ") and next_token_lower and umatch(next_token_lower, velar) then
			translit = translit:gsub("g", "n")
		-- For language code "xbc" (presumably Bactrian), φ is <f> rather than <ph>.
		elseif lang == "xbc" and lower_token:find("φ") then
			translit = translit:gsub("ph", "f")
		-- Lowercase ρ with an explicit breathing. These compare against `token`,
		-- not `lower_token`, so a capital rho falls through to the branch below.
		elseif token == "ρ"..rough then
			translit = "rh"
		elseif token == "ρ"..smooth then
			translit = "r"
		-- ρ after ρ should be <rh>
		elseif lower_token:find("ρ") then
			-- Keep adding ρs until they run out. Set is_rough, so that "h" will get appended.
			-- Each pass emits the current ρ and advances to the next one, so the
			-- last ρ of the run is the one that receives the "h" further down.
			while next_token_lower and next_token_lower:find("ρ") do
				insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
				i, token, lower_token, is_rough = next_i, next_token, next_token_lower, true
				translit = do_translit(lower_token)
				next_i, next_token, next_token_lower, suffix = get_next_token(tokens, i)
			end
		-- Add a macron to the vowel of a token that starts with α or υ and ends
		-- with an iota subscript (e.g. ᾳ).
		elseif umatch(lower_token, au_subscript) then
			translit = translit:gsub("[au]", "%0" .. macron)
		end

		-- Rough breathing: "h" goes before a token containing a vowel and after
		-- any other token, unless its last alphanumeric character is already "h".
		if is_rough or lower_token:find(rough) then
			if umatch(lower_token, vowel) then
				translit = "h" .. translit
			else
				local final = umatch(translit, "(%w)%W*$")
				if final and final ~= "h" then
					translit = translit .. "h"
				end
			end
		end

		insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	end

	return concat(output)
end

return export
Category:Alanic modules Category:Ancient Greek modules Category:Ancient Macedonian modules Category:Bactrian modules Category:Dacian modules Category:Demotic modules Category:Elymian modules Category:Eteocretan modules Category:Failing testcase modules Category:Galatian modules Category:Gaulish modules Category:Greek modules Category:Messapic modules Category:Old Median modules Category:Oscan modules Category:Paeonian modules Category:Phrygian modules Category:Pre-Samnite modules Category:Proto-Brythonic modules Category:Punic modules Category:Sicel modules Category:Thracian modules Category:Transliteration modules Category:Transliteration modules used by 21 languages