跳转到内容

模組:Grc-translit

維基詞典,自由的多語言詞典

這個模組會將古希臘語的文字拉丁化(轉寫為拉丁字母)。

最好不要直接從模板或其他模組調用此模組。要從模板中使用它,請以{{xlit}}做為替代;若要在模組中使用,則以Module:languages#Language:transliterate替代。

關於測試用例,請參閱Module:Grc-translit/testcases

函數

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang. When the transliteration fails, returns nil.

3個測試失敗。 (刷新)

文字 預期 實際
testcases for tr function in Module:grc-translit:
Passed λόγος lógos lógos
Passed σφίγξ sphínx sphínx
Passed ϝάναξ wánax wánax
Passed οἷαι hoîai hoîai
current problems
Passed ΙΧΘΥΣ IKHTHUS IKHTHUS
Failed Υἱός '''Hu'''iós '''U'''hiós
u/y
Passed ταῦρος taûros taûros
Passed νηῦς nēûs nēûs
Passed σῦς sûs sûs
Passed ὗς hûs hûs
Passed γυῖον guîon guîon
Passed ἀναῡ̈τέω anaṻtéō anaṻtéō
Passed δαΐφρων daḯphrōn daḯphrōn
vowel length
Passed τῶν tôn tôn
Passed τοὶ toì toì
Passed τῷ tôi tôi
Passed τούτῳ toútōi toútōi
Passed σοφίᾳ sophíāi sophíāi
Failed μᾱ̆νός mānós mā̆nós
h (rough breathing)
Passed ὁ ho ho
Passed οἱ hoi hoi
Passed εὕρισκε heúriske heúriske
Passed ὑϊκός huïkós huïkós
Passed πυρρός purrhós purrhós
Passed ῥέω rhéō rhéō
Passed σάἁμον sáhamon sáhamon
capitals
Passed Ὀδυσσεύς Odusseús Odusseús
Passed Εἵλως Heílōs Heílōs
Passed ᾍδης Hā́idēs Hā́idēs
Passed ἡ Ἑλήνη hē Helḗnē hē Helḗnē
punctuation
Passed ἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή; ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ? ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?
Passed τί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν; tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín? tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?
Passed τούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω. toútōn phōnḗenta mén estin heptá; a e ē i o u ō. toútōn phōnḗenta mén estin heptá; a e ē i o u ō.
Failed πήγ(νῡμῐ) pḗg(nūmi) pḗg(nūmĭ)
HTML entities
Passed καλός καὶ ἀγαθός kalós kaì agathós kalós kaì agathós
Passed καλός καὶ ἀγαθός kalós kaì agathós kalós kaì agathós



local export = {}

local m_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

local tokenize = require("Module:grc-utilities").tokenize

local concat = table.concat
local insert = table.insert
local split = m_str_utils.split
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local uupper = m_str_utils.upper

-- Diacritics
-- `diacritic` is spliced into a ustring pattern in insert_translit, so it is
-- expected to be a pattern fragment matching any single diacritic.
-- `diacritics` is a lookup table from a diacritic's name to its string value
-- (presumably the combining character itself; confirm in
-- Module:grc-utilities/data).
local diacritic = m_data.diacritic
local diacritics = m_data.named

-- Greek
-- Individual Greek diacritics, used both as keys of the `tt` table and as
-- plain search strings on tokens.
local circumflex = diacritics.circum
local smooth = diacritics.smooth
local rough = diacritics.rough
local breve = diacritics.breve
local macron = diacritics.macron
local subscript = diacritics.subscript
-- ustring pattern tested against a token to decide whether a rough breathing
-- puts "h" before the token (vowel) or after it (anything else).
local vowel = m_data.vowel

-- Latin
-- Replacement emitted for the Greek circumflex in the transliteration.
local hat = diacritics.Latin_circum

-- ustring pattern: a token that starts with α and ends with the iota subscript.
local a_subscript = "^α.*" .. subscript .. "$"
-- U+037E GREEK QUESTION MARK; replaced by "?" in export.tr.
local question_mark = u(0x37E)
-- ustring character class: a γ immediately followed by one of these letters is
-- transliterated as "n" instead of "g".
local velar = "[γκξχϟϙ]"

-- Base Latin letters for the inherently long vowels. translit_long_vowel
-- appends the macron; do_translit uses this table directly (no macron) for
-- tokens that carry a breve.
local long_vowels = {
	["η"] = "e",
	["ω"] = "o",
}

-- Character-by-character transliteration table, applied by do_translit to each
-- UTF-8 character of a (lowercased) token. Characters that have no entry are
-- left as they are. η and ω are not listed here because do_translit converts
-- them first via `long_vowels`.
local tt = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",

	-- Other letters
	["ϛ"] = "st",
	["ϝ"] = "w",
	["ͱ"] = "h",
	["ϳ"] = "j",
	["ϟ"] = "q",
	["ϙ"] = "q",
	["ϻ"] = "ś",
	["ϸ"] = "š",
	["ϡ"] = "s",
	["ͳ"] = "s",
	["ͷ"] = "v",

	-- Incorrect characters: see [[Wiktionary:About Ancient Greek#Miscellaneous]].
	-- These are tracked by [[Module:script utilities]].
	["ϐ"] = "b",
	["ϑ"] = "th",
	["ϰ"] = "k",
	["ϱ"] = "r",
	["ϲ"] = "s",
	["ϕ"] = "ph",

	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- The breve is dropped: shortness is the unmarked case in the Latin output,
	-- so μᾱ̆νός gives "mānós" and πήγ(νῡμῐ) gives "pḗg(nūmi)".
	[breve] = "",
	-- Breathings are dropped here; export.tr adds the "h" for a rough breathing.
	[smooth] = "",
	[rough] = "",
	[circumflex] = hat,
	[subscript] = "i",
}

-- Finds the first token after index `i` that contains no bracket character.
-- Returns four values: the index of that token, the token itself (nil when the
-- list is exhausted), its lowercase form (nil likewise), and the concatenation
-- of all bracket tokens that were skipped on the way.
local function get_next_token(tokens, i)
	local first = i + 1
	local pos = first
	local candidate = tokens[pos]
	while candidate do
		-- Stop at the first token that is not a bracket: ( ) [ ] { }
		if not candidate:match("[()[%]{}]") then
			break
		end
		pos = pos + 1
		candidate = tokens[pos]
	end
	local lowered = candidate and ulower(candidate)
	local skipped = concat(tokens, nil, first, pos - 1)
	return pos, candidate, lowered, skipped
end

-- gsub callback: turns a character listed in `long_vowels` into its Latin base
-- letter followed by a macron; any other character is returned unchanged.
local function translit_long_vowel(ch)
	local base = long_vowels[ch]
	if base then
		return base .. macron
	end
	return ch
end

-- Transliterates one token in two passes over its UTF-8 characters:
--   1. η/ω become their Latin base letter, with a macron appended unless the
--      token contains a breve;
--   2. every character is looked up in `tt`.
-- Returns only the resulting string (the gsub counts are discarded).
local function do_translit(token)
	-- One UTF-8 character: a lead byte followed by any continuation bytes.
	local utf8_char = ".[\128-\191]*"
	local long_vowel_repl
	if token:find(breve) then
		long_vowel_repl = long_vowels
	else
		long_vowel_repl = translit_long_vowel
	end
	local first_pass = token:gsub(utf8_char, long_vowel_repl)
	local second_pass = first_pass:gsub(utf8_char, tt)
	return second_pass
end

-- gsub callback for a run of non-word characters: when the run contains the
-- Latin circumflex, every macron is stripped from it; otherwise the run is
-- returned untouched.
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	local stripped = m:gsub(macron, "")
	return stripped
end

-- Cleans up the transliteration of one token, applies capitalization, and
-- appends the result plus `suffix` to `output`.
--   output            list of output pieces (modified in place)
--   translit          transliteration of the current token
--   token             the current token as written
--   lower_token       lowercase form of the current token
--   next_token        the following token as written (may be nil)
--   next_token_lower  lowercase form of the following token (may be nil)
--   suffix            string appended after the transliteration
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse repeated diacritics until none are left (this shouldn't
	-- really happen).
	local dup_pattern = "(" .. diacritic .. ")(%W-)%1"
	while true do
		local collapsed, hits = ugsub(translit, dup_pattern, "%1%2")
		translit = collapsed
		if hits == 0 then
			break
		end
	end

	-- A vowel with a circumflex does not also get a macron.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)

	local cased
	if token == lower_token then
		-- Lowercase token: keep as is.
		cased = translit
	elseif next_token == next_token_lower then
		-- Capitalized token followed by a lowercase token (or by nothing):
		-- uppercase only the first character.
		cased = translit:gsub("^" .. ".[\128-\191]*", uupper)
	else
		-- The following token is capitalized as well: uppercase everything.
		cased = uupper(translit)
	end

	insert(output, cased .. suffix)
end

-- Transliterates `text` into Latin script and returns the result as a string.
--   text  the text to transliterate
--   lang  language code; "xbc" makes φ come out as "f" instead of "ph"
--   sc    script code (not used by this function)
function export.tr(text, lang, sc)
	-- A lone rough-breathing sign is transliterated directly.
	if text == "῾" then
		return "h"
	end

	-- Turn semicolons and Greek question marks into regular question marks,
	-- but leave HTML entities alone: split on entities, so the odd-numbered
	-- chunks are the text between them.
	local chunks = split(text, "(&#?%w+;)")
	for idx = 1, #chunks, 2 do
		local plain = chunks[idx]:gsub(";", "?")
		chunks[idx] = plain:gsub(question_mark, "?")
	end
	text = concat(chunks)

	-- The middle dot is equivalent to a semicolon or colon; semicolon is
	-- probably the more common reading.
	text = text:gsub("·", ";")

	local tokens = tokenize(text)

	-- Walk the tokens with one token of lookahead. `suffix` holds the bracket
	-- tokens skipped between the current token and the lookahead token.
	local ahead_pos, ahead, ahead_lower, suffix = get_next_token(tokens, 0)
	local output = {suffix}
	while ahead do
		local pos, token, lower_token = ahead_pos, ahead, ahead_lower
		local force_rough = false
		local latin = do_translit(lower_token)
		ahead_pos, ahead, ahead_lower, suffix = get_next_token(tokens, pos)

		if lower_token:find("γ") and ahead_lower and umatch(ahead_lower, velar) then
			-- γ before a velar is written <n>.
			latin = latin:gsub("g", "n")
		elseif lang == "xbc" and lower_token:find("φ") then
			latin = latin:gsub("ph", "f")
		elseif lower_token:find("ρ") then
			-- ρ after ρ is written <rh>: emit each ρ that is followed by
			-- another ρ, and flag the last one so that "h" gets appended.
			while ahead_lower and ahead_lower:find("ρ") do
				insert_translit(output, latin, token, lower_token, ahead, ahead_lower, suffix)
				pos, token, lower_token = ahead_pos, ahead, ahead_lower
				force_rough = true
				latin = do_translit(lower_token)
				ahead_pos, ahead, ahead_lower, suffix = get_next_token(tokens, pos)
			end
		elseif umatch(lower_token, a_subscript) then
			-- ᾳ gets a macron.
			latin = latin:gsub("a", "a" .. macron)
		end

		if force_rough or lower_token:find(rough) then
			if umatch(lower_token, vowel) then
				-- Rough breathing on a vowel: "h" goes in front.
				latin = "h" .. latin
			else
				-- Otherwise "h" goes after the last letter, unless that
				-- letter is already "h".
				local last_letter = umatch(latin, "(%w)%W*$")
				if last_letter and last_letter ~= "h" then
					latin = latin .. "h"
				end
			end
		end

		insert_translit(output, latin, token, lower_token, ahead, ahead_lower, suffix)
	end

	return concat(output)
end

return export