Jump to content

Module:grc-translit

From Wiktionary, the free dictionary

This module transliterates Ancient Greek text per WT:GRC TR. It is also used to transliterate Demotic, Greek, Paeonian, Old Ossetic, Oscan, Dacian, Alanic, Ancient Macedonian, and Phrygian. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

2 of 36 tests failed. (refresh)

Status | Text | Expected | Actual
testcases for tr function in Module:grc-translit:
Passedλόγοςlógoslógos
Passedσφίγξsphínxsphínx
Passedϝάναξwánaxwánax
Passedοἷαιhoîaihoîai
current problems
Failed | ΙΧΘΥΣ | IKHTHUS | IKhThUS
Failed | Υἱός | '''Hu'''iós | '''U'''hiós
u/y
Passedταῦροςtaûrostaûros
Passedνηῦςnēûsnēûs
Passedσῦςsûssûs
Passedὗςhûshûs
Passedγυῖονguîonguîon
Passedἀναῡ̈τέωanaṻtéōanaṻtéō
Passedδαΐφρωνdaḯphrōndaḯphrōn
vowel length
Passedτῶνtôntôn
Passedτοὶtoìtoì
Passedτῷtôitôi
Passedτούτῳtoútōitoútōi
Passedσοφίᾳsophíāisophíāi
Passedμᾱ̆νόςmānósmānós
h (rough breathing)
Passedhoho
Passedοἱhoihoi
Passedεὕρισκεheúriskeheúriske
Passedὑϊκόςhuïkóshuïkós
Passedπυρρόςpurrhóspurrhós
Passedῥέωrhéōrhéō
Passedσάἁμονsáhamonsáhamon
capitals
PassedὈδυσσεύςOdusseúsOdusseús
PassedΕἵλωςHeílōsHeílōs
PassedᾍδηςHā́idēsHā́idēs
Passedἡ Ἑλήνηhē Helḗnēhē Helḗnē
punctuation
Passedἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?
Passedτί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?
Passedτούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.
Passedπήγ(νῡμῐ)pḗg(nūmi)pḗg(nūmi)
HTML entities
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós

local export = {}

local m_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

local tokenize = require('Module:grc-utilities').tokenize

local ufind = m_str_utils.find
local ugsub = m_str_utils.gsub
local U = m_str_utils.char
local ulower = m_str_utils.lower
local uupper = m_str_utils.upper

-- Diacritic characters, keyed by name, taken from the grc-utilities data module.
local diacritics = m_data.named

-- Greek diacritics, bound to short local names for use below.
local acute, grave = diacritics.acute, diacritics.grave
local circumflex, diaeresis = diacritics.circum, diacritics.diaeresis
local smooth, rough = diacritics.smooth, diacritics.rough
local macron, breve = diacritics.macron, diacritics.breve
local subscript = diacritics.subscript

-- Latin circumflex; the Greek circumflex is mapped to it in the output.
local hat = diacritics.Latin_circum

-- Pattern: a macron, then an optional diaeresis, then the Latin circumflex.
local macron_diaeresis = macron .. diaeresis .. "?" .. hat
-- Pattern: a token that begins with alpha (either case) and ends with the
-- iota subscript.
local a_subscript = "^[αΑ].*" .. subscript .. "$"
-- Letters before which γ is transliterated as <n>.
local velar = "κγχξ"

-- Character-by-character transliteration map. It is applied to the lowercased
-- form of each token; characters without an entry are left as they are.
local tt = {
	-- Vowels (η and ω carry a macron in the output)
	["α"] = "a", ["ε"] = "e", ["η"] = "e" .. macron, ["ι"] = "i",
	["ο"] = "o", ["υ"] = "u", ["ω"] = "o" .. macron,

	-- Consonants
	["β"] = "b", ["γ"] = "g", ["δ"] = "d", ["ζ"] = "z", ["θ"] = "th", ["κ"] = "k",
	["λ"] = "l", ["μ"] = "m", ["ν"] = "n", ["ξ"] = "x", ["π"] = "p", ["ρ"] = "r",
	["σ"] = "s", ["ς"] = "s", ["τ"] = "t", ["φ"] = "ph", ["χ"] = "kh", ["ψ"] = "ps",

	-- Archaic letters
	["ϛ"] = "st", ["ϝ"] = "w", ["ϻ"] = "ś", ["ϙ"] = "q", ["ϡ"] = "š", ["ͷ"] = "v",

	-- Incorrect characters: see [[Wiktionary:About Ancient Greek#Miscellaneous]].
	-- These are tracked by [[Module:script utilities]].
	["ϐ"] = "b", ["ϑ"] = "th", ["ϰ"] = "k", ["ϱ"] = "r", ["ϲ"] = "s", ["ϕ"] = "ph",

	-- Diacritics. Macron, diaeresis, grave and acute have no entry here and
	-- therefore pass through unchanged.
	[breve] = "", [smooth] = "", [rough] = "",
	[circumflex] = hat,
	[subscript] = "i",
}

-- Transliterates Greek-script `text` into Latin script.
--
-- The text is split into tokens with `tokenize`. Each token is lowercased and
-- mapped character by character through `tt`, then adjusted by the context
-- rules below (nasal gamma, double rho, iota subscript on alpha, rough
-- breathing, circumflex over a long vowel). If the source token contained an
-- uppercase letter, the first character of its transliteration is
-- capitalized.
--
-- Parameters:
--   text  the string to transliterate
--   lang  language code (not used in this function)
--   sc    script code (not used in this function)
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	-- A lone rough-breathing sign is transliterated as a bare <h>.
	if text == '῾' then
		return 'h'
	end
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	]]
	text = ugsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	text = text:gsub("·", ";")
	
	local tokens = tokenize(text)
	
	-- Now read the tokens. ipairs is used rather than pairs because the
	-- pieces are appended in visiting order, so the traversal has to be
	-- guaranteed to run from the first token to the last.
	local output = {}
	for i, token in ipairs(tokens) do
		-- Lowercase once; reused for the table lookup, the nasal-gamma test
		-- and the capitalization check.
		local lower_token = ulower(token)
		
		-- Substitute each (UTF-8) character for its transliteration.
		local translit = lower_token:gsub(".[\128-\191]*", tt)
		
		local next_token = tokens[i + 1]
		
		if lower_token == 'γ' and next_token and velar:find(ulower(next_token), 1, true) then
			-- γ before a velar should be <n>. Both sides are compared in
			-- lowercase so that capitalized text (e.g. ΑΓΓΕΛΟΣ) is handled
			-- as well.
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif ufind(token, a_subscript) then
			-- add macron to ᾳ
			translit = ugsub(translit, '([aA])', '%1' .. macron)
		end
		
		if token:find(rough) then
			if ufind(token, '^[Ρρ]') then
				-- Aspirated rho: the <h> follows the letter.
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if ufind(translit, macron_diaeresis) then
			translit = translit:gsub(macron, '')
		end
		
		-- Capitalize first character of transliteration.
		if token ~= lower_token then
			translit = translit:gsub("^" .. ".[\128-\191]*", uupper)
		end
		
		output[#output + 1] = translit
	end
	
	return table.concat(output)
end

return export