Modul:fa-cls-translit

A modult a Modul:fa-cls-translit/doc lapon tudod dokumentálni
-- Authors: Sameerhameedy

-- Local aliases for the Scribunto Unicode-aware string functions used
-- throughout this module, plus the table that is returned to callers.
local U = mw.ustring.char
local gsub = mw.ustring.gsub
local export = {}

-- Diacritics and control characters. Values built with U() are given by
-- code point; the "mapped to" notes refer to the `mapping` table in this file.
local fatHataan = U(0x64B) -- اً, tanvin-e nasb (تنوین نصب)
local Dammataan = U(0x64C) -- un
local kasrataan = U(0x64D) -- in
local zabar = U(0x64E) -- short vowel mark, mapped to "a"
local zer = U(0x650) -- short vowel mark, mapped to "i"
local pesh = U(0x64F) -- short vowel mark, mapped to "u"
local tashdid = U(0x651) -- also called shadda
local jazm = "ْ" -- also called sukun; mapped to the empty string
local he = "ه"
local zwnj = U(0x200C) -- zero-width non-joiner, mapped to "-"
local highhmz = U(0x654) -- mapped to "-yi"
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
-- Balti consonant letters; the trailing Latin "ǩ" is the placeholder that
-- before_diacritic_checking_subs substitutes for kaf + highhmz.
local balticons = "ڃڇڑڗݜݨݩǩ"

-- Character lists meant to be spliced into Lua pattern classes ("[" .. x .. "]").
-- `consonants` leaves out the semivowel letters و and ی.
local consonants = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticons
local consonants2 = "ءبپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنوؤهیئyw" .. balticons -- including semivowels
local vowels = "āēīōū" -- Latin long vowels that export.tr inserts while processing
local semivowel = "یو"
local hes = "هح" -- not referenced anywhere in the visible part of this file
local diacritics = "َُِّْٰ"
local ZZP = "َُِ" -- NOTE(review): presumably zabar/zer/pesh, the three short vowel marks — confirm
local alif_wasla = "ٱ"
-- Characters treated like a space between words: whitespace, apostrophe and
-- double quote; the class form additionally accepts ZWNJ.
local space_like = "%s'" .. '"'
local space_like_class = "[" .. space_like .. zwnj .. "]"

--- The characters ټ ٹ ډ ڈ ے are included only for Mughal Persian and Hazaragi. 

-- Per-character transliteration table. export.tr applies it in one pass
-- (gsub over every character) after all context-sensitive rewriting is done;
-- characters with no entry here are left unchanged by that pass.
local mapping = {
	["آ"] = 'ā', ["ب"] = 'b', ["پ"] = 'p', ["ت"] = 't', ["ث"] = 's',
	["ج"] = 'j', ["چ"] = 'č', ["ح"] = 'h', ["خ"] = 'x', 
	["د"] = 'd', ["ذ"] = 'z', ["ر"] = 'r', ["ز"] = 'z', ["ژ"] = 'ž',
	["س"] = 's', ["ش"] = 'š', ["ص"] = 's', ["ض"] = 'z', 
	["ط"] = 't', ["ظ"] = 'z', ["غ"] = 'ğ', ["ف"] = 'f', ["ق"] = 'q',
	["ک"] = 'k', ["گ"] = 'g',
	["ل"] = 'l', ["م"] = 'm', ["ن"] = 'n', 
	-- Fallback values for the semivowel letters; export.tr rewrites most
	-- occurrences (to w/y or another long vowel) before this table is applied.
	["و"] = 'ō', ["ی"] = 'ē', ["۔"] = ".",

	["ه"] = "h",
	
	-- ain and the hamza carriers all become an apostrophe
	["ع"] = "'",
	["ء"] = "'",
	["ئ"] = "'", 
	["ؤ"] = "'",
	["أ"] = "'",
	
	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	[highhmz] = "-yi",
	
	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",
	
	-- kashida
	["ـ"] = "-", -- kashida, no sound
	
	-- alif_wasla
	[alif_wasla] = "",	-- nothing	

	-- numerals
	["۱"] = "1", ["۲"] = "2", ["۳"] = "3", ["۴"] = "4", ["۵"] = "5",
	["۶"] = "6", ["۷"] = "7", ["۸"] = "8", ["۹"] = "9", ["۰"] = "0",
	
	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = '“', -- quotation mark
	["»"] = '”', -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousand
	
	-- regional characters (FOR VERY SPECIFIC USECASES)
	["ټ"] = "ṭ", ["ٹ"] = "ṭ", ["ډ"] = "ḍ", ["ڈ"] = "ḍ",
	-- balti
	-- cant do anything about ژ because it conflicts with persian
	["ڃ"]= "ž",
	["ڇ"] = "č̣",
	["ڑ"] = "ṛ",
	["ڗ"] = "dz",
	["ݜ"] = "ṣ",
	["ݨ"] = "ng",
	["ݩ"] = "ny",
	["ھ"] = "h",
	["ے"] = "e",
}

-- Punctuation and digit lists, written so they can be spliced into a Lua
-- pattern class: the parentheses and square brackets are %-escaped.
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".\'!»٪؉۔`,/"
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name in the substitution rules.
local ain = 'ع' -- not referenced anywhere in the visible part of this file
local alif = 'ا'
local malif = "آ" -- alif madda
local hamza = "ء" -- not referenced anywhere in the visible part of this file
local ye = 'ی'
local ye2 = 'ئ'
local vao = "و"
local dagger_alif = U(0x670)
local marbuta = U(0x629)
local te = "ت"
local ye3 = "ے"
local laam = "ل"
-- Pattern class matching anything treated as a vowel: the Latin long vowels,
-- the ZZP marks, jazm, the semivowel letters and alif madda.
local vowel = "[" .. vowels .. ZZP .. jazm .. semivowel .. malif .."]"
-- Letters after which export.tr assimilates the "l" of a preceding alif-laam.
local sun_letters = "تثدذرزسشصضطظلن"

-- Ordered list of {pattern, replacement} pairs that export.tr applies to the
-- input, one after another, before testing whether the text is vocalized.
-- Later entries see the output of earlier ones, so the order is significant.
local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	{U(0x06E5), "و"},
	{U(0x06E6), "ی"},
	{ alif_wasla, ""},
	{ 'ہ', he}, -- get rid of balti he (allows balti to transliterate) 
	{ 'ک' .. highhmz, "ǩ"},
	-- put tashdid before any vowel mark that precedes it
	{"([" .. fatHataan .. ZZP .. dagger_alif .. "])" .. tashdid, tashdid .. "%1"},
	{ alif .. fatHataan, zabar .. "ن"},
	{ fatHataan .. alif, zabar .. "ن"},
	{ jazm .. ye .. dagger_alif , jazm .. ye.. zabar .. alif},
	{ zabar .. ye .. dagger_alif , zabar .. alif},
	{ ye .. dagger_alif , zabar .. alif}, -- the first letter is U+06CC
	{ ye3 , ye},
	{ "[أإ]" , ye2},
	{ zabar .. dagger_alif, zabar .. alif},
	{ dagger_alif, zabar .. alif},
	-- tanvin marks are spelled out as short vowel + ن
	{ fatHataan, zabar .. "ن"}, -- fatḥatan
	{ Dammataan, pesh .. "ن"}, -- ḍammatan
	{ kasrataan, zer .. "ن"}, -- kasratan
	-- marbuta becomes te when followed by a vowel mark or jazm, otherwise he
	{ marbuta .. "([" .. ZZP .. "])" .. alif, te .. "%1-"},
	{ marbuta .. "([" .. ZZP .. jazm .. "])", te .. "%1"},
	{ marbuta , he},
    
    -- allah ligatures and arabic al
   	{"([" .. consonants2 .. "][".. ZZP .."])(" .. space_like_class .. ")" .. alif .. laam .. "([" .. jazm .. laam .. "])" , "%1%2" .. laam .. "%3" },
	{ laam .. laam .. tashdid , laam .. tashdid},
	-- use jazm/sukoon to prevent this conversion
	-- (the three rules below drop the vao that follows خ in these contexts)
	{ "(خ)" .. vao .. zabar .. alif , "%1" .. zabar .. alif},
	{ "(خ)" .. vao .. zabar , "%1" .. pesh},
	{ "(خ)" .. vao .. ye .. "([^" .. ZZP .. jazm .. "])" , "%1" .. ye .. "%2"},
	-- izāfa
	{ zwnj, "-"}, 
	{ jazm .. alif, "-"},-- vowel killing, invisible ZWNJ
	{ zabar .. jazm, "-"},-- vowel killing, invisible ZWNJ
}

-- Ordered list of {pattern, replacement} pairs used by has_diacritics().
-- Each rule removes material that needs no further vowel marking; the text
-- counts as fully vocalized only if nothing is left after the last rule.
-- Rules run in sequence on each other's output, so the order is significant.
local has_diacritics_subs = {
	-- remove punctuation and tashdid
	{ "[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .."]", ""},
	-- a consonant at the end of the text, before a space-like character or
	-- before a hyphen needs no vowel mark
	{"[" .. consonants .. "]$", ""},
	{"[" .. consonants .. "](" .. space_like_class .. ")", "%1"},
	{"[" .. consonants .. "]%-", "-"},
	-- these are required for arabic al- to work
	{"[" .. consonants2 .. "]" .. "([".. zer .. pesh .."])" .. alif .. laam , laam },
	{"[" .. consonants2 .. "]([".. zer .. pesh .."])%-" .. alif .. laam , laam },
	-- remove CV pairs
	 -- consonants paired to alif
	{ "[" .. consonants2 .. "]" .. jazm , ""},
	{ "[" .. consonants2 .. "]" .. jazm .. malif, ""},
	{ "[" .. consonants2 .. "]" .. zabar .. alif, ""},
	 -- consonants paired to a semivowel
	{ "[" .. consonants .. alif .. "][" .. semivowel .. ZZP .. "]([" .. semivowel .. "])([" .. semivowel .. "])", "%1%2"},
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "][" .. semivowel .. "]", ""},
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. jazm .. semivowel .. "]", ""},
	{ "[" .. alif .. consonants2 .. "][" .. ZZP .. "][" .. semivowel .. "]", ""},
	{ malif , ""}, -- counts as a CV pair
	{ jazm .. alif .. "[" .. ZZP .. "]", ""},
	{ "[" .. consonants2 .. alif .."][" .. ZZP .. "]", ""},
	{ "[" .. consonants2 .. alif .. semivowel .. "][" .. semivowel .. "]", ""},
	-- remove numbers, hamzatu l-waṣl, alif madda and ZWNJ
	{ "[" .. numbers .. "ٱ" .. "آ" .. "]", ""},
	-- finally drop whitespace, hyphens, leftover semivowel letters and anything
	-- matched by the `vowel` class
	{ "%s", ""},
	{ "%-", ""},
	{ "[" .. semivowel .. "]", ""},
	{ "(" .. vowel .. ")", ""},
}
 
--- Reports whether `text` is vocalized well enough to be transliterated.
-- Directional marks (LRM/RLM) are stripped first, and their presence is
-- recorded through Module:debug tracking. Every rule in has_diacritics_subs
-- is then applied in order; the text passes only when nothing is left over.
-- @param text string to examine (already normalized by the caller)
-- @return true if the substitutions reduce the text to the empty string
local function has_diacritics(text)
	local remainder, mark_count = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if mark_count > 0 then
		require("Module:debug").track("fa-translit/lrm or rlm")
	end
	for i = 1, #has_diacritics_subs do
		local rule = has_diacritics_subs[i]
		-- each rule is a {pattern, replacement} pair
		remainder = gsub(remainder, rule[1], rule[2])
	end
	return remainder == ""
end

--- Transliterates vocalized Persian-script text into Latin script.
--
-- Can be called either directly, as tr(text, lang, sc), or with a frame-like
-- table whose `args` hold: [1] text, [2] lang, [3] sc, [5] force flag.
-- args[4] is accepted for positional compatibility but is not used here.
-- Empty-string arguments are treated as absent.
--
-- @param text string to transliterate, or a frame-like table with `args`
-- @param lang language code (not used by this function)
-- @param sc script code (not used by this function)
-- @return the transliteration, or nil when the text lacks the diacritics
--         required by has_diacritics() and transliteration is not forced
function export.tr(text, lang, sc)
	-- Must be local: as a global, a value set by one frame-style call would
	-- persist and silently disable the diacritics check for later calls.
	local force_translit
	if type(text) == "table" then
		local function f(x) return (x ~= "") and x or nil end
		local args = text.args
		text, lang, sc, force_translit = f(args[1]), f(args[2]), f(args[3]), f(args[5])
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("fa-translit/lacking diacritics")
		return nil
	end

	-- define the "end" of a word
	-- A literal "#" is parked as "HASHTAG" so that "#" can serve as the
	-- word-boundary marker until it is stripped again near the end.
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, "^", "#")
	text = gsub(text, "$", "#")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "%s", "# #")
	-- NOTE(review): the %s substitution above appears to have already replaced
	-- every newline with "# #", so this line likely never matches — confirm.
	text = gsub(text, "\n" , "#".."\n" .. "#")
	text = gsub(text, "(["..punctuation.."])" , "#".."%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, "%-", "#-#")
	-- hashtags now mark the beginning and end of a word
	-- character reformatting and exceptions
	text = gsub(text, highhmz, "#"..highhmz.."#")
	text = gsub(text, "#" .. vao .. "#", "#u#")
	-- prevent izafa from converting until later

	-- Tashdeed: double the consonant (or emit yy/ww for the semivowels)
	text = gsub(text, '([' .. consonants .. '])' .. tashdid, "%1%1")
	text = gsub(text, '([' .. consonants .. '])' .. tashdid .. '([' .. ZZP .. '])', "%1%1%2")
	text = gsub(text, '([' .. consonants .. '])' .. '([' .. ZZP .. '])' .. tashdid, "%1%1%2")
	text = gsub(text, ye .. '([' .. ZZP .. '])' .. tashdid, "yy%1")
	text = gsub(text, vao .. '([' .. ZZP .. '])' .. tashdid, "ww%1")
	text = gsub(text, ye .. tashdid .. '([' .. ZZP .. '])', "yy%1")
	text = gsub(text, vao .. tashdid .. '([' .. ZZP .. '])', "ww%1")

	-- distinguish initial alif from vowel alif
	text = gsub(text, "(["..consonants2.."])" .. zabar .. alif, "%1ā")
	text = gsub(text, "(["..consonants2.."])" .. alif, "%1ā")
	text = gsub(text, jazm .. malif, "'ā")-- invisible ZWNJ
	text = gsub(text, "(["..consonants2.."])" .. malif, "%1'ā")
	text = gsub(text, alif..ye, "ē")
	text = gsub(text, alif..vao, "ō")
	text = gsub(text, alif..zer..ye, "ī")
	text = gsub(text, alif..pesh..vao, "ū")
	text = gsub(text, tashdid .. alif, tashdid .. "ā")

	-- convert semi vowels
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao.. "ā", "wā")
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "w%1")
	text = gsub(text, ye.. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, ye .. "(["..semivowel.."])(["..semivowel.."])", "ē%1%2")
	text = gsub(text, vao .. "(["..semivowel.."])(["..semivowel.."])", "ō%1%2")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. ye .. "(["..semivowel.."])", "%1y%2")
	text = gsub(text, "(["..diacritics..ZZP.."])" .. vao .. "(["..semivowel.."])", "%1w%2")
	text = gsub(text, "(["..consonants.."])" .. ye .. "(["..semivowel.."])", "%1y%2")
	text = gsub(text, "(["..consonants.."])" .. vao .. "(["..semivowel.."])", "%1w%2")

	-- conversions for vaav/waaw/vao
	text = gsub(text, pesh.. vao, "ū")
	text = gsub(text, vao.. "(["..diacritics..ZZP.."])", "w%1")
	text = gsub(text, "(" .. vowel .. ")" .. vao, "%1w")
	-- conversions for ye
	text = gsub(text, zer.. ye, "ī")
	text = gsub(text, ye .. "(["..diacritics..ZZP.."])", "y%1")
	text = gsub(text, "(" .. vowel ..")" .. ye , "%1y")

	-- Alif with short vowel: the alif is only a carrier, keep the vowel mark
	text = gsub(text, alif.."(["..ZZP.."])", "%1")

	-- final changes
	-- izafa
	text = gsub(text, "ē" .. zer .. "#", "ē-yi#")
	text = gsub(text, zer .. "y" .. zer .. "#", "ī-yi#")
	text = gsub(text, '([^' .. consonants .. '])' .. "y" .. zer .. "#", "%1-yi#")
	text = gsub(text, '([' .. consonants2 .. '])' .. zer .. "#", "%1-i#")
	-- arabic al, must happen after alif conversions and before he deletion
	-- (the "%2" inside the patterns is a back-reference: a doubled sun letter)
	text = gsub(text, "([".. ZZP .."])#%-#" .. alif .. laam , "%1#-#" .. laam )
	text = gsub(text, "([".. ZZP .."])" .. alif .. laam .. jazm .. "([".. sun_letters .."])" .. "%2" , "%1%2%2")
	text = gsub(text, "([".. ZZP .."])" .. alif .. laam .. laam , "%1#-#" .. laam .. laam)
	text = gsub(text, "([".. ZZP .."])" .. alif .. laam , "%1#-#" .. laam .. "#-#")
	text = gsub(text, "#([".. ZZP .."]?)" .. laam .. jazm .. "([".. sun_letters .."])" .. "%2" , "#%1%2%2")
	-- he deletion: a word-final he after a short vowel mark is dropped
	text = gsub(text, "(["..ZZP.."])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "(["..ZZP.."])" .. he .. "#", "%1#")

	-- get rid of hashtags (not needed) and restore literal "#"
	text = gsub(text, "#", "")
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters; ones without a mapping entry stay as they are
	text = gsub(text, '.', mapping)

	-- alif
	-- Final corrections: collapse vowel sequences left by the steps above
	text = gsub(text, "āa", "ā")
	text = gsub(text, "aaa", "ā")
	text = gsub(text, "āā", "ā")
	text = gsub(text, "aa", "ā")

	text = mw.ustring.toNFC(text)

	return text
end

return export