-- Module:Unicode data/testcases

-- Test-suite object; the test_* methods below are attached to it and it is
-- returned at the end of the module.
local p = require "Module:UnitTests"

-- Load the sandbox copy of the module when this page's subpage text is
-- "sandbox", and the live module otherwise.
-- mw.title.getCurrentTitle is a function: it has to be called to obtain the
-- title object before subpageText can be read from it.
local Unicode_data = require(
	mw.title.getCurrentTitle().subpageText == "sandbox"
		and "Module:Unicode data/sandbox"
		or "Module:Unicode data"
)

local U = mw.ustring.char
local get_codepoint = mw.ustring.codepoint

-- Build the label used for a code point in the test tables.
-- Non-printable code points yield just "U+XXXX"; printable ones yield
-- "U+XXXX: " followed by a rendering of the character.
local function show(codepoint)
	if not Unicode_data.is_printable(codepoint) then
		return ("U+%04X"):format(codepoint)
	end

	local glyph = U(codepoint)
	-- If NFC normalization would alter the character, use a formatted
	-- reference string instead of the raw character.
	if mw.ustring.toNFC(glyph) ~= glyph then
		glyph = ("&#x26;#x%X;"):format(codepoint)
	end
	-- Give combining characters a dotted-circle base to attach to.
	if Unicode_data.is_combining(codepoint) then
		glyph = "◌" .. glyph
	end
	return ("U+%04X: %s"):format(codepoint, glyph)
end

-- Label a code point as "<show(codepoint)> (<name>)", where the name comes
-- from Unicode_data.lookup_name.
local function show_codepoint_and_name(codepoint)
	local label = show(codepoint)
	local name = Unicode_data.lookup_name(codepoint)
	return ("%s (%s)"):format(label, name)
end

-- Compare Unicode_data.lookup_name(codepoint) with the expected name (or
-- bracketed label) for each example row.
-- Each row is { codepoint, expected_name }; presumably self:iterate unpacks
-- the row into the callback's arguments after self (verify against
-- Module:UnitTests).
-- NOTE(review): the angle brackets and ampersands in these expected strings
-- appear as HTML hex entities; confirm that matches what lookup_name returns.
function p:test_lookup_name()
	local examples = {
		{  0x0000, "&#x3c;control-0000&#x3e;" },
		{  0x007F, "&#x3c;control-007F&#x3e;" },
		{  0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{  0x0300, "COMBINING GRAVE ACCENT" },
		{  0x0378, "&#x3c;reserved-0378&#x3e;" },
		{  0x1B44, "BALINESE ADEG ADEG" },
		{  0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{  0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{  0xAC01, "HANGUL SYLLABLE GAG" },
		{  0xD5FF, "HANGUL SYLLABLE HEH" },
		{  0xDC00, "&#x3c;surrogate-DC00&#x3e;", },
		{  0xEEEE, "&#x3c;private-use-EEEE&#x3e;" },
		{  0xFDD1, "&#x3c;noncharacter-FDD1&#x3e;", },
		{  0xFFFD, "REPLACEMENT CHARACTER" },
		{  0xFFFF, "&#x3c;noncharacter-FFFF&#x3e;" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0xE0000, "&#x3c;reserved-E0000&#x3e;" },
		{ 0xF0F0F, "&#x3c;private-use-F0F0F&#x3e;" },
		{ 0x10FFFF, "&#x3c;noncharacter-10FFFF&#x3e;" },
	}

	self:iterate(examples,
		function (self, codepoint, name)
			self:equals(show(codepoint), Unicode_data.lookup_name(codepoint), name)
		end)
end

-- Compare Unicode_data.lookup_age(codepoint) with the expected age string
-- for each { codepoint, age } row.
function p:test_lookup_age()
	local examples = {
		{  0x0061, "1.1" },
		{  0x0378, "NA" },
		{  0x1B44, "5.0" },
		{  0x3555, "3.0" },
		{  0xAC01, "2.0" },
		{  0xDC00, "2.0", },
		{  0xEEEE, "1.1" },
		{  0xFDD1, "3.1", },
		{ 0x1F4A9, "6.0" },
		{ 0xE0000, "NA" },
		{ 0xF0F0F, "2.0" },
		{ 0x10FFFF, "2.0" },
	}

	self:iterate(examples,
		function (self, codepoint, age)
			-- Remove pcall when this function is added to Module:Unicode data.
			-- Until then the pcall keeps an error raised by the lookup from
			-- aborting the run; the error itself is deliberately discarded.
			pcall(function ()
				self:equals(show(codepoint),
					Unicode_data.lookup_age(codepoint), age)
			end)
		end)
end

-- Compare Unicode_data.is_combining(codepoint) with the expected boolean for
-- each { codepoint, expected } row.
function p:test_is_combining()
	local examples = {
		{ 0x0300, true },
		{ 0x0060, false },
	}

	self:iterate(examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.is_combining(codepoint),
				expected)
		end)
end

-- Compare Unicode_data.is_default_ignorable(codepoint) with the expected
-- boolean for each { codepoint, expected } row.
function p:test_is_default_ignorable()
	local examples = {
		{ 0x0061, false },
		{ 0x00AD, true },
	}

	self:iterate(examples,
		function (self, codepoint, expected)
			-- Remove pcall when this function is added to Module:Unicode data.
			-- Until then the pcall keeps an error raised by the lookup from
			-- aborting the run; the error itself is deliberately discarded.
			pcall(function ()
				self:equals(
					show_codepoint_and_name(codepoint),
					Unicode_data.is_default_ignorable(codepoint),
					expected)
			end)
		end)
end

-- Compare Unicode_data.lookup_script(codepoint) with the expected script
-- code for each { codepoint, expected } row.
function p:test_lookup_script()
	local examples = {
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}

	self:iterate(examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.lookup_script(codepoint),
				expected)
		end)
end

-- Compare Unicode_data.lookup_category(codepoint) with the expected
-- two-letter category for each { codepoint, expected } row.
-- Rows use get_codepoint on a literal character where one can be typed, and
-- a numeric code point otherwise.
function p:test_lookup_category()
	local examples = {
		{ get_codepoint "\t", "Cc" },
		{ get_codepoint " ", "Zs" },
		{ get_codepoint "[", "Ps" },
		{ get_codepoint "]", "Pe" },
		{ get_codepoint "^", "Sk" },
		{ get_codepoint "A", "Lu" },
		{ 0x00AD,            "Cf" },
		{ get_codepoint "¾", "No" },
		{ get_codepoint "«", "Pi" },
		{ get_codepoint "»", "Pf" },
		{ 0x0300,            "Mn" },
		{ 0x0488,            "Me" },
		{ get_codepoint "٣", "Nd" },
		{ get_codepoint "子", "Lo" },
		{ get_codepoint "ᾮ", "Lt" },
		{ 0x1B44,            "Mc" },
		{ get_codepoint "∈", "Sm" },
		{ get_codepoint "‿", "Pc" },
		{ get_codepoint "↹", "So" },
		{ get_codepoint "⸗", "Pd" },
		{ get_codepoint "Ⅷ", "Nl" },
		{ 0x2028,             "Zl" },
		{ 0x2029,             "Zp" },
		{ get_codepoint "ゞ", "Lm" },
		{ 0xD800,             "Cs" },
		{ get_codepoint "￡", "Sc" },
		{ 0xFFFF,             "Cn" },
		{ 0x100000,           "Co" },
	}

	self:iterate(examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.lookup_category(codepoint),
				expected)
		end)
end

local fun = require "Module:Fun" local m_table = require "Module:TableTools"

-- Metatable for per-script tally tables. It is also its own metatable, so
-- calling script_to_count_mt(...) produces a new, empty tally table.
local script_to_count_mt = {}

-- A missing key is stored as 0 on first read, so callers can increment a
-- count without initializing it.
function script_to_count_mt.__index(tally, script)
	tally[script] = 0
	return 0
end

-- Calling the metatable returns a fresh table that uses it as metatable;
-- any extra call arguments are ignored.
function script_to_count_mt.__call(mt, ...)
	return setmetatable({}, mt)
end

setmetatable(script_to_count_mt, script_to_count_mt)

-- Memoizing lookup table: script_counts[str] is a string listing how many
-- code points of str belong to each script, e.g. "Latn (5), Zyyy (2)",
-- ordered by descending count. Non-string keys yield nil.
local script_counts = setmetatable({}, {
	__index = function (self, str)
		if type(str) ~= "string" then
			return nil
		end

		-- Call the metatable to get a fresh tally table for this string
		-- (its missing keys read as 0). Using the metatable itself as the
		-- tally would share counts between strings and expose its
		-- __index/__call entries to the sort below.
		local script_to_count = script_to_count_mt()
		for codepoint in mw.ustring.gcodepoint(str) do
			local script = Unicode_data.lookup_script(codepoint)
			script_to_count[script] = script_to_count[script] + 1
		end

		local printed = table.concat(
			fun.mapIter(
				function (count, script)
					return ("%s (%d)"):format(script, count)
				end,
				m_table.sortedPairs(
					script_to_count,
					-- Order scripts from most to least frequent.
					function (script1, script2)
						return script_to_count[script1] > script_to_count[script2]
					end)),
			", ")

		-- Cache the result so the string is only analyzed once.
		self[str] = printed
		return printed
	end,
})

-- Shared examples for test_get_best_script and test_is_Latin.
-- Table entries are { text, expected_best_script }; bare strings are
-- presumably rendered as section headers by the test runner (verify against
-- Module:UnitTests).
-- NOTE(review): the long quotations are written as long-bracket strings.
-- Only the one line break that survives in this copy of the file is kept;
-- any other original line breaks inside the quotations could not be
-- recovered and appear as spaces.
-- NOTE(review): the "&#x26;" sequences look like escaped ampersands; confirm
-- the intended literals.
local script_examples = {
	-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
	-- characters as Latn.
	-- This particular example only has characters below U+0340, so
	-- lookup_script doesn't have to be called.
	{ "%!?́", nil },
	{ "’ʼ“”†‡•‰′‽⁕", nil },
	{ "col·legi", "Latn" },

	"HTML character references",
	{ "𐘀", "Lina" },
	{ "&#x26;#x10600;", "Lina" },
	{ "–", nil },
	{ "&#x26;ndash;", nil },

	-- Examples from Template talk:Lang
	"Halkomelem",
	{ "lá:yelhp", "Latn" },
	{ "xʷməθkʷəy̓əm", nil }, -- one Greek (Grek) character
	{ "hən̓q̓əmin̓əm̓", "Latn" },

	"Quotes",
	-- Divina Commedia/Inferno/Canto I
	{
		[[Tant’è amara che poco è più morte; ma per trattar del ben ch’i’ vi trovai, dirò de l’altre cose ch’i’ v’ ho scorte.]],
		"Latn"
	},
	{
		-- A blessing in Navajo:
		-- User talk:Stephen G. Brown/text8
		[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł. Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
		"Latn"
	},
	{
		-- The opening of the Iliad (Ιλιάς/Α), with macrons and
		-- breves added to mark the length of the monophthongs α, ι, υ:
		[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος οὐλομένην, ἣ μῡρῐ́᾽ Ᾰ̓χαιοῖς ᾰ̓́λγε᾽ ἔθηκε, πολλᾱ̀ς δ᾽ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ᾽ ἐτελείετο βουλή·]],
		"Grek"
	},
	{
		-- The Brothers Karamazov: Братья Карамазовы (Достоевский)/Книга первая
		[[Вот если вы не согласитесь с этим последним тезисом и ответите: «Не так» или «не всегда так», то я, пожалуй, и ободрюсь духом насчет значения героя моего Алексея Федоровича. 
Ибо не только чудак «не всегда» частность и обособление, а напротив, бывает так, что он-то, пожалуй, и носит в себе иной раз сердцевину целого, а остальные люди его эпохи — все, каким-нибудь наплывным ветром, на время почему-то от него оторвались…]],
		"Cyrl"
	},
	{
		-- Rig Veda
		[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् । होतारं रत्नधातमम् ॥१॥ अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत । स देवाँ एह वक्षति ॥२॥ अग्निना रयिमश्नवत् पोषमेव दिवेदिवे । यशसं वीरवत्तमम् ॥३॥ अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि । स इद्देवेषु गच्छति ॥४॥ अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः । देवो देवेभिरा गमत् ॥५॥ यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि । तवेत् तत् सत्यमङ्गिरः ॥६॥ उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् । नमो भरन्त एमसि ॥७॥ राजन्तमध्वराणां गोपामृतस्य दीदिविम् । वर्धमानं स्वे दमे ॥८॥ स नः पितेव सूनवेऽग्ने सूपायनो भव । सचस्वा नः स्वस्तये ॥९॥]],
		"Deva"
	},
}

-- Memoizing lookup table: ends_in_punctuation[str] is true when the last
-- character of str matches the "%p" (punctuation) pattern, false otherwise.
local ends_in_punctuation = setmetatable({}, {
	__index = function (self, key)
		local val = mw.ustring.match(mw.ustring.sub(key, -1), "%p") ~= nil
		self[key] = val
		return val
	end,
})

-- Build the label shown for a script example: the example text with its
-- newlines replaced, then a separator, then the per-script counts taken
-- from script_counts.
-- NOTE(review): the markup strings below appear as HTML hex entities;
-- confirm the intended literals.
local function show_script_example(script_example)
	local separator = ": "
	-- If last character is punctuation, place script counts on their own line
	-- Could use Unicode_data.lookup_category, but that is more memory-intensive.
	if ends_in_punctuation[script_example] then
		separator = "&#x3c;br&#x3e;&#x26;bull; "
	end
	-- gsub returns the replacement count as a second value, but in a
	-- concatenation only the resulting string is used.
	return script_example:gsub('\n', '&#x3c;br&#x3e;') .. separator .. script_counts[script_example]
end

-- Compare Unicode_data.get_best_script(str) with the expected script code
-- for each entry of script_examples.
function p:test_get_best_script()
	self:iterate(script_examples,
		function (self, str, expected)
			self:equals(
				show_script_example(str),
				Unicode_data.get_best_script(str),
				expected)
		end)
end

-- Compare Unicode_data.is_Latin(str) with the expected value for each entry
-- of script_examples. The expectation is the row's third field when that is
-- truthy, and otherwise whether the row's best script is "Latn".
function p:test_is_Latin()
	self:iterate(script_examples,
		function (self, str, best_script, is_Latin)
			self:equals(
				show_script_example(str),
				Unicode_data.is_Latin(str),
				is_Latin or best_script == "Latn")
		end)
end

-- Compare Unicode_data.lookup_block(codepoint) with the expected block name
-- for each { codepoint, block_name } row.
function p:test_lookup_block()
	local examples = {
		{  0x0064, "Basic Latin"                      },
		{  0x030B, "Combining Diacritical Marks"      },
		{  0x03A3, "Greek and Coptic"                 },
		{  0x0411, "Cyrillic"                         },
		{  0x10E6, "Georgian"                         },
		{  0x3175, "Hangul Compatibility Jamo"        },
		{  0xAC01, "Hangul Syllables"                 },
		{  0x4E0A, "CJK Unified Ideographs"           },
		{ 0x1F608, "Emoticons"                        },
		{ 0x30000, "No Block"                         },
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}

	self:iterate(examples,
		function (self, codepoint, block_name)
			self:equals(
				show(codepoint),
				Unicode_data.lookup_block(codepoint),
				block_name)
		end)
end

-- Compare Unicode_data.is_rtl(str) with the expected boolean for each
-- { text, expected } row.
-- NOTE(review): the "&#x26;" in the last row looks like an escaped
-- ampersand; confirm the intended literal.
function p:test_is_rtl()
	local examples = {
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
		{ "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
		{ "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂‬", true }, -- Syriac, sixth beatitude (Matthew 5:8)
		{ "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
		{ "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
		{
			-- Avestan: Hymn to Haoma: Yasna 10.8 (𐬀𐬉𐬴𐬨𐬀)
			"𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
			true
		},
		{ "ދިވެހި", true }, -- the word dhivehi written in Thaana script
		{ "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus (𐤀𐤓𐤍)
		{ "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ)
		{ "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; Mount Gerizim)
		{ "%$!^&#x26;", false },
	}

	self:iterate(examples,
		function (self, str, expected)
			self:equals(str, Unicode_data.is_rtl(str), expected)
		end)
end

-- Change function names into more readable headers for the testcases tables.
-- Every string key of p that starts with "test_" is replaced by a header
-- string built from the rest of the name; other keys are left alone.
-- NOTE(review): the markup in the replacement string appears as HTML hex
-- entities; confirm the intended literal.
for k, v in m_table.sortedPairs(p) do
	if type(k) == "string" then
		-- gsub also returns a match count; only the resulting string is kept.
		local new_k = k:gsub("^test_(.+)$", "testcases for &#x3c;code&#x3e;%1&#x3c;/code&#x3e;")
		if new_k ~= k then
			p[k] = nil
			p[new_k] = v
		end
	end
end

return p