Module:Unicode data/testcases

This is the test cases page for the module Module:Unicode data. See also the results of the test cases.
 local p = require "Module:UnitTests"
 local Unicode_data = require(mw.title.getCurrentTitle().subpageText == "sandbox"
	and "Module:Unicode data/sandbox" or "Module:Unicode data")

 -- Short alias used by show() to turn a code point number into a string.
 local U = mw.ustring.char
 -- Short alias used by the category test cases to turn a literal character
 -- into its code point number.
 local get_codepoint = mw.ustring.codepoint
 --- Builds the row label for a code point.
 -- Printable code points become "U+XXXX: <character>"; anything that
 -- Unicode_data.is_printable rejects becomes the bare "U+XXXX" notation.
 local function show(codepoint)
	if not Unicode_data.is_printable(codepoint) then
		return ("U+%04X"):format(codepoint)
	end

	local glyph = U(codepoint)
	-- A character that NFC normalization would alter is written as a
	-- hexadecimal character reference instead of as raw text.
	if glyph ~= mw.ustring.toNFC(glyph) then
		glyph = ("&#x%X;"):format(codepoint)
	end
	-- Combining characters get a dotted circle in front of them.
	if Unicode_data.is_combining(codepoint) then
		glyph = "◌" .. glyph
	end

	return ("U+%04X: %s"):format(codepoint, glyph)
 end

 --- Builds a row label of the form "<show(codepoint)> (<name>)", where the
 -- name comes from Unicode_data.lookup_name.
 local function show_codepoint_and_name(codepoint)
	local label = show(codepoint)
	local name = Unicode_data.lookup_name(codepoint)
	return ("%s (%s)"):format(label, name)
 end

 --- Compares Unicode_data.lookup_name with the expected name (or the
 -- expected "<label-XXXX>" placeholder) for a selection of code points.
 function p:test_lookup_name()
	-- Each case: { code point, expected name }.
	local cases = {
		{ 0x0000, "<control-0000>" },
		{ 0x007F, "<control-007F>" },
		{ 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{ 0x0300, "COMBINING GRAVE ACCENT" },
		{ 0x0378, "<reserved-0378>" },
		{ 0x1B44, "BALINESE ADEG ADEG" },
		{ 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{ 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{ 0xAC01, "HANGUL SYLLABLE GAG" },
		{ 0xD5FF, "HANGUL SYLLABLE HEH" },
		{ 0xDC00, "<surrogate-DC00>" },
		{ 0xEEEE, "<private-use-EEEE>" },
		{ 0xFDD1, "<noncharacter-FDD1>" },
		{ 0xFFFD, "REPLACEMENT CHARACTER" },
		{ 0xFFFF, "<noncharacter-FFFF>" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0xE0000, "<reserved-E0000>" },
		{ 0xF0F0F, "<private-use-F0F0F>" },
		{ 0x10FFFF, "<noncharacter-10FFFF>" },
	}

	-- Row label is the code point; actual value is the looked-up name.
	local function check(tester, codepoint, name)
		tester:equals(show(codepoint),
			Unicode_data.lookup_name(codepoint), name)
	end

	self:iterate(cases, check)
 end

 --- Compares Unicode_data.lookup_age with the expected version string for a
 -- selection of code points.
 function p:test_lookup_age()
	-- Each case: { code point, expected age string }.
	local cases = {
		{ 0x0061, "1.1" },
		{ 0x0378, "NA" },
		{ 0x1B44, "5.0" },
		{ 0x3555, "3.0" },
		{ 0xAC01, "2.0" },
		{ 0xDC00, "2.0" },
		{ 0xEEEE, "1.1" },
		{ 0xFDD1, "3.1" },
		{ 0x1F4A9, "6.0" },
		{ 0xE0000, "NA" },
		{ 0xF0F0F, "2.0" },
		{ 0x10FFFF, "2.0" },
	}

	local function compare(tester, codepoint, age)
		tester:equals(show(codepoint),
			Unicode_data.lookup_age(codepoint), age)
	end

	self:iterate(cases,
		function (tester, codepoint, age)
			-- The comparison runs under pcall so that any error it raises
			-- is discarded. Remove the pcall when lookup_age is added to
			-- [[Module:Unicode data]].
			pcall(compare, tester, codepoint, age)
		end)
 end

 --- Compares Unicode_data.is_combining with the expected boolean for one
 -- combining and one non-combining code point.
 function p:test_is_combining()
	-- Each case: { code point, expected result }.
	local cases = {
		{ 0x0300, true },
		{ 0x0060, false },
	}

	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		tester:equals(label, Unicode_data.is_combining(codepoint), expected)
	end

	self:iterate(cases, check)
 end

 --- Compares Unicode_data.is_default_ignorable with the expected boolean for
 -- two code points.
 function p:test_is_default_ignorable()
	-- Each case: { code point, expected result }.
	local cases = {
		{ 0x0061, false },
		{ 0x00AD, true },
	}

	local function compare(tester, codepoint, expected)
		tester:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.is_default_ignorable(codepoint),
			expected)
	end

	self:iterate(cases,
		function (tester, codepoint, expected)
			-- The comparison runs under pcall so that any error it raises
			-- is discarded. Remove the pcall when is_default_ignorable is
			-- added to [[Module:Unicode data]].
			pcall(compare, tester, codepoint, expected)
		end)
 end

 --- Compares Unicode_data.lookup_script with the expected four-letter script
 -- code for a selection of code points.
 function p:test_lookup_script()
	-- Each case: { code point, expected script code }.
	local cases = {
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}

	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		tester:equals(label, Unicode_data.lookup_script(codepoint), expected)
	end

	self:iterate(cases, check)
 end

 --- Compares Unicode_data.lookup_category with the expected two-letter
 -- General_Category value, using one code point per category listed below.
 function p:test_lookup_category()
	-- Each example: { code point, expected category }.
	--
	-- get_codepoint is only used on literals that are a single character.
	-- Inside a table constructor a call that is not the last item is cut
	-- down to its first result, so a multi-character literal would silently
	-- test its first character only.
	local examples = {
		{ get_codepoint "\t", "Cc" },
		{ get_codepoint " ", "Zs" },
		{ get_codepoint "[", "Ps" },
		{ get_codepoint "]", "Pe" },
		{ get_codepoint "^", "Sk" },
		{ get_codepoint "A", "Lu" },
		{ 0x00AD, "Cf" },
		-- U+00BE VULGAR FRACTION THREE QUARTERS. Written numerically because
		-- compatibility normalization turns the literal into "3⁄4", whose
		-- first character "3" is Nd rather than No.
		{ 0x00BE, "No" },
		{ get_codepoint "«", "Pi" },
		{ get_codepoint "»", "Pf" },
		{ 0x0300, "Mn" },
		{ 0x0488, "Me" },
		{ get_codepoint "٣", "Nd" },
		{ get_codepoint "子", "Lo" },
		{ get_codepoint "ᾮ", "Lt" },
		{ 0x1B44, "Mc" },
		{ get_codepoint "∈", "Sm" },
		{ get_codepoint "‿", "Pc" },
		{ get_codepoint "↹", "So" },
		{ get_codepoint "⸗", "Pd" },
		-- U+2167 ROMAN NUMERAL EIGHT. Written numerically because
		-- compatibility normalization turns the literal into "VIII", whose
		-- first character "V" is Lu rather than Nl.
		{ 0x2167, "Nl" },
		{ 0x2028, "Zl" },
		{ 0x2029, "Zp" },
		{ get_codepoint "ゞ", "Lm" },
		{ 0xD800, "Cs" },
		{ get_codepoint "£", "Sc" },
		{ 0xFFFF, "Cn" },
		{ 0x100000, "Co" },
	}
	
	self:iterate(examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.lookup_category(codepoint),
				expected)
		end)
 end

 local fun = require "Module:Fun"
 local m_table = require "Module:TableTools"

 -- Metatable for per-script counters. It is also its own metatable, so
 -- calling script_to_count_mt() produces a fresh counter table.
 local script_to_count_mt = {}

 -- A key that is not present yet reads as 0, and that 0 is stored in the
 -- table so the key exists afterwards.
 function script_to_count_mt.__index(counts, script)
	counts[script] = 0
	return 0
 end

 -- Calling a table that has this metatable returns a new empty table whose
 -- metatable is the table that was called.
 function script_to_count_mt.__call(called)
	return setmetatable({}, called)
 end

 setmetatable(script_to_count_mt, script_to_count_mt)

 -- Memoizing table: indexing it with a string yields a printable tally of
 -- the scripts of that string's code points, with entries formatted as
 -- "<script> (<count>)" and joined by ", ". Non-string keys yield nil.
 local script_counts = setmetatable({}, {
	__index = function (cache, text)
		if type(text) ~= "string" then
			return nil
		end

		-- Count how many code points belong to each script.
		local tally = script_to_count_mt()
		for codepoint in mw.ustring.gcodepoint(text) do
			local script = Unicode_data.lookup_script(codepoint)
			tally[script] = tally[script] + 1
		end

		-- Key ordering for sortedPairs: larger counts come first.
		local function by_descending_count(script1, script2)
			return tally[script1] > tally[script2]
		end

		local function format_entry(count, script)
			return ("%s (%d)"):format(script, count)
		end

		local summary = table.concat(
			fun.mapIter(
				format_entry,
				m_table.sortedPairs(tally, by_descending_count)),
			", ")

		-- Store the result so the same string is not tallied twice.
		cache[text] = summary
		return summary
	end,
 })

 -- Shared inputs for test_get_best_script and test_is_Latin.
 -- A table entry is { string, expected best script [, expected is_Latin] };
 -- when the third element is absent, test_is_Latin expects true exactly when
 -- the expected best script is "Latn".
 -- A bare string entry is presumably rendered as a section heading by
 -- p:iterate (defined in Module:UnitTests; confirm there).
 local script_examples = {
	-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
	-- characters as Latn.
	
	-- This particular example only has characters below U+0340, so
	-- lookup_script doesn't have to be called.
	{ "%!?́", nil },
	-- \226\128\156 and \226\128\157 are the UTF-8 bytes of U+201C and U+201D
	-- (curly double quotes). They are escaped so that quote normalization
	-- cannot turn them into ASCII quotes that end the string literal.
	-- NOTE(review): presumably these are the intended characters; confirm
	-- against the page history.
	{ "’ʼ\226\128\156\226\128\157†‡•‰′‽⁕", nil },
	{ "col·legi", "Latn" },
	"HTML character references",
	{ "𐘀", "Lina" },
	{ "&#x10600;", "Lina" },
	{ "–", nil },
	{ "&ndash;", nil },
	-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
	"Halkomelem",
	{ "lá:yelhp", "Latn" },
	{ "xwməθkwəy̓əm", nil }, -- one Greek (Grek) character
	{ "hən̓q̓əmin̓əm̓", "Latn" },
	"Quotes",
	 -- [[s:it:Divina Commedia/Inferno/Canto I]]
	{
 [[Tant’è amara che poco è più morte;
 ma per trattar del ben ch’i’ vi trovai,
 dirò de l’altre cose ch’i’ v’ ho scorte.]],
		"Latn"
		
	},
	{ -- A blessing in Navajo:
		--[[User talk:Stephen G. Brown/text8]]
 [[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł. 
 Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní
 bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo
 nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
		"Latn"
	},
	{	-- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
		-- breves added to mark the length of the monophthongs α, ι, υ: 
 [[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος
 οὐλομένην, ἣ μῡρῐ́ ̓ Ᾰ̓χαιοῖς ᾰ̓́λγε ̓ ἔθηκε,
 πολλᾱ̀ς δ ̓ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν
 ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν
 οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ ̓ ἐτελείετο βουλή·]],
		"Grek"
	},
	{ -- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
 [[Вот если вы не согласитесь с этим последним тезисом и
 ответите: «Не так» или «не всегда так», то я, пожалуй, и
 ободрюсь духом насчет значения героя моего Алексея
 Федоровича. Ибо не только чудак «не всегда» частность и
 обособление, а напротив, бывает так, что он-то, пожалуй,
 и носит в себе иной раз сердцевину целого, а остальные
 люди его эпохи — все, каким-нибудь наплывным ветром,
 на время почему-то от него оторвались...]],
		"Cyrl"
	},
	{ -- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
 [[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् ।
 होतारं रत्नधातमम् ॥१॥
 अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत ।
 स देवाँ एह वक्षति ॥२॥
 अग्निना रयिमश्नवत् पोषमेव दिवेदिवे ।
 यशसं वीरवत्तमम् ॥३॥
 अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि ।
 स इद्देवेषु गच्छति ॥४॥
 अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः ।
 देवो देवेभिरा गमत् ॥५॥
 यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि ।
 तवेत् तत् सत्यमङ्गिरः ॥६॥
 उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् ।
 नमो भरन्त एमसि ॥७॥
 राजन्तमध्वराणां गोपामृतस्य दीदिविम् ।
 वर्धमानं स्वे दमे ॥८॥
 स नः पितेव सूनवेऽग्ने सूपायनो भव ।
 सचस्वा नः स्वस्तये ॥९॥]],
		"Deva"
	},
 }

 -- Memoizing table: indexing it with a string yields true when the string's
 -- last character matches the "%p" class, and false otherwise.
 local ends_in_punctuation = setmetatable({}, {
	__index = function (cache, text)
		local last_char = mw.ustring.sub(text, -1)
		local is_punctuation = mw.ustring.match(last_char, "%p") ~= nil
		-- Store the answer so the same string is only examined once.
		cache[text] = is_punctuation
		return is_punctuation
	end,
 })
 --- Builds the row label for a script example: the example text with
 -- newlines turned into <br>, a separator, then its per-script counts.
 local function show_script_example(script_example)
	-- When the text ends in punctuation, the script counts go on their own
	-- bulleted line. Unicode_data.lookup_category could answer the same
	-- question, but that is more memory-intensive.
	local separator = ends_in_punctuation[script_example]
		and "<br>&bull; "
		or ": "

	-- Assigning to a single local keeps only the substituted string.
	local html = script_example:gsub('\n', '<br>')

	return html .. separator .. script_counts[script_example]
 end

 --- Runs Unicode_data.get_best_script over every entry of script_examples
 -- and compares the result with the entry's expected script.
 function p:test_get_best_script()
	local function check(tester, str, expected)
		local label = show_script_example(str)
		tester:equals(label, Unicode_data.get_best_script(str), expected)
	end

	self:iterate(script_examples, check)
 end

 --- Runs Unicode_data.is_Latin over every entry of script_examples.
 -- The expected value is the entry's explicit third element when that is
 -- truthy; otherwise it is whether the expected best script is "Latn".
 function p:test_is_Latin()
	local function check(tester, str, best_script, is_Latin)
		local expected = is_Latin or best_script == "Latn"
		tester:equals(show_script_example(str), Unicode_data.is_Latin(str),
			expected)
	end

	self:iterate(script_examples, check)
 end

 --- Compares Unicode_data.lookup_block with the expected block name for a
 -- selection of code points.
 function p:test_lookup_block()
	-- Each case: { code point, expected block name }.
	local cases = {
		{ 0x0064, "Basic Latin" },
		{ 0x030B, "Combining Diacritical Marks" },
		{ 0x03A3, "Greek and Coptic" },
		{ 0x0411, "Cyrillic" },
		{ 0x10E6, "Georgian" },
		{ 0x3175, "Hangul Compatibility Jamo" },
		{ 0xAC01, "Hangul Syllables" },
		{ 0x4E0A, "CJK Unified Ideographs" },
		{ 0x1F608, "Emoticons" },
		{ 0x30000, "CJK Unified Ideographs Extension G" },
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}

	local function check(tester, codepoint, block_name)
		local label = show(codepoint)
		tester:equals(label, Unicode_data.lookup_block(codepoint), block_name)
	end

	self:iterate(cases, check)
 end

 --- Compares Unicode_data.is_rtl with the expected boolean for sample
 -- strings in several scripts.
 function p:test_is_rtl()
	-- Each case: { string, expected result }.
	local cases = {
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
		{ "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
		{ "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂‬", true }, -- Syriac, sixth beatitude (Matthew 5:8)
		{ "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
		{ "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
		{
			-- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
			"𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
			true
		},
		{ "ދިވެހި", true }, -- the word dhivehi written in Thaana script
		{ "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
		{ "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
		{ "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
		{ "%$!^&", false },
	}

	-- The sample string itself is used as the row label.
	local function check(tester, str, expected)
		tester:equals(str, Unicode_data.is_rtl(str), expected)
	end

	self:iterate(cases, check)
 end

 -- Give each "test_<name>" entry of p a more readable key, which serves as
 -- the header of its testcases table: "testcases for <code><name></code>".
 for key, value in m_table.sortedPairs(p) do
	-- Only string keys of the form "test_" plus at least one more character
	-- are renamed; every other entry stays where it is.
	local suffix = type(key) == "string" and key:match("^test_(.+)$")
	if suffix then
		p[key] = nil
		p["testcases for <code>" .. suffix .. "</code>"] = value
	end
 end

 return p
Retrieved from "https://en.wikipedia.org/w/index.php?title=Module:Unicode_data/testcases&oldid=1090926269"