-- Module:Unicode data/testcases
-- From Wikipedia, the free encyclopedia
-- Test cases for [[Module:Unicode data]], driven by [[Module:UnitTests]].
-- When this page is the "sandbox" subpage, the sandbox copy of the module is
-- tested instead of the live one.
local p = require "Module:UnitTests"
local Unicode_data = require(
	mw.title.getCurrentTitle().subpageText == "sandbox"
		and "Module:Unicode data/sandbox"
		or "Module:Unicode data")

local U = mw.ustring.char
local get_codepoint = mw.ustring.codepoint

--- Format a code point as a label for a test row.
-- Printable code points are shown as "U+XXXX: <character>"; everything else
-- as "U+XXXX" only.
-- @param codepoint number
-- @return string
local function show(codepoint)
	if Unicode_data.is_printable(codepoint) then
		local printed_codepoint = U(codepoint)
		-- A character that is not stable under NFC is emitted as a hexadecimal
		-- character reference instead of as the raw character.
		if mw.ustring.toNFC(printed_codepoint) ~= printed_codepoint then
			printed_codepoint = ("&#x%X;"):format(codepoint)
		end
		-- Give combining characters a dotted circle to attach to.
		if Unicode_data.is_combining(codepoint) then
			printed_codepoint = "◌" .. printed_codepoint
		end
		return ("U+%04X: %s"):format(codepoint, printed_codepoint)
	else
		return ("U+%04X"):format(codepoint)
	end
end

--- Like show(), with the name from Unicode_data.lookup_name appended in
-- parentheses.
-- @param codepoint number
-- @return string
local function show_codepoint_and_name(codepoint)
	return ("%s (%s)"):format(
		show(codepoint),
		Unicode_data.lookup_name(codepoint))
end

--- Checks Unicode_data.lookup_name against { codepoint, expected name } rows.
function p:test_lookup_name()
	local examples = {
		{ 0x0000, "<control-0000>" },
		{ 0x007F, "<control-007F>" },
		{ 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{ 0x0300, "COMBINING GRAVE ACCENT" },
		{ 0x0378, "<reserved-0378>" },
		{ 0x1B44, "BALINESE ADEG ADEG" },
		{ 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{ 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{ 0xAC01, "HANGUL SYLLABLE GAG" },
		{ 0xD5FF, "HANGUL SYLLABLE HEH" },
		{ 0xDC00, "<surrogate-DC00>" },
		{ 0xEEEE, "<private-use-EEEE>" },
		{ 0xFE04, "VARIATION SELECTOR-5" },
		{ 0xFDD1, "<noncharacter-FDD1>" },
		{ 0xFFFD, "REPLACEMENT CHARACTER" },
		{ 0xFFFF, "<noncharacter-FFFF>" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0x2ED9D, "CJK UNIFIED IDEOGRAPH-2ED9D" },
		{ 0xE0000, "<reserved-E0000>" },
		{ 0xF0F0F, "<private-use-F0F0F>" },
		{ 0x10FFFF, "<noncharacter-10FFFF>" },
	}

	self:iterate(
		examples,
		function (self, codepoint, name)
			self:equals(
				show(codepoint),
				Unicode_data.lookup_name(codepoint),
				name)
		end)
end

--- Checks Unicode_data.lookup_age against { codepoint, expected age } rows.
function p:test_lookup_age()
	local examples = {
		{ 0x0061, "1.1" },
		{ 0x0378, "NA" },
		{ 0x1B44, "5.0" },
		{ 0x3555, "3.0" },
		{ 0xAC01, "2.0" },
		{ 0xDC00, "2.0" },
		{ 0xEEEE, "1.1" },
		{ 0xFDD1, "3.1" },
		{ 0x1F4A9, "6.0" },
		{ 0xE0000, "NA" },
		{ 0xF0F0F, "2.0" },
		{ 0x10FFFF, "2.0" },
	}

	self:iterate(
		examples,
		function (self, codepoint, age)
			-- Remove pcall when this function is added to
			-- [[Module:Unicode data]].
			-- Until then an error raised inside the call is deliberately
			-- swallowed so that the rest of the page still renders.
			pcall(function ()
				self:equals(
					show(codepoint),
					Unicode_data.lookup_age(codepoint),
					age)
			end)
		end)
end

--- Checks Unicode_data.is_combining against { codepoint, expected } rows.
function p:test_is_combining()
	local examples = {
		{ 0x0300, true },
		{ 0x0060, false },
	}

	self:iterate(
		examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.is_combining(codepoint),
				expected)
		end)
end

--- Checks Unicode_data.is_default_ignorable against { codepoint, expected }
-- rows.
function p:test_is_default_ignorable()
	local examples = {
		{ 0x0061, false },
		{ 0x00AD, true },
	}

	self:iterate(
		examples,
		function (self, codepoint, expected)
			-- Remove pcall when this function is added to
			-- [[Module:Unicode data]].
			pcall(function ()
				self:equals(
					show_codepoint_and_name(codepoint),
					Unicode_data.is_default_ignorable(codepoint),
					expected)
			end)
		end)
end

--- Checks Unicode_data.lookup_script against { codepoint, script code } rows.
function p:test_lookup_script()
	local examples = {
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}

	self:iterate(
		examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.lookup_script(codepoint),
				expected)
		end)
end

--- Checks Unicode_data.lookup_category against
-- { codepoint, General_Category abbreviation } rows.
function p:test_lookup_category()
	local examples = {
		{ get_codepoint "\t", "Cc" },
		{ get_codepoint " ", "Zs" },
		{ get_codepoint "[", "Ps" },
		{ get_codepoint "]", "Pe" },
		{ get_codepoint "^", "Sk" },
		{ get_codepoint "A", "Lu" },
		{ 0x00AD, "Cf" },
		-- U+00BE VULGAR FRACTION THREE QUARTERS. Given numerically because
		-- its compatibility decomposition starts with the digit "3" (Nd),
		-- which is what a normalised copy of this file ends up testing.
		{ 0x00BE, "No" },
		{ get_codepoint "«", "Pi" },
		{ get_codepoint "»", "Pf" },
		{ 0x0300, "Mn" },
		{ 0x0488, "Me" },
		{ get_codepoint "٣", "Nd" },
		{ get_codepoint "子", "Lo" },
		{ get_codepoint "ᾮ", "Lt" },
		{ 0x1B44, "Mc" },
		{ get_codepoint "∈", "Sm" },
		{ get_codepoint "‿", "Pc" },
		{ get_codepoint "↹", "So" },
		{ get_codepoint "⸗", "Pd" },
		-- U+2167 ROMAN NUMERAL EIGHT. Given numerically because its
		-- compatibility decomposition starts with the letter "V" (Lu).
		{ 0x2167, "Nl" },
		{ 0x2028, "Zl" },
		{ 0x2029, "Zp" },
		{ get_codepoint "ゞ", "Lm" },
		{ 0xD800, "Cs" },
		{ get_codepoint "£", "Sc" },
		{ 0xFFFF, "Cn" },
		{ 0x100000, "Co" },
	}

	self:iterate(
		examples,
		function (self, codepoint, expected)
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.lookup_category(codepoint),
				expected)
		end)
end

local fun = require "Module:Fun"
local m_table = require "Module:TableTools"

-- Metatable for per-string tallies: a missing key reads as 0 (and is stored),
-- and calling the metatable itself returns a fresh, empty tally table.
local script_to_count_mt = {
	__index = function (self, key)
		self[key] = 0
		return 0
	end,
	__call = function (self, ...)
		return setmetatable({}, self)
	end
}
-- The metatable is its own metatable so that script_to_count_mt() works.
setmetatable(script_to_count_mt, script_to_count_mt)

-- Memoising lookup: script_counts[str] is a comma-separated summary such as
-- "Latn (5), Zyyy (2)" of how many code points of str belong to each script.
-- The order comes from m_table.sortedPairs with a higher-count-first
-- comparator. Non-string keys yield nil.
local script_counts = setmetatable({}, {
	__index = function (self, str)
		if type(str) ~= "string" then
			return nil
		end

		local script_to_count = script_to_count_mt()
		for codepoint in mw.ustring.gcodepoint(str) do
			local script = Unicode_data.lookup_script(codepoint)
			script_to_count[script] = script_to_count[script] + 1
		end

		local printed = table.concat(
			fun.mapIter(
				function (count, script)
					return ("%s (%d)"):format(script, count)
				end,
				m_table.sortedPairs(
					script_to_count,
					function (script1, script2)
						return script_to_count[script1] > script_to_count[script2]
					end)),
			", ")

		self[str] = printed
		return printed
	end,
})

-- Rows are { text, expected best script (nil when there is none) }; an
-- optional third field overrides the expectation used by test_is_Latin.
-- Bare strings are handed to self:iterate together with the rows
-- (presumably rendered as section headings; confirm in Module:UnitTests).
--
-- Every long-bracket string is kept on a single physical line: a line break
-- inside [[...]] would become part of the tested text.
local script_examples = {
	-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
	-- characters as Latn.
	-- This particular example only has characters below U+0340, so
	-- lookup_script doesn't have to be called.
	{ "%!?́", nil },
	-- NOTE(review): the two quotation marks in the middle arrived as bare
	-- ASCII quotes, which terminated the literal. They are escaped here;
	-- they may originally have been typographic quotes. Confirm.
	{ "’ʼ\"\"†‡•‰′‽⁕", nil },
	{ "col·legi", "Latn" },

	"HTML character references",
	-- These must stay spelled as character references. With the decoded
	-- characters pasted in, each pair collapses into two identical rows and
	-- no reference is exercised.
	-- NOTE(review): the exact spellings (hex/decimal/named) were
	-- reconstructed from the heading and the pairing. Confirm.
	{ "&#x10600;", "Lina" },
	{ "&#67072;", "Lina" },
	{ "&ndash;", nil },
	{ "&#8211;", nil },

	-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
	"Halkomelem",
	{ "lá:yelhp", "Latn" },
	{ "xwməθkwəy̓əm", nil }, -- one Greek (Grek) character
	{ "hən̓q̓əmin̓əm̓", "Latn" },

	"Quotes",
	{ -- [[s:it:Divina Commedia/Inferno/Canto I]]
		[[Tant’è amara che poco è più morte; ma per trattar del ben ch’i’ vi trovai, dirò de l’altre cose ch’i’ v’ ho scorte.]],
		"Latn"
	},
	{ -- A blessing in Navajo:
		-- [[User talk:Stephen G. Brown/text8]]
		[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł. Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
		"Latn"
	},
	{ -- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
		-- breves added to mark the length of the monophthongs α, ι, υ:
		[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος οὐλομένην, ἣ μῡρῐ́ ̓ Ᾰ̓χαιοῖς ᾰ̓́λγε ̓ ἔθηκε, πολλᾱ̀ς δ ̓ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ ̓ ἐτελείετο βουλή·]],
		"Grek"
	},
	{ -- The Brothers Karamazov:
		-- [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
		[[Вот если вы не согласитесь с этим последним тезисом и ответите: «Не так» или «не всегда так», то я, пожалуй, и ободрюсь духом насчет значения героя моего Алексея Федоровича. Ибо не только чудак «не всегда» частность и обособление, а напротив, бывает так, что он-то, пожалуй, и носит в себе иной раз сердцевину целого, а остальные люди его эпохи — все, каким-нибудь наплывным ветром, на время почему-то от него оторвались...]],
		"Cyrl"
	},
	{ -- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
		[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् । होतारं रत्नधातमम् ॥१॥ अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत । स देवाँ एह वक्षति ॥२॥ अग्निना रयिमश्नवत् पोषमेव दिवेदिवे । यशसं वीरवत्तमम् ॥३॥ अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि । स इद्देवेषु गच्छति ॥४॥ अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः । देवो देवेभिरा गमत् ॥५॥ यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि । तवेत् तत् सत्यमङ्गिरः ॥६॥ उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् । नमो भरन्त एमसि ॥७॥ राजन्तमध्वराणां गोपामृतस्य दीदिविम् । वर्धमानं स्वे दमे ॥८॥ स नः पितेव सूनवेऽग्ने सूपायनो भव । सचस्वा नः स्वस्तये ॥९॥]],
		"Deva"
	},
}

-- Memoising lookup: ends_in_punctuation[str] is true when the last character
-- of str matches the "%p" class of mw.ustring.match.
local ends_in_punctuation = setmetatable({}, {
	__index = function (self, key)
		local val = mw.ustring.match(mw.ustring.sub(key, -1), "%p") ~= nil
		self[key] = val
		return val
	end,
})

--- Build the label for a script example row: the text itself (newlines turned
-- into <br>) followed by its per-script code point counts.
-- @param script_example string
-- @return string
local function show_script_example(script_example)
	local separator = ": "
	-- If last character is punctuation, place script counts on their own line
	-- Could use Unicode_data.lookup_category, but that is more
	-- memory-intensive.
	if ends_in_punctuation[script_example] then
		separator = "<br>• "
	end

	-- gsub also returns a replacement count; as an operand of ".." only its
	-- first result (the string) is used.
	return script_example:gsub('\n', '<br>')
		.. separator
		.. script_counts[script_example]
end

--- Checks Unicode_data.get_best_script against script_examples.
function p:test_get_best_script()
	self:iterate(
		script_examples,
		function (self, str, expected)
			self:equals(
				show_script_example(str),
				Unicode_data.get_best_script(str),
				expected)
		end)
end

--- Checks Unicode_data.is_Latin against script_examples. A row is expected to
-- be Latin when its third field is truthy or its best script is "Latn".
function p:test_is_Latin()
	self:iterate(
		script_examples,
		function (self, str, best_script, is_Latin)
			self:equals(
				show_script_example(str),
				Unicode_data.is_Latin(str),
				is_Latin or best_script == "Latn")
		end)
end

--- Checks Unicode_data.lookup_block against { codepoint, block name } rows.
function p:test_lookup_block()
	local examples = {
		{ 0x0064, "Basic Latin" },
		{ 0x030B, "Combining Diacritical Marks" },
		{ 0x03A3, "Greek and Coptic" },
		{ 0x0411, "Cyrillic" },
		{ 0x10E6, "Georgian" },
		{ 0x3175, "Hangul Compatibility Jamo" },
		{ 0xAC01, "Hangul Syllables" },
		{ 0x4E0A, "CJK Unified Ideographs" },
		{ 0x1F608, "Emoticons" },
		{ 0x30000, "CJK Unified Ideographs Extension G" },
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}

	self:iterate(
		examples,
		function (self, codepoint, block_name)
			self:equals(
				show(codepoint),
				Unicode_data.lookup_block(codepoint),
				block_name)
		end)
end

--- Checks Unicode_data.is_rtl against { text, expected } rows.
function p:test_is_rtl()
	local examples = {
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
		{ "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
		{ "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂", true }, -- Syriac, sixth beatitude (Matthew 5:8)
		{ "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
		{ "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
		{
			-- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
			"𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
			true
		},
		{ "ދިވެހި", true }, -- the word dhivehi written in Thaana script
		{ "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
		{ "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
		{ "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
		{ "%$!^&", false },
	}

	self:iterate(
		examples,
		function (self, str, expected)
			self:equals(str, Unicode_data.is_rtl(str), expected)
		end)
end

-- Change function names into more readable headers for the testcases tables.
for k, v in m_table.sortedPairs(p) do
	if type(k) == "string" then
		-- Only the first result of gsub (the rewritten key) is kept.
		local new_k = k:gsub("^test_(.+)$", "testcases for <code>%1</code>")
		if new_k ~= k then
			p[k] = nil
			p[new_k] = v
		end
	end
end

return p