Jump to content
Wikipedia The Free Encyclopedia

Module:Unicode data/testcases

From Wikipedia, the free encyclopedia
This is the test cases page for the module Module:Unicode data. Results of the test cases.
 -- Test framework table; the test methods below are attached to it and it is
 -- returned at the end of this module.
 local p = require "Module:UnitTests"

 -- Load the sandbox copy of the module under test when the current title's
 -- subpage text is "sandbox", otherwise the live module.
 local Unicode_data = require(
     mw.title.getCurrentTitle().subpageText == "sandbox"
         and "Module:Unicode data/sandbox"
         or "Module:Unicode data")

 -- Short aliases for the Unicode-aware string helpers used by the tests.
 local U = mw.ustring.char
 local get_codepoint = mw.ustring.codepoint
 --- Format a code point for display in a test-case row.
 -- Printable code points are shown as "U+XXXX: <character>"; all others are
 -- shown as "U+XXXX" only.
 -- @param codepoint number  the code point to display
 -- @return string
 local function show(codepoint)
     if Unicode_data.is_printable(codepoint) then
         local printed_codepoint = U(codepoint)
         -- If NFC normalization would alter the character, emit a numeric
         -- character reference instead of the raw character.
         if mw.ustring.toNFC(printed_codepoint) ~= printed_codepoint then
             printed_codepoint = ("&#x%X;"):format(codepoint)
         end
         -- Give combining characters a dotted-circle base to attach to.
         if Unicode_data.is_combining(codepoint) then
             printed_codepoint = "◌" .. printed_codepoint
         end
         return ("U+%04X: %s"):format(codepoint, printed_codepoint)
     else
         return ("U+%04X"):format(codepoint)
     end
 end

 --- Format a code point as show() does, followed by its name in parentheses.
 -- @param codepoint number
 -- @return string  "<show(codepoint)> (<Unicode_data.lookup_name(codepoint)>)"
 local function show_codepoint_and_name(codepoint)
     return ("%s (%s)"):format(
         show(codepoint),
         Unicode_data.lookup_name(codepoint))
 end

 --- Check Unicode_data.lookup_name against { code point, expected name } pairs.
 function p:test_lookup_name()
     local examples = {
         { 0x0000, "<control-0000>" },
         { 0x007F, "<control-007F>" },
         { 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
         { 0x0300, "COMBINING GRAVE ACCENT" },
         { 0x0378, "<reserved-0378>" },
         { 0x1B44, "BALINESE ADEG ADEG" },
         { 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
         { 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
         { 0xAC01, "HANGUL SYLLABLE GAG" },
         { 0xD5FF, "HANGUL SYLLABLE HEH" },
         { 0xDC00, "<surrogate-DC00>", },
         { 0xEEEE, "<private-use-EEEE>" },
         { 0xFE04, "VARIATION SELECTOR-5" },
         { 0xFDD1, "<noncharacter-FDD1>", },
         { 0xFFFD, "REPLACEMENT CHARACTER" },
         { 0xFFFF, "<noncharacter-FFFF>" },
         { 0x1F4A9, "PILE OF POO" },
         { 0x2ED9D, "CJK UNIFIED IDEOGRAPH-2ED9D" },
         { 0xE0000, "<reserved-E0000>" },
         { 0xF0F0F, "<private-use-F0F0F>" },
         { 0x10FFFF, "<noncharacter-10FFFF>" },
     }

     self:iterate(examples,
         function(self, codepoint, name)
             self:equals(
                 show(codepoint),
                 Unicode_data.lookup_name(codepoint),
                 name)
         end)
 end

 --- Check Unicode_data.lookup_age against { code point, expected age } pairs.
 function p:test_lookup_age()
     local examples = {
         { 0x0061, "1.1" },
         { 0x0378, "NA" },
         { 0x1B44, "5.0" },
         { 0x3555, "3.0" },
         { 0xAC01, "2.0" },
         { 0xDC00, "2.0", },
         { 0xEEEE, "1.1" },
         { 0xFDD1, "3.1", },
         { 0x1F4A9, "6.0" },
         { 0xE0000, "NA" },
         { 0xF0F0F, "2.0" },
         { 0x10FFFF, "2.0" },
     }

     self:iterate(examples,
         function(self, codepoint, age)
             -- Remove pcall when this function is added to [[Module:Unicode data]].
             -- Until then any error raised inside (e.g. lookup_age being nil)
             -- is deliberately discarded so the rest of the page still runs.
             pcall(function()
                 self:equals(
                     show(codepoint),
                     Unicode_data.lookup_age(codepoint),
                     age)
             end)
         end)
 end

 --- Check Unicode_data.is_combining against { code point, expected boolean } pairs.
 function p:test_is_combining()
     local examples = {
         { 0x0300, true },
         { 0x0060, false },
     }

     self:iterate(examples,
         function(self, codepoint, expected)
             self:equals(
                 show_codepoint_and_name(codepoint),
                 Unicode_data.is_combining(codepoint),
                 expected)
         end)
 end

 --- Check Unicode_data.is_default_ignorable against
 -- { code point, expected boolean } pairs.
 function p:test_is_default_ignorable()
     local examples = {
         { 0x0061, false },
         { 0x00AD, true },
     }

     self:iterate(examples,
         function(self, codepoint, expected)
             -- Remove pcall when this function is added to [[Module:Unicode data]].
             -- Until then any error raised inside is deliberately discarded so
             -- the rest of the page still runs.
             pcall(function()
                 self:equals(
                     show_codepoint_and_name(codepoint),
                     Unicode_data.is_default_ignorable(codepoint),
                     expected)
             end)
         end)
 end

 --- Check Unicode_data.lookup_script against
 -- { code point, expected script code } pairs.
 function p:test_lookup_script()
     local examples = {
         { 0x0061, "Latn" },
         { 0x002F, "Zyyy" },
         { 0x0300, "Zinh" },
         { 0x0378, "Zzzz" },
         { 0x0398, "Grek" },
         { 0x03E2, "Copt" },
         { 0x2014, "Zyyy" },
     }

     self:iterate(examples,
         function(self, codepoint, expected)
             self:equals(
                 show_codepoint_and_name(codepoint),
                 Unicode_data.lookup_script(codepoint),
                 expected)
         end)
 end

 --- Check Unicode_data.lookup_category against
 -- { code point, expected General_Category abbreviation } pairs.
 function p:test_lookup_category()
     local examples = {
         { get_codepoint "\t", "Cc" },
         { get_codepoint " ", "Zs" },
         { get_codepoint "[", "Ps" },
         { get_codepoint "]", "Pe" },
         { get_codepoint "^", "Sk" },
         { get_codepoint "A", "Lu" },
         { 0x00AD, "Cf" },
         -- U+00BE VULGAR FRACTION THREE QUARTERS. Given numerically because a
         -- compatibility-normalized literal ("3" + fraction slash + "4") would
         -- make get_codepoint return the digit "3" (Nd) instead.
         { 0x00BE, "No" },
         { get_codepoint "«", "Pi" },
         { get_codepoint "»", "Pf" },
         { 0x0300, "Mn" },
         { 0x0488, "Me" },
         { get_codepoint "٣", "Nd" },
         { get_codepoint "子", "Lo" },
         { get_codepoint "ᾮ", "Lt" },
         { 0x1B44, "Mc" },
         { get_codepoint "∈", "Sm" },
         { get_codepoint "‿", "Pc" },
         { get_codepoint "↹", "So" },
         { get_codepoint "⸗", "Pd" },
         -- U+2167 ROMAN NUMERAL EIGHT. Given numerically because a
         -- compatibility-normalized literal ("VIII") would make get_codepoint
         -- return the letter "V" (Lu) instead.
         { 0x2167, "Nl" },
         { 0x2028, "Zl" },
         { 0x2029, "Zp" },
         { get_codepoint "ゞ", "Lm" },
         { 0xD800, "Cs" },
         { get_codepoint "£", "Sc" },
         { 0xFFFF, "Cn" },
         { 0x100000, "Co" },
     }

     self:iterate(examples,
         function(self, codepoint, expected)
             self:equals(
                 show_codepoint_and_name(codepoint),
                 Unicode_data.lookup_category(codepoint),
                 expected)
         end)
 end

 -- Helper libraries used below to build the per-script count summaries and to
 -- iterate over the test table in sorted key order.
 local fun = require "Module:Fun"
 local m_table = require "Module:TableTools"

 -- Metatable for counter tables: reading a missing key stores 0 under that key
 -- and returns 0, so `t[k] = t[k] + 1` works without initialisation.
 -- The table is also its own metatable, so calling script_to_count_mt()
 -- goes through __call and returns a fresh, empty counter table.
 local script_to_count_mt = {
     __index = function(self, key)
         self[key] = 0
         return 0
     end,
     __call = function(self, ...)
         return setmetatable({}, self)
     end
 }
 setmetatable(script_to_count_mt, script_to_count_mt)

 -- Memoising lookup table: script_counts[str] is a summary such as
 -- "Latn (5), Zyyy (2)" listing how many code points of `str` belong to each
 -- script, ordered by descending count. Non-string keys yield nil.
 local script_counts = setmetatable({}, {
     __index = function(self, str)
         if type(str) ~= "string" then return nil end

         local script_to_count = script_to_count_mt()

         -- Tally one count per code point under its script code.
         for codepoint in mw.ustring.gcodepoint(str) do
             local script = Unicode_data.lookup_script(codepoint)
             script_to_count[script] = script_to_count[script] + 1
         end

         local printed = table.concat(
             fun.mapIter(
                 function(count, script)
                     return ("%s (%d)"):format(script, count)
                 end,
                 m_table.sortedPairs(
                     script_to_count,
                     -- Sort script codes so the most frequent comes first.
                     function(script1, script2)
                         return script_to_count[script1] > script_to_count[script2]
                     end)),
             ", ")

         -- Cache the result so __index is not triggered again for this string.
         self[str] = printed

         return printed
     end,
 })

 -- Shared fixtures for test_get_best_script and test_is_Latin.
 -- Table entries are { text, expected best script }; test_is_Latin also reads
 -- an optional third element as an explicit expected is_Latin value.
 -- Bare strings are handed to self:iterate unchanged (presumably rendered as
 -- section headings by Module:UnitTests -- confirm there).
 local script_examples = {
     -- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
     -- characters as Latn.

     -- This particular example only has characters below U+0340, so
     -- lookup_script doesn't have to be called.
     { "%!?́", nil },
     -- NOTE(review): the two straight double quotes are escaped so the literal
     -- parses; they may stand in for typographic quotation marks -- confirm
     -- against the upstream module.
     { "’ʼ\"\"†‡•‰′‽⁕", nil },
     { "col·legi", "Latn" },
     "HTML character references",
     { "𐘀", "Lina" },
     { "&#x10600;", "Lina" },
     { "–", nil },
     { "&ndash;", nil },
     -- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
     "Halkomelem",
     { "lá:yelhp", "Latn" },
     { "xwməθkwəy̓əm", nil }, -- one Greek (Grek) character
     { "hən̓q̓əmin̓əm̓", "Latn" },
     "Quotes",
     -- [[s:it:Divina Commedia/Inferno/Canto I]]
     {
         [[Tant’è amara che poco è più morte;
 ma per trattar del ben ch’i’ vi trovai,
 dirò de l’altre cose ch’i’ v’ ho scorte.]],
         "Latn"

     },
     { -- A blessing in Navajo:
         --[[User talk:Stephen G. Brown/text8]]
         [[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł. 
 Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní
 bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo
 nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
         "Latn"
     },
     { -- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
         -- breves added to mark the length of the monophthongs α, ι, υ:
         [[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος
 οὐλομένην, ἣ μῡρῐ́ ̓ Ᾰ̓χαιοῖς ᾰ̓́λγε ̓ ἔθηκε,
 πολλᾱ̀ς δ ̓ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν
 ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν
 οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ ̓ ἐτελείετο βουλή·]],
         "Grek"
     },
     { -- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
         [[Вот если вы не согласитесь с этим последним тезисом и
 ответите: «Не так» или «не всегда так», то я, пожалуй, и
 ободрюсь духом насчет значения героя моего Алексея
 Федоровича. Ибо не только чудак «не всегда» частность и
 обособление, а напротив, бывает так, что он-то, пожалуй,
 и носит в себе иной раз сердцевину целого, а остальные
 люди его эпохи — все, каким-нибудь наплывным ветром,
 на время почему-то от него оторвались...]],
         "Cyrl"
     },
     { -- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
         [[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् ।
 होतारं रत्नधातमम् ॥१॥
 अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत ।
 स देवाँ एह वक्षति ॥२॥
 अग्निना रयिमश्नवत् पोषमेव दिवेदिवे ।
 यशसं वीरवत्तमम् ॥३॥
 अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि ।
 स इद्देवेषु गच्छति ॥४॥
 अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः ।
 देवो देवेभिरा गमत् ॥५॥
 यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि ।
 तवेत् तत् सत्यमङ्गिरः ॥६॥
 उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् ।
 नमो भरन्त एमसि ॥७॥
 राजन्तमध्वराणां गोपामृतस्य दीदिविम् ।
 वर्धमानं स्वे दमे ॥८॥
 स नः पितेव सूनवेऽग्ने सूपायनो भव ।
 सचस्वा नः स्वस्तये ॥९॥]],
         "Deva"
     },
 }

 -- Memoising lookup table: ends_in_punctuation[str] is true when the last
 -- character of `str` matches the "%p" class of mw.ustring, false otherwise.
 local ends_in_punctuation = setmetatable({}, {
     __index = function(self, key)
         local val = mw.ustring.match(mw.ustring.sub(key, -1), "%p") ~= nil
         -- Cache the answer so each string is examined only once.
         self[key] = val
         return val
     end,
 })
 --- Build the display text for a script example: the example itself (with
 -- newlines turned into <br>) followed by its per-script code point counts.
 -- @param script_example string
 -- @return string
 local function show_script_example(script_example)
     local separator = ": "

     -- If last character is punctuation, place script counts on their own line
     -- Could use Unicode_data.lookup_category, but that is more memory-intensive.
     if ends_in_punctuation[script_example] then
         separator = "<br>&bull; "
     end

     -- gsub's second result (the replacement count) is dropped by the
     -- concatenation, which uses only the first value.
     return script_example:gsub('\n', '<br>') .. separator
         .. script_counts[script_example]
 end

 --- Check Unicode_data.get_best_script against the shared script_examples.
 function p:test_get_best_script()
     self:iterate(script_examples,
         function(self, str, expected)
             self:equals(
                 show_script_example(str),
                 Unicode_data.get_best_script(str),
                 expected)
         end)
 end

 --- Check Unicode_data.is_Latin against the shared script_examples.
 -- The expected value is the example's explicit third element when it is
 -- truthy; otherwise it is whether the expected best script is "Latn".
 function p:test_is_Latin()
     self:iterate(script_examples,
         function(self, str, best_script, is_Latin)
             self:equals(
                 show_script_example(str),
                 Unicode_data.is_Latin(str),
                 is_Latin or best_script == "Latn")
         end)
 end

 --- Check Unicode_data.lookup_block against
 -- { code point, expected block name } pairs.
 function p:test_lookup_block()
     local examples = {
         { 0x0064, "Basic Latin" },
         { 0x030B, "Combining Diacritical Marks" },
         { 0x03A3, "Greek and Coptic" },
         { 0x0411, "Cyrillic" },
         { 0x10E6, "Georgian" },
         { 0x3175, "Hangul Compatibility Jamo" },
         { 0xAC01, "Hangul Syllables" },
         { 0x4E0A, "CJK Unified Ideographs" },
         { 0x1F608, "Emoticons" },
         { 0x30000, "CJK Unified Ideographs Extension G" },
         { 0x10FFFF, "Supplementary Private Use Area-B" },
     }

     self:iterate(examples,
         function(self, codepoint, block_name)
             self:equals(
                 show(codepoint),
                 Unicode_data.lookup_block(codepoint),
                 block_name)
         end)
 end

 --- Check Unicode_data.is_rtl against { text, expected boolean } pairs.
 function p:test_is_rtl()
     local examples = {
         { "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
         { "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
         { "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂‬", true }, -- Syriac, sixth beatitude (Matthew 5:8)
         { "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
         { "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
         {
             -- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
             "𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
             true
         },
         { "ދިވެހި", true }, -- the word dhivehi written in Thaana script
         { "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
         { "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
         { "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
         { "%$!^&", false },
     }

     self:iterate(examples,
         function(self, str, expected)
             self:equals(str, Unicode_data.is_rtl(str), expected)
         end)
 end

 -- Change function names into more readable headers for the testcases tables.
 -- Every string key of the form "test_<name>" is re-registered under
 -- "testcases for <code><name></code>"; other keys are left alone.
 for k, v in m_table.sortedPairs(p) do
     if type(k) == "string" then
         -- Only gsub's first result (the rewritten key) is kept.
         local new_k = k:gsub("^test_(.+)$", "testcases for <code>%1</code>")
         if new_k ~= k then
             p[k] = nil
             p[new_k] = v
         end
     end
 end

 return p

AltStyle によって変換されたページ (->オリジナル) /