Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit bdcea11

Browse files
Add grapheme_levenshtein function. (#18087)
Measure levenshtein for grapheme cluster unit
1 parent 6fa669a commit bdcea11

File tree

6 files changed

+359
-1
lines changed

6 files changed

+359
-1
lines changed

‎NEWS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ PHP NEWS
8989
. Added Locale::isRightToLeft to check if a locale is written right to left.
9090
(David Carlier)
9191
. Added null bytes presence in locale inputs for Locale class. (David Carlier)
92+
. Added grapheme_levenshtein() function. (Yuya Hamada)
9293

9394
- MySQLi:
9495
. Fixed bugs GH-17900 and GH-8084 (calling mysqli::__construct twice).

‎UPGRADING

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,8 @@ PHP 8.5 UPGRADE NOTES
319319
- Intl:
320320
. Added locale_is_right_to_left/Locale::isRightToLeft, returns true if
321321
the locale is written right to left (after its enrichment with likely subtags).
322+
. Added grapheme_levenshtein() function.
323+
RFC: https://wiki.php.net/rfc/grapheme_levenshtein
322324

323325
- Pdo\Sqlite:
324326
. Added support for Pdo\Sqlite::setAuthorizer(), which is the equivalent of

‎ext/intl/grapheme/grapheme_string.c

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
918918
ubrk_close(bi);
919919
}
920920

921+
PHP_FUNCTION(grapheme_levenshtein)
922+
{
923+
zend_string *string1, *string2;
924+
zend_long cost_ins = 1;
925+
zend_long cost_rep = 1;
926+
zend_long cost_del = 1;
927+
928+
ZEND_PARSE_PARAMETERS_START(2, 5)
929+
Z_PARAM_STR(string1)
930+
Z_PARAM_STR(string2)
931+
Z_PARAM_OPTIONAL
932+
Z_PARAM_LONG(cost_ins)
933+
Z_PARAM_LONG(cost_rep)
934+
Z_PARAM_LONG(cost_del)
935+
ZEND_PARSE_PARAMETERS_END();
936+
937+
if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
938+
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
939+
RETURN_THROWS();
940+
}
941+
942+
if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
943+
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
944+
RETURN_THROWS();
945+
}
946+
947+
if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
948+
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
949+
RETURN_THROWS();
950+
}
951+
952+
zend_long c0, c1, c2;
953+
zend_long retval;
954+
size_t i2;
955+
char *pstr1, *pstr2;
956+
957+
UChar *ustring1 = NULL;
958+
UChar *ustring2 = NULL;
959+
960+
int32_t ustring1_len = 0;
961+
int32_t ustring2_len = 0;
962+
963+
UErrorCode ustatus = U_ZERO_ERROR;
964+
965+
/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
966+
* that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
967+
* by having shorter rows (p1 & p2). */
968+
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
969+
zend_string *tmp = string1;
970+
string1 = string2;
971+
string2 = tmp;
972+
}
973+
974+
pstr1 = ZSTR_VAL(string1);
975+
pstr2 = ZSTR_VAL(string2);
976+
977+
intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus);
978+
979+
if (U_FAILURE(ustatus)) {
980+
intl_error_set_code(NULL, ustatus);
981+
982+
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
983+
efree(ustring1);
984+
RETURN_FALSE;
985+
}
986+
987+
intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus);
988+
989+
if (U_FAILURE(ustatus)) {
990+
intl_error_set_code(NULL, ustatus);
991+
992+
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
993+
efree(ustring2);
994+
efree(ustring1);
995+
RETURN_FALSE;
996+
}
997+
998+
UBreakIterator *bi1, *bi2;
999+
1000+
int32_t strlen_1, strlen_2;
1001+
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0);
1002+
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0);
1003+
1004+
if (strlen_1 == 0) {
1005+
efree(ustring1);
1006+
efree(ustring2);
1007+
RETURN_LONG(strlen_2 * cost_ins);
1008+
}
1009+
if (strlen_2 == 0) {
1010+
efree(ustring1);
1011+
efree(ustring2);
1012+
RETURN_LONG(strlen_1 * cost_del);
1013+
}
1014+
1015+
unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
1016+
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
1017+
bi1 = grapheme_get_break_iterator(u_break_iterator_buffer1, &ustatus);
1018+
if (U_FAILURE(ustatus)) {
1019+
intl_error_set_code(NULL, ustatus);
1020+
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0);
1021+
efree(ustring2);
1022+
efree(ustring1);
1023+
ubrk_close(bi1);
1024+
RETURN_FALSE;
1025+
}
1026+
1027+
bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus);
1028+
if (U_FAILURE(ustatus)) {
1029+
intl_error_set_code(NULL, ustatus);
1030+
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 ($string2)", 0);
1031+
efree(ustring2);
1032+
efree(ustring1);
1033+
ubrk_close(bi2);
1034+
ubrk_close(bi1);
1035+
RETURN_FALSE;
1036+
}
1037+
ubrk_setText(bi1, ustring1, ustring1_len, &ustatus);
1038+
1039+
if (U_FAILURE(ustatus)) {
1040+
intl_error_set_code(NULL, ustatus);
1041+
1042+
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0);
1043+
efree(ustring2);
1044+
efree(ustring1);
1045+
ubrk_close(bi2);
1046+
ubrk_close(bi1);
1047+
RETURN_FALSE;
1048+
}
1049+
1050+
ubrk_setText(bi2, ustring2, ustring2_len, &ustatus);
1051+
if (U_FAILURE(ustatus)) {
1052+
intl_error_set_code(NULL, ustatus);
1053+
1054+
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0);
1055+
efree(ustring2);
1056+
efree(ustring1);
1057+
ubrk_close(bi2);
1058+
ubrk_close(bi1);
1059+
RETURN_FALSE;
1060+
}
1061+
UCollator *collator = ucol_open("", &ustatus);
1062+
if (U_FAILURE(ustatus)) {
1063+
intl_error_set_code(NULL, ustatus);
1064+
1065+
intl_error_set_custom_msg(NULL, "Error on ucol_open", 0);
1066+
efree(ustring2);
1067+
efree(ustring1);
1068+
ubrk_close(bi2);
1069+
ubrk_close(bi1);
1070+
ucol_close(collator);
1071+
RETURN_FALSE;
1072+
}
1073+
1074+
zend_long *p1, *p2, *tmp;
1075+
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
1076+
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
1077+
1078+
for (i2 = 0; i2 <= strlen_2; i2++) {
1079+
p1[i2] = i2 * cost_ins;
1080+
}
1081+
1082+
int32_t current1 = 0;
1083+
int32_t current2 = 0;
1084+
int32_t pos1 = 0;
1085+
int32_t pos2 = 0;
1086+
1087+
while (true) {
1088+
current1 = ubrk_current(bi1);
1089+
pos1 = ubrk_next(bi1);
1090+
if (pos1 == UBRK_DONE) {
1091+
break;
1092+
}
1093+
p2[0] = p1[0] + cost_del;
1094+
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
1095+
current2 = ubrk_current(bi2);
1096+
pos2 = ubrk_next(bi2);
1097+
if (pos2 == UBRK_DONE) {
1098+
break;
1099+
}
1100+
if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) {
1101+
c0 = p1[i2];
1102+
} else {
1103+
c0 = p1[i2] + cost_rep;
1104+
}
1105+
c1 = p1[i2 + 1] + cost_del;
1106+
if (c1 < c0) {
1107+
c0 = c1;
1108+
}
1109+
c2 = p2[i2] + cost_ins;
1110+
if (c2 < c0) {
1111+
c0 = c2;
1112+
}
1113+
p2[i2 + 1] = c0;
1114+
}
1115+
ubrk_first(bi2);
1116+
tmp = p1;
1117+
p1 = p2;
1118+
p2 = tmp;
1119+
}
1120+
1121+
ucol_close(collator);
1122+
1123+
ubrk_close(bi1);
1124+
ubrk_close(bi2);
1125+
1126+
efree(ustring1);
1127+
efree(ustring2);
1128+
1129+
retval = p1[strlen_2];
1130+
1131+
efree(p1);
1132+
efree(p2);
1133+
RETURN_LONG(retval);
1134+
}
1135+
9211136
/* }}} */

‎ext/intl/php_intl.stub.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =
447447

448448
function grapheme_str_split(string $string, int $length = 1): array|false {}
449449

450+
/**
 * Levenshtein edit distance between $string1 and $string2, measured in
 * grapheme clusters instead of bytes.
 *
 * Each of the three costs must be greater than 0 and no larger than
 * UINT_MAX / 4 (as defined by the C implementation); an out-of-range cost
 * raises an argument value error. The function returns false when an input
 * cannot be converted to UTF-16 or an underlying ICU call fails.
 */
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}
451+
450452
/** @param int $next */
451453
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
452454

‎ext/intl/php_intl_arginfo.h

Lines changed: 11 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /