readdir() returns inaccessible name if file was created with invalid UTF-8
Christian Franke
Christian.Franke@t-online.de
Sat Jun 28 10:18:57 GMT 2025
Corinna Vinschen via Cygwin wrote:
> On Jun 27 15:32, Christian Franke via Cygwin wrote:
>> $ touch $'t-\xef\x80\x80'
>> The name mapping is:
>> "t-\xEF\x80\x80" -(open, ...)-> L"t-\xDB59" -(readdir)-> "t-"
> Did you copy/paste this from the old mail, by any chance?
Sorry, I accidentally mixed two cases with same readdir() result:
"t-\xEF\x80\x80" -(open, ...)-> L"t-\xF000" -(readdir)-> "t-"
"t-\xED\xAD\x99" -(open, ...)-> L"t-\xDB59" -(readdir)-> "t-"
$ touch $'t-\xed\xad\x99'
$ touch $'t-\xef\x80\x80'
$ ls | uniq -c
2 t-
This no longer occurs in 3.7.0-0.165.g1b60f4861b70, but see below.
> Using the latest test DLL the mapping is
> "t-\xEF\x80\x80" -(open, ...)-> L"t-\xF000"
> And that's basically correct, albeit it leads to problems.
> You know that we defined the area from 0xf000 to 0xf0ff as our private
> use area to create filenames with characters invalid in DOS filenames
> by transposing these chars into the private use area. When converting
> the filenames back, the 0xf0XX chars are transposed back to 0xXX.
Yes.
> But yeah, I found the bug here. The problem is that the transpose table
> incorrectly contains NUL as transposable character. So if you create
> L"t-\xF000", that's fine. However, when converting this name back to
> UTF-8, the filename becomes L"t-\0". Oops.
> I dropped the ASCII NUL from the list of transposable characters and
> now what you get is this:
> $ touch $'t-\xef\x80\x80'
> $ touch $'t-\xef\x80\x81'
> $ ls -l
> total 0
> -rw-r--r-- 1 corinna vinschen 0 Jun 27 16:49 't-'$'\001'
> -rw-r--r-- 1 corinna vinschen 0 Jun 27 16:49 't-'$'\357\200\200'
> Apart from the incorrect transposition of ASCII NUL, the transposition
> works transparently:
> $ echo foo > $'t-\xef\x80\x81'
> $ cat $'t-\xef\x80\x81'
> foo
> $ cat $'t-\x01'
> foo
> I'll apply the patch shortly.
$ touch $'t-\xed\xad\x90'
$ touch $'t-\xed\xad\x91'
$ touch $'t-\xed\xad\x92'
$ touch $'t-\xed\xad\x93'
$ touch $'t-\xed\xad\x94'
$ ls | uniq -c
5 t-
$ ls -s
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
total 0
? t- ? t- ? t- ? t- ? t-
All results found by several runs with different seeds of the attached
test program have in common that the Windows path name contains an
invalid word in UTF-16 High Surrogate range:
$ ./randnames 42
$'t-\xEC\x9E\xB3\xEF\x82\x80\xEF\x83\xA0': access() failed, errno=2:
$'t-\xED\xA4\xA8\x80\xE0': original path
L"t-\xD928\xF080\xF0E0": Windows path
$'t-\xEE\x9E\xB3\xEF\x83\xA1': access() failed, errno=2:
$'t-\xED\xA6\xB0\xE1': original path
L"t-\xD9B0\xF0E1": Windows path
...
$'t-\xE7\xBE\xB3\xEF\x82\xB3': access() failed, errno=2:
$'t-\xED\xA2\x96\xB3': original path
L"t-\xD896\xF0B3": Windows path
--
Thanks,
Christian
-------------- next part --------------
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
#include <windows.h>
/*
 * Write the byte string s to f as one bash $'...' quoted word.
 *
 * Bytes in the range ' ' .. '~' are emitted verbatim.  A single quote
 * closes the word, emits an escaped quote and reopens the word.  Every
 * other byte is emitted as a two-digit upper-case \xHH escape.
 */
static void print_c(FILE * f, const char * s)
{
  fputs("$'", f);
  for (const char * p = s; *p; p++) {
    const char ch = *p;
    if (ch == '\'') {
      fputs("'\\'$'", f);
      continue;
    }
    if (ch < ' ' || ch > '~')
      /* Mask to the low 8 bits so a negative char prints as one byte. */
      fprintf(f, "\\x%02X", ch & 0xff);
    else
      fputc(ch, f);
  }
  fputc('\'', f);
}
/*
 * Write the wide string s to f in the form of a C wide string literal,
 * L"...".
 *
 * A double quote or backslash is preceded by a backslash.  Other
 * characters in the range ' ' .. '~' are emitted verbatim.  Everything
 * else is emitted as a four-digit upper-case \xHHHH escape of its low
 * 16 bits.
 */
static void print_w(FILE * f, const wchar_t * s)
{
  fputs("L\"", f);
  for (const wchar_t * p = s; *p; p++) {
    const wchar_t wc = *p;
    if (wc == L'"' || wc == L'\\') {
      fprintf(f, "\\%c", wc);
    } else if (wc < L' ' || wc > L'~') {
      fprintf(f, "\\x%04X", wc & 0xffff);
    } else {
      fputc(wc, f);
    }
  }
  fputc('"', f);
}
/*
 * Enumerate the entries matching L"*" with FindFirstFileW() /
 * FindNextFileW() and copy the name of the one entry other than "."
 * and ".." into name.
 *
 * Terminates the process with exit status 1 if the enumeration cannot
 * be started or if the number of such entries is not exactly one.
 * The caller's buffer must be able to hold WIN32_FIND_DATAW.cFileName.
 */
static void get_winname(wchar_t * name)
{
  WIN32_FIND_DATAW entry;
  HANDLE find = FindFirstFileW(L"*", &entry);
  if (find == INVALID_HANDLE_VALUE) {
    fprintf(stderr, "FindFirstFileW(): Error=%u\n", GetLastError());
    exit(1);
  }

  int count = 0;
  do {
    const int is_dot = !wcscmp(entry.cFileName, L".")
                    || !wcscmp(entry.cFileName, L"..");
    if (!is_dot) {
      /* If several entries exist the last one wins, but then we exit below. */
      wcscpy(name, entry.cFileName);
      count++;
    }
  } while (FindNextFileW(find, &entry));
  FindClose(find);

  if (count == 1)
    return;
  fprintf(stderr, "Error: %d Win32 files found\n", count);
  exit(1);
}
/*
 * Read the directory "." with opendir()/readdir() and copy the name of
 * the one entry other than "." and ".." into name.
 *
 * Terminates the process with exit status 1 if the directory cannot be
 * opened or if the number of such entries is not exactly one.
 * The caller's buffer must be large enough for any d_name returned.
 */
static void get_cygname(char * name)
{
  DIR * dir = opendir(".");
  if (!dir) {
    perror("opendir");
    exit(1);
  }

  int count = 0;
  for (const struct dirent * ent; (ent = readdir(dir)) != NULL; ) {
    const char * entry = ent->d_name;
    if (strcmp(entry, ".") == 0 || strcmp(entry, "..") == 0)
      continue;
    strcpy(name, entry);
    count++;
  }
  closedir(dir);

  if (count == 1)
    return;
  fprintf(stderr, "Error: %d Cygwin files found\n", count);
  exit(1);
}
/*
 * Fill name with a random NUL-terminated byte string of 1..maxlen bytes.
 *
 * Each byte is drawn from 0x01..0xFF excluding '/' and '\\', i.e. 253
 * possible values.  name must have room for maxlen + 1 bytes.  If maxlen
 * is less than 1, an empty string is stored.
 *
 * The byte value is computed as an int.  Doing the arithmetic in plain
 * char breaks where char is signed: values >= 0x80 become negative, are
 * never shifted past the two excluded characters, so 0xFE and 0xFF are
 * never generated and 0x80 is over-represented.  The number and order of
 * rand() calls is one for the length, then one per byte.
 */
static void randname(char * name, int maxlen)
{
  if (maxlen < 1) {
    /* Avoid rand() % 0, which is undefined. */
    name[0] = 0;
    return;
  }

  int len = 1 + rand() % maxlen;
  for (int i = 0; i < len; i++) {
    /* 1..253, then open two gaps so that '/' and '\\' are skipped. */
    int c = 1 + rand() % (256 - 2 - 1);
    if (c >= '/')
      c++;
    if (c >= '\\')
      c++;
    name[i] = (char)(unsigned char)c;
  }
  name[len] = 0;
}
/*
 * Create a file called name, look up the name that readdir() reports
 * for it and check that this reported name is accessible.
 *
 * Returns 1 if access() on the reported name succeeds.  Otherwise it
 * prints the reported name, the original name and the Windows name to
 * stdout and returns 0.  The file is removed again via its original
 * name before returning.
 *
 * Terminates the process with exit status 1 if open() or unlink() fails
 * (and indirectly if get_cygname()/get_winname() do).
 *
 * errno is copied right after each failing call because the stdio
 * output done by print_c() in between may overwrite it.
 */
static int testname(const char * name)
{
  int fd = open(name, O_WRONLY|O_CREAT, 0644);
  if (fd < 0) {
    const int err = errno;
    print_c(stdout, name); printf(": open() failed, errno=%d\n", err);
    exit(1);
  }
  close(fd);

  char cygname[MAX_PATH];
  get_cygname(cygname);
  wchar_t winname[MAX_PATH];
  get_winname(winname);

  int rc = 1;
  if (access(cygname, F_OK)) {
    const int err = errno;
    print_c(stdout, cygname); printf(": access() failed, errno=%d:\n", err);
    print_c(stdout, name); printf(": original path\n");
    print_w(stdout, winname); printf(": Windows path\n\n");
    rc = 0;
  }

  if (unlink(name)) {
    const int err = errno;
    print_c(stdout, name); printf(": unlink() failed, errno=%d\n", err);
    print_w(stdout, winname); printf(": Windows path\n");
    exit(1);
  }
  return rc;
}
/*
 * Usage: randnames [SEED]
 *
 * Seeds rand() with atoi(SEED) if an argument is given, (re)creates the
 * directory "test.tmp", changes into it and runs up to 100000 rounds of
 * testname() on random names of the form "t-" plus 1..5 random bytes.
 * Stops early after 10 failed rounds.
 *
 * Returns 1 if the directory cannot be created or entered, 0 otherwise
 * (regardless of how many rounds failed).  The directory is left behind.
 */
int main(int argc, char **argv)
{
  static const char workdir[] = "test.tmp";

  if (argc > 1)
    srand(atoi(argv[1]));

  /* Best effort: remove an empty leftover directory from a previous run. */
  rmdir(workdir);
  if (mkdir(workdir, 0755) || chdir(workdir)) {
    perror(workdir);
    return 1;
  }

  int failures = 0;
  for (int round = 0; round < 100000 && failures < 10; round++) {
    char name[8] = "t-";
    /* Leave room for the "t-" prefix and the terminating NUL. */
    randname(name + 2, sizeof(name) - 1 - 2);
    if (!testname(name))
      failures++;
  }
  return 0;
}
More information about the Cygwin
mailing list