[Python-checkins] bpo-33695 shutil.copytree() + os.scandir() cache (#7874)

Giampaolo Rodola webhook-mailer at python.org
Mon Nov 12 09:18:25 EST 2018


https://github.com/python/cpython/commit/19c46a4c96553b2a8390bf8a0e138f2b23e28ed6
commit: 19c46a4c96553b2a8390bf8a0e138f2b23e28ed6
branch: master
author: Giampaolo Rodola <g.rodola at gmail.com>
committer: GitHub <noreply at github.com>
date: 2018-11-12T06:18:15-08:00
summary:
bpo-33695 shutil.copytree() + os.scandir() cache (#7874)
files:
A Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
M Doc/whatsnew/3.8.rst
M Lib/shutil.py
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index 91e0d5bb7b33..e5e6d4a59944 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -277,6 +277,14 @@ Optimizations
 See :ref:`shutil-platform-dependent-efficient-copy-operations` section.
 (Contributed by Giampaolo Rodola' in :issue:`25427`.)
 
+* :func:`shutil.copytree` uses :func:`os.scandir` function and all copy
+ functions depending from it use cached :func:`os.stat` values. The speedup
+ for copying a directory with 8000 files is around +9% on Linux, +20% on
+ Windows and +30% on a Windows SMB share. Also the number of :func:`os.stat`
+ syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
+ on network filesystems. (Contributed by Giampaolo Rodola' in :issue:`33695`.)
+
+
 * The default protocol in the :mod:`pickle` module is now Protocol 4,
 first introduced in Python 3.4. It offers better performance and smaller
 size compared to Protocol 3 available since Python 3.0.
diff --git a/Lib/shutil.py b/Lib/shutil.py
index b7a7df3a51fa..74348ba62ef7 100644
--- a/Lib/shutil.py
+++ b/Lib/shutil.py
@@ -200,6 +200,12 @@ def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
 
 def _samefile(src, dst):
 # Macintosh, Unix.
+ if isinstance(src, os.DirEntry) and hasattr(os.path, 'samestat'):
+ try:
+ return os.path.samestat(src.stat(), os.stat(dst))
+ except OSError:
+ return False
+
 if hasattr(os.path, 'samefile'):
 try:
 return os.path.samefile(src, dst)
@@ -210,6 +216,12 @@ def _samefile(src, dst):
 return (os.path.normcase(os.path.abspath(src)) ==
 os.path.normcase(os.path.abspath(dst)))
 
+def _stat(fn):
+ return fn.stat() if isinstance(fn, os.DirEntry) else os.stat(fn)
+
+def _islink(fn):
+ return fn.is_symlink() if isinstance(fn, os.DirEntry) else os.path.islink(fn)
+
 def copyfile(src, dst, *, follow_symlinks=True):
 """Copy data from src to dst in the most efficient way possible.
 
@@ -223,18 +235,19 @@ def copyfile(src, dst, *, follow_symlinks=True):
 file_size = 0
 for i, fn in enumerate([src, dst]):
 try:
- st = os.stat(fn)
+ st = _stat(fn)
 except OSError:
 # File most likely does not exist
 pass
 else:
 # XXX What about other special files? (sockets, devices...)
 if stat.S_ISFIFO(st.st_mode):
+ fn = fn.path if isinstance(fn, os.DirEntry) else fn
 raise SpecialFileError("`%s` is a named pipe" % fn)
 if _WINDOWS and i == 0:
 file_size = st.st_size
 
- if not follow_symlinks and os.path.islink(src):
+ if not follow_symlinks and _islink(src):
 os.symlink(os.readlink(src), dst)
 else:
 with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
@@ -270,13 +283,13 @@ def copymode(src, dst, *, follow_symlinks=True):
 (e.g. Linux) this method does nothing.
 
 """
- if not follow_symlinks and os.path.islink(src) and os.path.islink(dst):
+ if not follow_symlinks and _islink(src) and os.path.islink(dst):
 if hasattr(os, 'lchmod'):
 stat_func, chmod_func = os.lstat, os.lchmod
 else:
 return
 elif hasattr(os, 'chmod'):
- stat_func, chmod_func = os.stat, os.chmod
+ stat_func, chmod_func = _stat, os.chmod
 else:
 return
 
@@ -325,7 +338,7 @@ def _nop(*args, ns=None, follow_symlinks=None):
 pass
 
 # follow symlinks (aka don't not follow symlinks)
- follow = follow_symlinks or not (os.path.islink(src) and os.path.islink(dst))
+ follow = follow_symlinks or not (_islink(src) and os.path.islink(dst))
 if follow:
 # use the real function if it exists
 def lookup(name):
@@ -339,7 +352,10 @@ def lookup(name):
 return fn
 return _nop
 
- st = lookup("stat")(src, follow_symlinks=follow)
+ if isinstance(src, os.DirEntry):
+ st = src.stat(follow_symlinks=follow)
+ else:
+ st = lookup("stat")(src, follow_symlinks=follow)
 mode = stat.S_IMODE(st.st_mode)
 lookup("utime")(dst, ns=(st.st_atime_ns, st.st_mtime_ns),
 follow_symlinks=follow)
@@ -415,79 +431,47 @@ def _ignore_patterns(path, names):
 return set(ignored_names)
 return _ignore_patterns
 
-def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
- ignore_dangling_symlinks=False):
- """Recursively copy a directory tree.
-
- The destination directory must not already exist.
- If exception(s) occur, an Error is raised with a list of reasons.
-
- If the optional symlinks flag is true, symbolic links in the
- source tree result in symbolic links in the destination tree; if
- it is false, the contents of the files pointed to by symbolic
- links are copied. If the file pointed by the symlink doesn't
- exist, an exception will be added in the list of errors raised in
- an Error exception at the end of the copy process.
-
- You can set the optional ignore_dangling_symlinks flag to true if you
- want to silence this exception. Notice that this has no effect on
- platforms that don't support os.symlink.
-
- The optional ignore argument is a callable. If given, it
- is called with the `src` parameter, which is the directory
- being visited by copytree(), and `names` which is the list of
- `src` contents, as returned by os.listdir():
-
- callable(src, names) -> ignored_names
-
- Since copytree() is called recursively, the callable will be
- called once for each directory that is copied. It returns a
- list of names relative to the `src` directory that should
- not be copied.
-
- The optional copy_function argument is a callable that will be used
- to copy each file. It will be called with the source path and the
- destination path as arguments. By default, copy2() is used, but any
- function that supports the same signature (like copy()) can be used.
-
- """
- names = os.listdir(src)
+def _copytree(entries, src, dst, symlinks, ignore, copy_function,
+ ignore_dangling_symlinks):
 if ignore is not None:
- ignored_names = ignore(src, names)
+ ignored_names = ignore(src, set(os.listdir(src)))
 else:
 ignored_names = set()
 
 os.makedirs(dst)
 errors = []
- for name in names:
- if name in ignored_names:
+ use_srcentry = copy_function is copy2 or copy_function is copy
+
+ for srcentry in entries:
+ if srcentry.name in ignored_names:
 continue
- srcname = os.path.join(src, name)
- dstname = os.path.join(dst, name)
+ srcname = os.path.join(src, srcentry.name)
+ dstname = os.path.join(dst, srcentry.name)
+ srcobj = srcentry if use_srcentry else srcname
 try:
- if os.path.islink(srcname):
+ if srcentry.is_symlink():
 linkto = os.readlink(srcname)
 if symlinks:
 # We can't just leave it to `copy_function` because legacy
 # code with a custom `copy_function` may rely on copytree
 # doing the right thing.
 os.symlink(linkto, dstname)
- copystat(srcname, dstname, follow_symlinks=not symlinks)
+ copystat(srcobj, dstname, follow_symlinks=not symlinks)
 else:
 # ignore dangling symlink if the flag is on
 if not os.path.exists(linkto) and ignore_dangling_symlinks:
 continue
 # otherwise let the copy occurs. copy2 will raise an error
- if os.path.isdir(srcname):
- copytree(srcname, dstname, symlinks, ignore,
+ if srcentry.is_dir():
+ copytree(srcobj, dstname, symlinks, ignore,
 copy_function)
 else:
- copy_function(srcname, dstname)
- elif os.path.isdir(srcname):
- copytree(srcname, dstname, symlinks, ignore, copy_function)
+ copy_function(srcobj, dstname)
+ elif srcentry.is_dir():
+ copytree(srcobj, dstname, symlinks, ignore, copy_function)
 else:
 # Will raise a SpecialFileError for unsupported file types
- copy_function(srcname, dstname)
+ copy_function(srcentry, dstname)
 # catch the Error from the recursive copytree so that we can
 # continue with other files
 except Error as err:
@@ -504,6 +488,47 @@ def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
 raise Error(errors)
 return dst
 
+def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
+ ignore_dangling_symlinks=False):
+ """Recursively copy a directory tree.
+
+ The destination directory must not already exist.
+ If exception(s) occur, an Error is raised with a list of reasons.
+
+ If the optional symlinks flag is true, symbolic links in the
+ source tree result in symbolic links in the destination tree; if
+ it is false, the contents of the files pointed to by symbolic
+ links are copied. If the file pointed by the symlink doesn't
+ exist, an exception will be added in the list of errors raised in
+ an Error exception at the end of the copy process.
+
+ You can set the optional ignore_dangling_symlinks flag to true if you
+ want to silence this exception. Notice that this has no effect on
+ platforms that don't support os.symlink.
+
+ The optional ignore argument is a callable. If given, it
+ is called with the `src` parameter, which is the directory
+ being visited by copytree(), and `names` which is the list of
+ `src` contents, as returned by os.listdir():
+
+ callable(src, names) -> ignored_names
+
+ Since copytree() is called recursively, the callable will be
+ called once for each directory that is copied. It returns a
+ list of names relative to the `src` directory that should
+ not be copied.
+
+ The optional copy_function argument is a callable that will be used
+ to copy each file. It will be called with the source path and the
+ destination path as arguments. By default, copy2() is used, but any
+ function that supports the same signature (like copy()) can be used.
+
+ """
+ with os.scandir(src) as entries:
+ return _copytree(entries=entries, src=src, dst=dst, symlinks=symlinks,
+ ignore=ignore, copy_function=copy_function,
+ ignore_dangling_symlinks=ignore_dangling_symlinks)
+
 # version vulnerable to race conditions
 def _rmtree_unsafe(path, onerror):
 try:
diff --git a/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
new file mode 100644
index 000000000000..21950453b0ad
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-23-12-47-37.bpo-33695.seRTxh.rst
@@ -0,0 +1,7 @@
+:func:`shutil.copytree` uses :func:`os.scandir` function and all copy
+functions depending from it use cached :func:`os.stat` values. The speedup
+for copying a directory with 8000 files is around +9% on Linux, +20% on
+Windows and + 30% on a Windows SMB share. Also the number of :func:`os.stat`
+syscalls is reduced by 38% making :func:`shutil.copytree` especially faster
+on network filesystems.
+(Contributed by Giampaolo Rodola' in :issue:`33695`.)


More information about the Python-checkins mailing list

AltStyle によって変換されたページ (->オリジナル) /