From 1fd83f134473efe04b390b7038fec9bc01d5940f Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 3 Apr 2025 00:54:04 +0100 Subject: [PATCH 1/5] Optimise import time for ``shlex`` --- Lib/shlex.py | 23 ++++++++++++------- ...-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst | 2 ++ 2 files changed, 17 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst diff --git a/Lib/shlex.py b/Lib/shlex.py index f4821616b62a0f..b299f65be373b4 100644 --- a/Lib/shlex.py +++ b/Lib/shlex.py @@ -7,25 +7,22 @@ # iterator interface by Gustavo Niemeyer, April 2003. # changes to tokenize more like Posix shells by Vinay Sajip, July 2016. -import os -import re -import sys -from collections import deque - -from io import StringIO - __all__ = ["shlex", "split", "quote", "join"] class shlex: "A lexical analyzer class for simple shell-like syntaxes." def __init__(self, instream=None, infile=None, posix=False, punctuation_chars=False): + from collections import deque # deferred import for performance + if isinstance(instream, str): + from io import StringIO # deferred import for performance instream = StringIO(instream) if instream is not None: self.instream = instream self.infile = infile else: + import sys # deferred import for performance self.instream = sys.stdin self.infile = None self.posix = posix @@ -78,6 +75,7 @@ def push_token(self, tok): def push_source(self, newstream, newfile=None): "Push an input source onto the lexer's input source stack." if isinstance(newstream, str): + from io import StringIO # deferred import for performance newstream = StringIO(newstream) self.filestack.appendleft((self.infile, self.instream, self.lineno)) self.infile = newfile @@ -278,6 +276,7 @@ def read_token(self): def sourcehook(self, newfile): "Hook called on a filename to be sourced." + import os.path if newfile[0] == '"': newfile = newfile[1:-1] # This implements cpp-like semantics for relative-path inclusion. @@ -318,7 +317,14 @@ def join(split_command): return ' '.join(quote(arg) for arg in split_command) -_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search +def _find_unsafe(s, /): + # this function replaces itself with the compiled pattern on execution, + # to allow as deferred import of re for performance + global _find_unsafe + import re + _find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search + return _find_unsafe(s) + def quote(s): """Return a shell-escaped version of the string *s*.""" @@ -337,6 +343,7 @@ def _print_tokens(lexer): print("Token: " + repr(tt)) if __name__ == '__main__': + import sys # deferred import for performance if len(sys.argv) == 1: _print_tokens(shlex()) else: diff --git a/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst b/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst new file mode 100644 index 00000000000000..c3b599b041aa64 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst @@ -0,0 +1,2 @@ +Improve import times by up to 33x for the :mod:`shlex` module. Patch by Adam +Turner. From 8811463aed98455641958e4a64cf7ba242ac283e Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:07:58 +0100 Subject: [PATCH 2/5] Revert deferral of sys and io --- Lib/shlex.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/shlex.py b/Lib/shlex.py index b299f65be373b4..29278931a4febd 100644 --- a/Lib/shlex.py +++ b/Lib/shlex.py @@ -7,6 +7,9 @@ # iterator interface by Gustavo Niemeyer, April 2003. # changes to tokenize more like Posix shells by Vinay Sajip, July 2016. +import sys +from io import StringIO + __all__ = ["shlex", "split", "quote", "join"] class shlex: @@ -16,13 +19,11 @@ def __init__(self, instream=None, infile=None, posix=False, from collections import deque # deferred import for performance if isinstance(instream, str): - from io import StringIO # deferred import for performance instream = StringIO(instream) if instream is not None: self.instream = instream self.infile = infile else: - import sys # deferred import for performance self.instream = sys.stdin self.infile = None self.posix = posix @@ -75,7 +76,6 @@ def push_token(self, tok): def push_source(self, newstream, newfile=None): "Push an input source onto the lexer's input source stack." if isinstance(newstream, str): - from io import StringIO # deferred import for performance newstream = StringIO(newstream) self.filestack.appendleft((self.infile, self.instream, self.lineno)) self.infile = newfile @@ -343,7 +343,6 @@ def _print_tokens(lexer): print("Token: " + repr(tt)) if __name__ == '__main__': - import sys # deferred import for performance if len(sys.argv) == 1: _print_tokens(shlex()) else: From bd6916a9d45e68ce9f5de60f79cb0d9e049eb61d Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 6 Apr 2025 05:13:47 +0100 Subject: [PATCH 3/5] Switch to bytes.translate() based approach --- Lib/shlex.py | 16 ++++++---------- ...025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst | 5 +++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/Lib/shlex.py b/Lib/shlex.py index 29278931a4febd..9272955cabfd8c 100644 --- a/Lib/shlex.py +++ b/Lib/shlex.py @@ -317,20 +317,16 @@ def join(split_command): return ' '.join(quote(arg) for arg in split_command) -def _find_unsafe(s, /): - # this function replaces itself with the compiled pattern on execution, - # to allow as deferred import of re for performance - global _find_unsafe - import re - _find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search - return _find_unsafe(s) - - def quote(s): """Return a shell-escaped version of the string *s*.""" if not s: return "''" - if _find_unsafe(s) is None: + + # Use bytes.translate() for performance + safe_chars = (b'%+,-./0123456789:=@' + b'ABCDEFGHIJKLMNOPQRSTUVWXYZ_' + b'abcdefghijklmnopqrstuvwxyz') + if not s.encode().translate(None, delete=safe_chars): return s # use single quotes, and put single quotes into double quotes diff --git a/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst b/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst index c3b599b041aa64..6b4b3ed7526a8b 100644 --- a/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst +++ b/Misc/NEWS.d/next/Library/2025-04-03-00-56-48.gh-issue-118761.Vb0S1B.rst @@ -1,2 +1,3 @@ -Improve import times by up to 33x for the :mod:`shlex` module. Patch by Adam -Turner. +Improve import times by up to 33x for the :mod:`shlex` module, +and improve the performance of :func:`shlex.quote` by up to 12x. +Patch by Adam Turner. From 192329ea333c1e19e5ef251161afe4f221217a78 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+AA-Turner@users.noreply.github.com> Date: Fri, 18 Apr 2025 02:50:26 +0100 Subject: [PATCH 4/5] Update Lib/shlex.py --- Lib/shlex.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/shlex.py b/Lib/shlex.py index 9272955cabfd8c..5bf6e0d70e0012 100644 --- a/Lib/shlex.py +++ b/Lib/shlex.py @@ -326,7 +326,8 @@ def quote(s): safe_chars = (b'%+,-./0123456789:=@' b'ABCDEFGHIJKLMNOPQRSTUVWXYZ_' b'abcdefghijklmnopqrstuvwxyz') - if not s.encode().translate(None, delete=safe_chars): + # No quoting is needed if `s` is an ASCII string consisting only of `safe_chars` + if s.isascii() and not s.encode().translate(None, delete=safe_chars): return s # use single quotes, and put single quotes into double quotes From 4a640ce2f34753d8c0c55f3cecdaa1f406f3dd00 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Mon, 21 Apr 2025 01:33:49 +0100 Subject: [PATCH 5/5] Add test_lazy_imports --- Lib/test/test_shlex.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_shlex.py b/Lib/test/test_shlex.py index 797c91ee7effdf..f35571ea88654d 100644 --- a/Lib/test/test_shlex.py +++ b/Lib/test/test_shlex.py @@ -3,6 +3,7 @@ import shlex import string import unittest +from test.support import import_helper # The original test data set was from shellwords, by Hartmut Goebel. @@ -363,6 +364,9 @@ def testPunctuationCharsReadOnly(self): with self.assertRaises(AttributeError): shlex_instance.punctuation_chars = False + def test_lazy_imports(self): + import_helper.ensure_lazy_imports('shlex', {'collections', 're', 'os'}) + # Allow this test to be used with old shlex.py if not getattr(shlex, "split", None):