From be59df8359083b9dce08ab4eb8b83a4e77799f33 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 20:27:55 +0100 Subject: [PATCH 01/11] Add whitespace tests --- Lib/test/test_textwrap.py | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index dfbc2b93dfc0d6..6478c75dd854e4 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -769,6 +769,46 @@ def assertUnchanged(self, text): """assert that dedent() has no effect on 'text'""" self.assertEqual(text, dedent(text)) + def test_dedent_only_whitespace(self): + # The empty string. + text = "" + self.assertUnchanged(text) + + # Only spaces. + text = " " + expect = "" + self.assertEqual(expect, dedent(text)) + + # Only tabs. + text = "\t\t\t\t" + expect = "" + self.assertEqual(expect, dedent(text)) + + # A mixture. + text = " \t \t\t \t " + expect = "" + self.assertEqual(expect, dedent(text)) + + # ASCII whitespace. + text = "\f\n\r\t\v " + expect = "\f\n\r\t\v " + self.assertEqual(expect, dedent(text)) + + # One newline. + text = "\n" + expect = "\n" + self.assertEqual(expect, dedent(text)) + + # Windows-style newlines. + text = "\r\n" + expect = "\r\n" + self.assertEqual(expect, dedent(text)) + + # Whitespace mixture. + text = " \n\t\n \n\t\t\n\n\n " + expect = "\n\n\n\n\n\n" + self.assertEqual(expect, dedent(text)) + def test_dedent_nomargin(self): # No lines indented. text = "Hello there.\nHow are you?\nOh good, I'm glad." From fec6717d1d6546e4b5df76211c9add57b8e2110e Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 19:51:40 +0100 Subject: [PATCH 02/11] Optimise ``textwrap.dedent()`` Co-authored-by: Marius Juston --- Lib/test/test_textwrap.py | 4 +- Lib/textwrap.py | 55 ++++++------------- ...-03-30-19-55-10.gh-issue-131792.NNjzFA.rst | 2 + 3 files changed, 22 insertions(+), 39 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 6478c75dd854e4..7715d14fd12b5e 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -791,7 +791,7 @@ def test_dedent_only_whitespace(self): # ASCII whitespace. text = "\f\n\r\t\v " - expect = "\f\n\r\t\v " + expect = "\n" self.assertEqual(expect, dedent(text)) # One newline. @@ -801,7 +801,7 @@ def test_dedent_only_whitespace(self): # Windows-style newlines. text = "\r\n" - expect = "\r\n" + expect = "\n" self.assertEqual(expect, dedent(text)) # Whitespace mixture. diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 1bf07aa46cad99..99950278a690cf 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -413,9 +413,6 @@ def shorten(text, width, **kwargs): # -- Loosely related functionality ------------------------------------- -_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE) -_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE) - def dedent(text): """Remove any common leading whitespace from every line in `text`. @@ -429,42 +426,26 @@ def dedent(text): Entirely blank lines are normalized to a newline character. """ - # Look for the longest leading string of spaces and tabs common to - # all lines. - margin = None - text = _whitespace_only_re.sub('', text) - indents = _leading_whitespace_re.findall(text) - for indent in indents: - if margin is None: - margin = indent - - # Current line more deeply indented than previous winner: - # no change (previous winner is still on top). - elif indent.startswith(margin): - pass - - # Current line consistent with and no deeper than previous winner: - # it's the new winner. - elif margin.startswith(indent): - margin = indent - - # Find the largest common whitespace between current line and previous - # winner. - else: - for i, (x, y) in enumerate(zip(margin, indent)): - if x != y: - margin = margin[:i] - break + if not text: + return text + + lines = text.split('\n') + + non_blank_lines = [l for l in lines if l and not l.isspace()] + if not non_blank_lines: + return '\n'.join([l if l and not l.isspace() else '' for l in lines]) - # sanity check (testing/debugging only) - if 0 and margin: - for line in text.split("\n"): - assert not line or line.startswith(margin), \ - "line = %r, margin = %r" % (line, margin) + # Get length of leading whitespace, inspired by ``os.path.commonprefix()`` + l1 = min(non_blank_lines) + l2 = max(non_blank_lines) + margin = 0 + for i, c in enumerate(l1): + if c != l2[i] or c not in ' \t': + break + margin += 1 - if margin: - text = re.sub(r'(?m)^' + margin, '', text) - return text + return '\n'.join([l[margin:] if l and not l.isspace() else '' + for l in lines]) def indent(text, prefix, predicate=None): diff --git a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst new file mode 100644 index 00000000000000..d1ae12461bcf9c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst @@ -0,0 +1,2 @@ +Improved performance of :func: `textwrap.dedent` by ~2.4x. +Patch by Adam Turner and Marius Juston. From 19e76d3efde3527eb2b38fa6cea65b17be22d4d7 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 20:33:19 +0100 Subject: [PATCH 03/11] Further micro-optimisation for entirely blank input --- Lib/textwrap.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 99950278a690cf..6d87e3b1207cd6 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -429,13 +429,14 @@ def dedent(text): if not text: return text - lines = text.split('\n') + # If the input is entirely whitespace, return normalized lines + if text.isspace(): + return '\n' * text.count('\n') - non_blank_lines = [l for l in lines if l and not l.isspace()] - if not non_blank_lines: - return '\n'.join([l if l and not l.isspace() else '' for l in lines]) + lines = text.split('\n') # Get length of leading whitespace, inspired by ``os.path.commonprefix()`` + non_blank_lines = [l for l in lines if l and not l.isspace()] l1 = min(non_blank_lines) l2 = max(non_blank_lines) margin = 0 From 850c07ab83f5c5b140b58162259d887fca6eab59 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 21:26:43 +0100 Subject: [PATCH 04/11] whitespace --- .../next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst index d1ae12461bcf9c..934d0afd05efcc 100644 --- a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst +++ b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst @@ -1,2 +1,2 @@ -Improved performance of :func: `textwrap.dedent` by ~2.4x. +Improved performance of :func:`textwrap.dedent` by ~2.4x. Patch by Adam Turner and Marius Juston. From 796fb3eefceb8d68c129c912eeae20ce9e1de2b1 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 22:09:42 +0100 Subject: [PATCH 05/11] Use margin as the loop index variable --- Lib/textwrap.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 6d87e3b1207cd6..a5da476355faea 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -439,11 +439,9 @@ def dedent(text): non_blank_lines = [l for l in lines if l and not l.isspace()] l1 = min(non_blank_lines) l2 = max(non_blank_lines) - margin = 0 - for i, c in enumerate(l1): - if c != l2[i] or c not in ' \t': + for margin, c in enumerate(l1): + if c != l2[margin] or c not in ' \t': break - margin += 1 return '\n'.join([l[margin:] if l and not l.isspace() else '' for l in lines]) From ecc9cfcff34c5e7e23fce7bd115f44e7eaa1280d Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 22:10:22 +0100 Subject: [PATCH 06/11] Add full stops --- Lib/textwrap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index a5da476355faea..573ab49be1de18 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -429,13 +429,13 @@ def dedent(text): if not text: return text - # If the input is entirely whitespace, return normalized lines + # If the input is entirely whitespace, return normalized lines. if text.isspace(): return '\n' * text.count('\n') lines = text.split('\n') - # Get length of leading whitespace, inspired by ``os.path.commonprefix()`` + # Get length of leading whitespace, inspired by ``os.path.commonprefix()``. non_blank_lines = [l for l in lines if l and not l.isspace()] l1 = min(non_blank_lines) l2 = max(non_blank_lines) From 311ac87656935d32ae49defc4b51606964977550 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 22:43:53 +0100 Subject: [PATCH 07/11] Further micro-optimisation suggested by Pieter Eendebak --- Lib/textwrap.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 573ab49be1de18..cdda2a6811decd 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -443,8 +443,7 @@ def dedent(text): if c != l2[margin] or c not in ' \t': break - return '\n'.join([l[margin:] if l and not l.isspace() else '' - for l in lines]) + return '\n'.join([l[margin:] if not l.isspace() else '' for l in lines]) def indent(text, prefix, predicate=None): From f2050aef1128021cb6fa5e29b60cb19741d3be2f Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 30 Mar 2025 23:05:16 +0100 Subject: [PATCH 08/11] Remove early-exit check for whitespace-only input, as suggested by Pieter Eendebak --- Lib/textwrap.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index cdda2a6811decd..bb6a1186316275 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -429,16 +429,13 @@ def dedent(text): if not text: return text - # If the input is entirely whitespace, return normalized lines. - if text.isspace(): - return '\n' * text.count('\n') - lines = text.split('\n') # Get length of leading whitespace, inspired by ``os.path.commonprefix()``. non_blank_lines = [l for l in lines if l and not l.isspace()] - l1 = min(non_blank_lines) - l2 = max(non_blank_lines) + l1 = min(non_blank_lines, default='') + l2 = max(non_blank_lines, default='') + margin = 0 for margin, c in enumerate(l1): if c != l2[margin] or c not in ' \t': break From 1f73bcbc04ac844dd63df32f6cbbdafa68cefa5f Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Mon, 31 Mar 2025 01:02:57 +0100 Subject: [PATCH 09/11] More test updates --- Lib/test/test_textwrap.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 7715d14fd12b5e..77366988b57fa7 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -769,7 +769,7 @@ def assertUnchanged(self, text): """assert that dedent() has no effect on 'text'""" self.assertEqual(text, dedent(text)) - def test_dedent_only_whitespace(self): + def test_dedent_whitespace(self): # The empty string. text = "" self.assertUnchanged(text) @@ -800,8 +800,8 @@ def test_dedent_only_whitespace(self): self.assertEqual(expect, dedent(text)) # Windows-style newlines. - text = "\r\n" - expect = "\n" + text = "\r\n" * 5 + expect = "\n" * 5 self.assertEqual(expect, dedent(text)) # Whitespace mixture. @@ -809,6 +809,16 @@ def test_dedent_only_whitespace(self): expect = "\n\n\n\n\n\n" self.assertEqual(expect, dedent(text)) + # Lines consisting only of whitespace are always normalised + text = "a\n \n\t\n" + expect = "a\n\n\n" + self.assertEqual(expect, dedent(text)) + + # Whitespace characters on non-empty lines are retained + text = "a\r\n\r\n\r\n" + expect = "a\r\n\n\n" + self.assertEqual(expect, dedent(text)) + def test_dedent_nomargin(self): # No lines indented. text = "Hello there.\nHow are you?\nOh good, I'm glad." From cc67165718b620e86c06a7a1aa516234d89f4dad Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Mon, 31 Mar 2025 01:05:17 +0100 Subject: [PATCH 10/11] Update NEWS --- .../Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst index 934d0afd05efcc..6c97638dc84e89 100644 --- a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst +++ b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst @@ -1,2 +1,4 @@ -Improved performance of :func:`textwrap.dedent` by ~2.4x. +Improved performance of :func:`textwrap.dedent` by ~2.4x, +and fixed a bug where blank lines with whitespace characters other than space +or horizontal tab were not normalised to the newline. Patch by Adam Turner and Marius Juston. From bf58dfa27118c0f724eb8395fc9db66df98fccc1 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Mon, 31 Mar 2025 01:11:07 +0100 Subject: [PATCH 11/11] Update NEWS --- .../Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst index 6c97638dc84e89..62b619c0d80f4d 100644 --- a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst +++ b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst @@ -1,4 +1,5 @@ -Improved performance of :func:`textwrap.dedent` by ~2.4x, +Improved performance of :func:`textwrap.dedent` by an average of ~2.4x, +(with improvements of up to 4x for large inputs), and fixed a bug where blank lines with whitespace characters other than space or horizontal tab were not normalised to the newline. -Patch by Adam Turner and Marius Juston. +Patch by Adam Turner, Marius Juston, and Pieter Eendebak.