Skip to content

Commit 16c302f

Browse files
committed
Simplify code for getting a unicode codepoint's canonical class.
Three places in unicode_norm.c use similar logic to get the combining class of a codepoint. Commit 2991ac5 added the function get_canonical_class() for this purpose, but it was only called by the backend. This commit refactors the code to use this function everywhere the combining class is retrieved from a given codepoint.

Author: John Naylor
Discussion: https://postgr.es/m/CAFBsxsHUV7s7YrOm6hFz-Jq8Sc7K_yxTkfNZxsDV-DuM-k-gwg@mail.gmail.com
1 parent df99ddc commit 16c302f

File tree

1 file changed

+22
-25
lines changed

1 file changed

+22
-25
lines changed

src/common/unicode_norm.c

Lines changed: 22 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,23 @@ get_code_entry(pg_wchar code)
105105
#endif
106106
}
107107

108+
/*
109+
* Get the combining class of the given codepoint.
110+
*/
111+
static uint8
112+
get_canonical_class(pg_wchar code)
113+
{
114+
const pg_unicode_decomposition *entry = get_code_entry(code);
115+
116+
/*
117+
* If no entries are found, the character used is either an Hangul
118+
* character or a character with a class of 0 and no decompositions.
119+
*/
120+
if (!entry)
121+
return 0;
122+
else
123+
return entry->comb_class;
124+
}
108125

109126
/*
110127
* Given a decomposition entry looked up earlier, get the decomposed
@@ -430,16 +447,8 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
430447
pg_wchar prev = decomp_chars[count - 1];
431448
pg_wchar next = decomp_chars[count];
432449
pg_wchar tmp;
433-
const pg_unicode_decomposition *prevEntry = get_code_entry(prev);
434-
const pg_unicode_decomposition *nextEntry = get_code_entry(next);
435-
436-
/*
437-
* If no entries are found, the character used is either an Hangul
438-
* character or a character with a class of 0 and no decompositions,
439-
* so move to next result.
440-
*/
441-
if (prevEntry == NULL || nextEntry == NULL)
442-
continue;
450+
const uint8 prevClass = get_canonical_class(prev);
451+
const uint8 nextClass = get_canonical_class(next);
443452

444453
/*
445454
* Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
@@ -449,10 +458,10 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
449458
* combining class for the second, and the second is not a starter. A
450459
* character is a starter if its combining class is 0.
451460
*/
452-
if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
461+
if (prevClass == 0 || nextClass == 0)
453462
continue;
454463

455-
if (prevEntry->comb_class <= nextEntry->comb_class)
464+
if (prevClass <= nextClass)
456465
continue;
457466

458467
/* exchange can happen */
@@ -489,8 +498,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
489498
for (count = 1; count < decomp_size; count++)
490499
{
491500
pg_wchar ch = decomp_chars[count];
492-
const pg_unicode_decomposition *ch_entry = get_code_entry(ch);
493-
int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
501+
int ch_class = get_canonical_class(ch);
494502
pg_wchar composite;
495503

496504
if (last_class < ch_class &&
@@ -527,17 +535,6 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
527535
/* We only need this in the backend. */
528536
#ifndef FRONTEND
529537

530-
static uint8
531-
get_canonical_class(pg_wchar ch)
532-
{
533-
const pg_unicode_decomposition *entry = get_code_entry(ch);
534-
535-
if (!entry)
536-
return 0;
537-
else
538-
return entry->comb_class;
539-
}
540-
541538
static const pg_unicode_normprops *
542539
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
543540
{

0 commit comments

Comments
 (0)