From 10bf439929fdb396fe663304777d6221643d466f Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 4 Feb 2023 00:57:09 +0100 Subject: [PATCH 1/5] Don't do toupper() redudantly in encoding the data for metaphone All inputs for ENCODE() are already uppercase, so there's no need to spend time uppercasing them again. --- ext/standard/metaphone.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/standard/metaphone.c b/ext/standard/metaphone.c index 2ba7a839c88e9..7453abfa3ac66 100644 --- a/ext/standard/metaphone.c +++ b/ext/standard/metaphone.c @@ -78,7 +78,8 @@ static const char _codes[26] = }; -#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0) +/* Note: these macros require an uppercase letter input! */ +#define ENCODE(c) (isalpha(c) ? _codes[((c) - 'A')] : 0) #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */ From d5945f65882ed4042dbbce5274a21d9f8b537118 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 4 Feb 2023 00:57:53 +0100 Subject: [PATCH 2/5] Don't compute uppercase letter redudantly in checks If it's a zero-terminator check, or an isalpha() check there's no need to convert it to uppercase first. --- ext/standard/metaphone.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ext/standard/metaphone.c b/ext/standard/metaphone.c index 7453abfa3ac66..4a1118d7d5d41 100644 --- a/ext/standard/metaphone.c +++ b/ext/standard/metaphone.c @@ -103,15 +103,17 @@ static const char _codes[26] = * accesssing the array directly... */ /* Look at the next letter in the word */ -#define Next_Letter (toupper(word[w_idx+1])) +#define Raw_Next_Letter (word[w_idx+1]) +#define Next_Letter (toupper(Raw_Next_Letter)) /* Look at the current letter in the word */ -#define Curr_Letter (toupper(word[w_idx])) +#define Raw_Curr_Letter (word[w_idx]) +#define Curr_Letter (toupper(Raw_Curr_Letter)) /* Go N letters back. */ #define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') /* Previous letter. I dunno, should this return null on failure? */ #define Prev_Letter (Look_Back_Letter(1)) /* Look two letters down. It makes sure you don't walk off the string. */ -#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \ +#define After_Next_Letter (Raw_Next_Letter != '\0' ? toupper(word[w_idx+2]) \ : '\0') #define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n))) @@ -180,9 +182,9 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /*-- The first phoneme has to be processed specially. --*/ /* Find our first letter */ - for (; !isalpha(Curr_Letter); w_idx++) { + for (; !isalpha(Raw_Curr_Letter); w_idx++) { /* On the off chance we were given nothing but crap... */ - if (Curr_Letter == '\0') { + if (Raw_Curr_Letter == '\0') { End_Phoned_Word(); return; } @@ -248,7 +250,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* On to the metaphoning */ - for (; Curr_Letter != '\0' && + for (; Raw_Curr_Letter != '\0' && (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes); w_idx++) { /* How many letters to skip because an eariler encoding handled @@ -264,7 +266,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem */ /* Ignore non-alphas */ - if (!isalpha(Curr_Letter)) + if (!isalpha(Raw_Curr_Letter)) continue; /* Drop duplicates, except CC */ From 18d18bf811cbd2811927c827a9dc511a4f194917 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 4 Feb 2023 00:58:40 +0100 Subject: [PATCH 3/5] Cleanup LookAhead helper --- ext/standard/metaphone.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ext/standard/metaphone.c b/ext/standard/metaphone.c index 4a1118d7d5d41..f993478947f74 100644 --- a/ext/standard/metaphone.c +++ b/ext/standard/metaphone.c @@ -122,15 +122,13 @@ static const char _codes[26] = /* I probably could have just used strlen... */ static char Lookahead(char *word, int how_far) { - char letter_ahead = '\0'; /* null by default */ int idx; for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); /* Edge forward in the string... */ - letter_ahead = word[idx]; /* idx will be either == to how_far or - * at the end of the string + return word[idx]; /* idx will be either == to how_far or + * at the end of the string where ot will be null */ - return letter_ahead; } From 022bd7a1cb510d6d0509fa3e0a38312b915979d7 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 4 Feb 2023 14:20:05 +0100 Subject: [PATCH 4/5] Add some letter caching to metaphone to increase performance We don't have to re-read letters, and re-uppercase them if we already did it once. By caching these results we gain performance. Furthermore, we can avoid fetching and uppercasing in some conditions by first checking what we already had: e.g. if a condition depends on both Prev_Letter and After_Next_Letter, but we already have Prev_Letter cached we can place that first to avoid a fetch+toupper of the "after next letter". --- ext/standard/metaphone.c | 137 ++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 58 deletions(-) diff --git a/ext/standard/metaphone.c b/ext/standard/metaphone.c index f993478947f74..44ce2becb7ad7 100644 --- a/ext/standard/metaphone.c +++ b/ext/standard/metaphone.c @@ -102,18 +102,19 @@ static const char _codes[26] = /* I suppose I could have been using a character pointer instead of * accesssing the array directly... */ +#define Convert_Raw(c) toupper(c) /* Look at the next letter in the word */ -#define Raw_Next_Letter (word[w_idx+1]) -#define Next_Letter (toupper(Raw_Next_Letter)) +#define Read_Raw_Next_Letter (word[w_idx+1]) +#define Read_Next_Letter (Convert_Raw(Read_Raw_Next_Letter)) /* Look at the current letter in the word */ -#define Raw_Curr_Letter (word[w_idx]) -#define Curr_Letter (toupper(Raw_Curr_Letter)) +#define Read_Raw_Curr_Letter (word[w_idx]) +#define Read_Curr_Letter (Convert_Raw(Read_Raw_Curr_Letter)) /* Go N letters back. */ -#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') +#define Look_Back_Letter(n) (w_idx >= n ? Convert_Raw(word[w_idx-n]) : '\0') /* Previous letter. I dunno, should this return null on failure? */ -#define Prev_Letter (Look_Back_Letter(1)) +#define Read_Prev_Letter (Look_Back_Letter(1)) /* Look two letters down. It makes sure you don't walk off the string. */ -#define After_Next_Letter (Raw_Next_Letter != '\0' ? toupper(word[w_idx+2]) \ +#define Read_After_Next_Letter (Read_Raw_Next_Letter != '\0' ? Convert_Raw(word[w_idx+2]) \ : '\0') #define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n))) @@ -165,6 +166,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem int w_idx = 0; /* point in the phonization we're at. */ size_t p_idx = 0; /* end of the phoned phrase */ size_t max_buffer_len = 0; /* maximum length of the destination buffer */ + char curr_letter; ZEND_ASSERT(word != NULL); ZEND_ASSERT(max_phonemes >= 0); @@ -180,18 +182,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /*-- The first phoneme has to be processed specially. --*/ /* Find our first letter */ - for (; !isalpha(Raw_Curr_Letter); w_idx++) { + for (; !isalpha(curr_letter = Read_Raw_Curr_Letter); w_idx++) { /* On the off chance we were given nothing but crap... */ - if (Raw_Curr_Letter == '\0') { + if (curr_letter == '\0') { End_Phoned_Word(); return; } } - switch (Curr_Letter) { + curr_letter = Convert_Raw(curr_letter); + + switch (curr_letter) { /* AE becomes E */ case 'A': - if (Next_Letter == 'E') { + if (Read_Next_Letter == 'E') { Phonize('E'); w_idx += 2; } @@ -205,7 +209,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'G': case 'K': case 'P': - if (Next_Letter == 'N') { + if (Read_Next_Letter == 'N') { Phonize('N'); w_idx += 2; } @@ -213,16 +217,18 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* WH becomes W, WR becomes R W if followed by a vowel */ - case 'W': - if (Next_Letter == 'R') { - Phonize(Next_Letter); + case 'W': { + char next_letter = Read_Next_Letter; + if (next_letter == 'R') { + Phonize(next_letter); w_idx += 2; - } else if (Next_Letter == 'H' || isvowel(Next_Letter)) { + } else if (next_letter == 'H' || isvowel(next_letter)) { Phonize('W'); w_idx += 2; } /* else ignore */ break; + } /* X becomes S */ case 'X': Phonize('S'); @@ -237,7 +243,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'I': case 'O': case 'U': - Phonize(Curr_Letter); + Phonize(curr_letter); w_idx++; break; default: @@ -248,7 +254,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* On to the metaphoning */ - for (; Raw_Curr_Letter != '\0' && + for (; (curr_letter = Read_Raw_Curr_Letter) != '\0' && (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes); w_idx++) { /* How many letters to skip because an eariler encoding handled @@ -264,18 +270,23 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem */ /* Ignore non-alphas */ - if (!isalpha(Raw_Curr_Letter)) + if (!isalpha(curr_letter)) continue; + curr_letter = Convert_Raw(curr_letter); + /* Note: we can't cache curr_letter from the previous loop + * because of the skip_letter variable. */ + char prev_letter = Read_Prev_Letter; + /* Drop duplicates, except CC */ - if (Curr_Letter == Prev_Letter && - Curr_Letter != 'C') + if (curr_letter == prev_letter && + curr_letter != 'C') continue; - switch (Curr_Letter) { + switch (curr_letter) { /* B -> B unless in MB */ case 'B': - if (Prev_Letter != 'M') + if (prev_letter != 'M') Phonize('B'); break; /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. @@ -284,20 +295,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem * dropped if -SCI-, SCE-, -SCY- (handed in S) * else K */ - case 'C': - if (MAKESOFT(Next_Letter)) { /* C[IEY] */ - if (After_Next_Letter == 'A' && - Next_Letter == 'I') { /* CIA */ + case 'C': { + char next_letter = Read_Next_Letter; + if (MAKESOFT(next_letter)) { /* C[IEY] */ + if (next_letter == 'I' && Read_After_Next_Letter == 'A') { /* CIA */ Phonize(SH); } /* SC[IEY] */ - else if (Prev_Letter == 'S') { + else if (prev_letter == 'S') { /* Dropped */ } else { Phonize('S'); } - } else if (Next_Letter == 'H') { - if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ + } else if (next_letter == 'H') { + if ((!traditional) && (prev_letter == 'S' || Read_After_Next_Letter == 'R')) { /* Christ, School */ Phonize('K'); } else { Phonize(SH); @@ -307,12 +318,13 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem Phonize('K'); } break; + } /* J if in -DGE-, -DGI- or -DGY- * else T */ case 'D': - if (Next_Letter == 'G' && - MAKESOFT(After_Next_Letter)) { + if (Read_Next_Letter == 'G' && + MAKESOFT(Read_After_Next_Letter)) { Phonize('J'); skip_letter++; } else @@ -324,8 +336,9 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem * else J if in -GE-, -GI, -GY and not GG * else K */ - case 'G': - if (Next_Letter == 'H') { + case 'G': { + char next_letter = Read_Next_Letter; + if (next_letter == 'H') { if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) { Phonize('F'); @@ -333,38 +346,40 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem } else { /* silent */ } - } else if (Next_Letter == 'N') { - if (Isbreak(After_Next_Letter) || - (After_Next_Letter == 'E' && + } else if (next_letter == 'N') { + char after_next_letter = Read_After_Next_Letter; + if (Isbreak(after_next_letter) || + (after_next_letter == 'E' && Look_Ahead_Letter(3) == 'D')) { /* dropped */ } else Phonize('K'); - } else if (MAKESOFT(Next_Letter) && - Prev_Letter != 'G') { + } else if (MAKESOFT(next_letter) && + prev_letter != 'G') { Phonize('J'); } else { Phonize('K'); } break; + } /* H if before a vowel and not after C,G,P,S,T */ case 'H': - if (isvowel(Next_Letter) && - !AFFECTH(Prev_Letter)) + if (isvowel(Read_Next_Letter) && + !AFFECTH(prev_letter)) Phonize('H'); break; /* dropped if after C * else K */ case 'K': - if (Prev_Letter != 'C') + if (prev_letter != 'C') Phonize('K'); break; /* F if before H * else P */ case 'P': - if (Next_Letter == 'H') { + if (Read_Next_Letter == 'H') { Phonize('F'); } else { Phonize('P'); @@ -378,44 +393,50 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- * else S */ - case 'S': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || - After_Next_Letter == 'A')) { + case 'S': { + char next_letter = Read_Next_Letter; + char after_next_letter; + if (next_letter == 'I' && + ((after_next_letter = Read_After_Next_Letter) == 'O' || + after_next_letter == 'A')) { Phonize(SH); - } else if (Next_Letter == 'H') { + } else if (next_letter == 'H') { Phonize(SH); skip_letter++; - } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { + } else if ((!traditional) && (next_letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { Phonize(SH); skip_letter += 2; } else { Phonize('S'); } break; + } /* 'sh' in -TIA- or -TIO- * else 'th' before H * else T */ - case 'T': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || - After_Next_Letter == 'A')) { + case 'T': { + char next_letter = Read_Next_Letter; + char after_next_letter; + if (next_letter == 'I' && + ((after_next_letter = Read_After_Next_Letter) == 'O' || + after_next_letter == 'A')) { Phonize(SH); - } else if (Next_Letter == 'H') { + } else if (next_letter == 'H') { Phonize(TH); skip_letter++; - } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) { + } else if (!(next_letter == 'C' && Read_After_Next_Letter == 'H')) { Phonize('T'); } break; + } /* F */ case 'V': Phonize('F'); break; /* W before a vowel, else dropped */ case 'W': - if (isvowel(Next_Letter)) + if (isvowel(Read_Next_Letter)) Phonize('W'); break; /* KS */ @@ -425,7 +446,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem break; /* Y if followed by a vowel */ case 'Y': - if (isvowel(Next_Letter)) + if (isvowel(Read_Next_Letter)) Phonize('Y'); break; /* S */ @@ -439,7 +460,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'M': case 'N': case 'R': - Phonize(Curr_Letter); + Phonize(curr_letter); break; default: /* nothing */ From 2dd1d9ad4a712f5dd1a3d94e0f9208fb4496675b Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sun, 5 Feb 2023 16:03:40 +0100 Subject: [PATCH 5/5] Address review comments --- ext/standard/metaphone.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/standard/metaphone.c b/ext/standard/metaphone.c index 44ce2becb7ad7..7726bba2f17e7 100644 --- a/ext/standard/metaphone.c +++ b/ext/standard/metaphone.c @@ -128,7 +128,7 @@ static char Lookahead(char *word, int how_far) /* Edge forward in the string... */ return word[idx]; /* idx will be either == to how_far or - * at the end of the string where ot will be null + * at the end of the string where it will be null */ } @@ -220,7 +220,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'W': { char next_letter = Read_Next_Letter; if (next_letter == 'R') { - Phonize(next_letter); + Phonize('R'); w_idx += 2; } else if (next_letter == 'H' || isvowel(next_letter)) { Phonize('W');