Skip to content

Metaphone performance improvement #10501

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 83 additions & 61 deletions ext/standard/metaphone.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ static const char _codes[26] =
};


#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
/* Note: these macros require an uppercase letter input! */
#define ENCODE(c) (isalpha(c) ? _codes[((c) - 'A')] : 0)

#define isvowel(c) (ENCODE(c) & 1) /* AEIOU */

Expand All @@ -101,16 +102,19 @@ static const char _codes[26] =
/* I suppose I could have been using a character pointer instead of
* accesssing the array directly... */

#define Convert_Raw(c) toupper(c)
/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
#define Read_Raw_Next_Letter (word[w_idx+1])
#define Read_Next_Letter (Convert_Raw(Read_Raw_Next_Letter))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
#define Read_Raw_Curr_Letter (word[w_idx])
#define Read_Curr_Letter (Convert_Raw(Read_Raw_Curr_Letter))
/* Go N letters back. */
#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0')
#define Look_Back_Letter(n) (w_idx >= n ? Convert_Raw(word[w_idx-n]) : '\0')
/* Previous letter. I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
#define Read_Prev_Letter (Look_Back_Letter(1))
/* Look two letters down. It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
#define Read_After_Next_Letter (Read_Raw_Next_Letter != '\0' ? Convert_Raw(word[w_idx+2]) \
: '\0')
#define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n)))

Expand All @@ -119,15 +123,13 @@ static const char _codes[26] =
/* I probably could have just used strlen... */
static char Lookahead(char *word, int how_far)
{
char letter_ahead = '\0'; /* null by default */
int idx;
for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
/* Edge forward in the string... */

letter_ahead = word[idx]; /* idx will be either == to how_far or
* at the end of the string
return word[idx]; /* idx will be either == to how_far or
* at the end of the string where it will be null
*/
return letter_ahead;
}


Expand Down Expand Up @@ -164,6 +166,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
int w_idx = 0; /* point in the phonization we're at. */
size_t p_idx = 0; /* end of the phoned phrase */
size_t max_buffer_len = 0; /* maximum length of the destination buffer */
char curr_letter;
ZEND_ASSERT(word != NULL);
ZEND_ASSERT(max_phonemes >= 0);

Expand All @@ -179,18 +182,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem

/*-- The first phoneme has to be processed specially. --*/
/* Find our first letter */
for (; !isalpha(Curr_Letter); w_idx++) {
for (; !isalpha(curr_letter = Read_Raw_Curr_Letter); w_idx++) {
/* On the off chance we were given nothing but crap... */
if (Curr_Letter == '\0') {
if (curr_letter == '\0') {
End_Phoned_Word();
return;
}
}

switch (Curr_Letter) {
curr_letter = Convert_Raw(curr_letter);

switch (curr_letter) {
/* AE becomes E */
case 'A':
if (Next_Letter == 'E') {
if (Read_Next_Letter == 'E') {
Phonize('E');
w_idx += 2;
}
Expand All @@ -204,24 +209,26 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
case 'G':
case 'K':
case 'P':
if (Next_Letter == 'N') {
if (Read_Next_Letter == 'N') {
Phonize('N');
w_idx += 2;
}
break;
/* WH becomes W,
WR becomes R
W if followed by a vowel */
case 'W':
if (Next_Letter == 'R') {
Phonize(Next_Letter);
case 'W': {
char next_letter = Read_Next_Letter;
if (next_letter == 'R') {
Phonize('R');
w_idx += 2;
} else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
} else if (next_letter == 'H' || isvowel(next_letter)) {
Phonize('W');
w_idx += 2;
}
/* else ignore */
break;
}
/* X becomes S */
case 'X':
Phonize('S');
Expand All @@ -236,7 +243,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
case 'I':
case 'O':
case 'U':
Phonize(Curr_Letter);
Phonize(curr_letter);
w_idx++;
break;
default:
Expand All @@ -247,7 +254,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem


/* On to the metaphoning */
for (; Curr_Letter != '\0' &&
for (; (curr_letter = Read_Raw_Curr_Letter) != '\0' &&
(max_phonemes == 0 || Phone_Len < (size_t)max_phonemes);
w_idx++) {
/* How many letters to skip because an eariler encoding handled
Expand All @@ -263,18 +270,23 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
*/

/* Ignore non-alphas */
if (!isalpha(Curr_Letter))
if (!isalpha(curr_letter))
continue;

curr_letter = Convert_Raw(curr_letter);
/* Note: we can't cache curr_letter from the previous loop
* because of the skip_letter variable. */
char prev_letter = Read_Prev_Letter;

/* Drop duplicates, except CC */
if (Curr_Letter == Prev_Letter &&
Curr_Letter != 'C')
if (curr_letter == prev_letter &&
curr_letter != 'C')
continue;

switch (Curr_Letter) {
switch (curr_letter) {
/* B -> B unless in MB */
case 'B':
if (Prev_Letter != 'M')
if (prev_letter != 'M')
Phonize('B');
break;
/* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
Expand All @@ -283,20 +295,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
* dropped if -SCI-, SCE-, -SCY- (handed in S)
* else K
*/
case 'C':
if (MAKESOFT(Next_Letter)) { /* C[IEY] */
if (After_Next_Letter == 'A' &&
Next_Letter == 'I') { /* CIA */
case 'C': {
char next_letter = Read_Next_Letter;
if (MAKESOFT(next_letter)) { /* C[IEY] */
if (next_letter == 'I' && Read_After_Next_Letter == 'A') { /* CIA */
Phonize(SH);
}
/* SC[IEY] */
else if (Prev_Letter == 'S') {
else if (prev_letter == 'S') {
/* Dropped */
} else {
Phonize('S');
}
} else if (Next_Letter == 'H') {
if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
} else if (next_letter == 'H') {
if ((!traditional) && (prev_letter == 'S' || Read_After_Next_Letter == 'R')) { /* Christ, School */
Phonize('K');
} else {
Phonize(SH);
Expand All @@ -306,12 +318,13 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
Phonize('K');
}
break;
}
/* J if in -DGE-, -DGI- or -DGY-
* else T
*/
case 'D':
if (Next_Letter == 'G' &&
MAKESOFT(After_Next_Letter)) {
if (Read_Next_Letter == 'G' &&
MAKESOFT(Read_After_Next_Letter)) {
Phonize('J');
skip_letter++;
} else
Expand All @@ -323,47 +336,50 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
* else J if in -GE-, -GI, -GY and not GG
* else K
*/
case 'G':
if (Next_Letter == 'H') {
case 'G': {
char next_letter = Read_Next_Letter;
if (next_letter == 'H') {
if (!(NOGHTOF(Look_Back_Letter(3)) ||
Look_Back_Letter(4) == 'H')) {
Phonize('F');
skip_letter++;
} else {
/* silent */
}
} else if (Next_Letter == 'N') {
if (Isbreak(After_Next_Letter) ||
(After_Next_Letter == 'E' &&
} else if (next_letter == 'N') {
char after_next_letter = Read_After_Next_Letter;
if (Isbreak(after_next_letter) ||
(after_next_letter == 'E' &&
Look_Ahead_Letter(3) == 'D')) {
/* dropped */
} else
Phonize('K');
} else if (MAKESOFT(Next_Letter) &&
Prev_Letter != 'G') {
} else if (MAKESOFT(next_letter) &&
prev_letter != 'G') {
Phonize('J');
} else {
Phonize('K');
}
break;
}
/* H if before a vowel and not after C,G,P,S,T */
case 'H':
if (isvowel(Next_Letter) &&
!AFFECTH(Prev_Letter))
if (isvowel(Read_Next_Letter) &&
!AFFECTH(prev_letter))
Phonize('H');
break;
/* dropped if after C
* else K
*/
case 'K':
if (Prev_Letter != 'C')
if (prev_letter != 'C')
Phonize('K');
break;
/* F if before H
* else P
*/
case 'P':
if (Next_Letter == 'H') {
if (Read_Next_Letter == 'H') {
Phonize('F');
} else {
Phonize('P');
Expand All @@ -377,44 +393,50 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
/* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
* else S
*/
case 'S':
if (Next_Letter == 'I' &&
(After_Next_Letter == 'O' ||
After_Next_Letter == 'A')) {
case 'S': {
char next_letter = Read_Next_Letter;
char after_next_letter;
if (next_letter == 'I' &&
((after_next_letter = Read_After_Next_Letter) == 'O' ||
after_next_letter == 'A')) {
Phonize(SH);
} else if (Next_Letter == 'H') {
} else if (next_letter == 'H') {
Phonize(SH);
skip_letter++;
} else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
} else if ((!traditional) && (next_letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
Phonize(SH);
skip_letter += 2;
} else {
Phonize('S');
}
break;
}
/* 'sh' in -TIA- or -TIO-
* else 'th' before H
* else T
*/
case 'T':
if (Next_Letter == 'I' &&
(After_Next_Letter == 'O' ||
After_Next_Letter == 'A')) {
case 'T': {
char next_letter = Read_Next_Letter;
char after_next_letter;
if (next_letter == 'I' &&
((after_next_letter = Read_After_Next_Letter) == 'O' ||
after_next_letter == 'A')) {
Phonize(SH);
} else if (Next_Letter == 'H') {
} else if (next_letter == 'H') {
Phonize(TH);
skip_letter++;
} else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
} else if (!(next_letter == 'C' && Read_After_Next_Letter == 'H')) {
Phonize('T');
}
break;
}
/* F */
case 'V':
Phonize('F');
break;
/* W before a vowel, else dropped */
case 'W':
if (isvowel(Next_Letter))
if (isvowel(Read_Next_Letter))
Phonize('W');
break;
/* KS */
Expand All @@ -424,7 +446,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
break;
/* Y if followed by a vowel */
case 'Y':
if (isvowel(Next_Letter))
if (isvowel(Read_Next_Letter))
Phonize('Y');
break;
/* S */
Expand All @@ -438,7 +460,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem
case 'M':
case 'N':
case 'R':
Phonize(Curr_Letter);
Phonize(curr_letter);
break;
default:
/* nothing */
Expand Down