diff --git a/ext/standard/metaphone.c b/ext/standard/metaphone.c index 2ba7a839c88e9..7726bba2f17e7 100644 --- a/ext/standard/metaphone.c +++ b/ext/standard/metaphone.c @@ -78,7 +78,8 @@ static const char _codes[26] = }; -#define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0) +/* Note: these macros require an uppercase letter input! */ +#define ENCODE(c) (isalpha(c) ? _codes[((c) - 'A')] : 0) #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */ @@ -101,16 +102,19 @@ static const char _codes[26] = /* I suppose I could have been using a character pointer instead of * accesssing the array directly... */ +#define Convert_Raw(c) toupper(c) /* Look at the next letter in the word */ -#define Next_Letter (toupper(word[w_idx+1])) +#define Read_Raw_Next_Letter (word[w_idx+1]) +#define Read_Next_Letter (Convert_Raw(Read_Raw_Next_Letter)) /* Look at the current letter in the word */ -#define Curr_Letter (toupper(word[w_idx])) +#define Read_Raw_Curr_Letter (word[w_idx]) +#define Read_Curr_Letter (Convert_Raw(Read_Raw_Curr_Letter)) /* Go N letters back. */ -#define Look_Back_Letter(n) (w_idx >= n ? toupper(word[w_idx-n]) : '\0') +#define Look_Back_Letter(n) (w_idx >= n ? Convert_Raw(word[w_idx-n]) : '\0') /* Previous letter. I dunno, should this return null on failure? */ -#define Prev_Letter (Look_Back_Letter(1)) +#define Read_Prev_Letter (Look_Back_Letter(1)) /* Look two letters down. It makes sure you don't walk off the string. */ -#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \ +#define Read_After_Next_Letter (Read_Raw_Next_Letter != '\0' ? Convert_Raw(word[w_idx+2]) \ : '\0') #define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n))) @@ -119,15 +123,13 @@ static const char _codes[26] = /* I probably could have just used strlen... */ static char Lookahead(char *word, int how_far) { - char letter_ahead = '\0'; /* null by default */ int idx; for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); /* Edge forward in the string... */ - letter_ahead = word[idx]; /* idx will be either == to how_far or - * at the end of the string + return word[idx]; /* idx will be either == to how_far or + * at the end of the string where it will be null */ - return letter_ahead; } @@ -164,6 +166,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem int w_idx = 0; /* point in the phonization we're at. */ size_t p_idx = 0; /* end of the phoned phrase */ size_t max_buffer_len = 0; /* maximum length of the destination buffer */ + char curr_letter; ZEND_ASSERT(word != NULL); ZEND_ASSERT(max_phonemes >= 0); @@ -179,18 +182,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /*-- The first phoneme has to be processed specially. --*/ /* Find our first letter */ - for (; !isalpha(Curr_Letter); w_idx++) { + for (; !isalpha(curr_letter = Read_Raw_Curr_Letter); w_idx++) { /* On the off chance we were given nothing but crap... */ - if (Curr_Letter == '\0') { + if (curr_letter == '\0') { End_Phoned_Word(); return; } } - switch (Curr_Letter) { + curr_letter = Convert_Raw(curr_letter); + + switch (curr_letter) { /* AE becomes E */ case 'A': - if (Next_Letter == 'E') { + if (Read_Next_Letter == 'E') { Phonize('E'); w_idx += 2; } @@ -204,7 +209,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'G': case 'K': case 'P': - if (Next_Letter == 'N') { + if (Read_Next_Letter == 'N') { Phonize('N'); w_idx += 2; } @@ -212,16 +217,18 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* WH becomes W, WR becomes R W if followed by a vowel */ - case 'W': - if (Next_Letter == 'R') { - Phonize(Next_Letter); + case 'W': { + char next_letter = Read_Next_Letter; + if (next_letter == 'R') { + Phonize('R'); w_idx += 2; - } else if (Next_Letter == 'H' || isvowel(Next_Letter)) { + } else if (next_letter == 'H' || isvowel(next_letter)) { Phonize('W'); w_idx += 2; } /* else ignore */ break; + } /* X becomes S */ case 'X': Phonize('S'); @@ -236,7 +243,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'I': case 'O': case 'U': - Phonize(Curr_Letter); + Phonize(curr_letter); w_idx++; break; default: @@ -247,7 +254,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* On to the metaphoning */ - for (; Curr_Letter != '\0' && + for (; (curr_letter = Read_Raw_Curr_Letter) != '\0' && (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes); w_idx++) { /* How many letters to skip because an eariler encoding handled @@ -263,18 +270,23 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem */ /* Ignore non-alphas */ - if (!isalpha(Curr_Letter)) + if (!isalpha(curr_letter)) continue; + curr_letter = Convert_Raw(curr_letter); + /* Note: we can't cache curr_letter from the previous loop + * because of the skip_letter variable. */ + char prev_letter = Read_Prev_Letter; + /* Drop duplicates, except CC */ - if (Curr_Letter == Prev_Letter && - Curr_Letter != 'C') + if (curr_letter == prev_letter && + curr_letter != 'C') continue; - switch (Curr_Letter) { + switch (curr_letter) { /* B -> B unless in MB */ case 'B': - if (Prev_Letter != 'M') + if (prev_letter != 'M') Phonize('B'); break; /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. @@ -283,20 +295,20 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem * dropped if -SCI-, SCE-, -SCY- (handed in S) * else K */ - case 'C': - if (MAKESOFT(Next_Letter)) { /* C[IEY] */ - if (After_Next_Letter == 'A' && - Next_Letter == 'I') { /* CIA */ + case 'C': { + char next_letter = Read_Next_Letter; + if (MAKESOFT(next_letter)) { /* C[IEY] */ + if (next_letter == 'I' && Read_After_Next_Letter == 'A') { /* CIA */ Phonize(SH); } /* SC[IEY] */ - else if (Prev_Letter == 'S') { + else if (prev_letter == 'S') { /* Dropped */ } else { Phonize('S'); } - } else if (Next_Letter == 'H') { - if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ + } else if (next_letter == 'H') { + if ((!traditional) && (prev_letter == 'S' || Read_After_Next_Letter == 'R')) { /* Christ, School */ Phonize('K'); } else { Phonize(SH); @@ -306,12 +318,13 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem Phonize('K'); } break; + } /* J if in -DGE-, -DGI- or -DGY- * else T */ case 'D': - if (Next_Letter == 'G' && - MAKESOFT(After_Next_Letter)) { + if (Read_Next_Letter == 'G' && + MAKESOFT(Read_After_Next_Letter)) { Phonize('J'); skip_letter++; } else @@ -323,8 +336,9 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem * else J if in -GE-, -GI, -GY and not GG * else K */ - case 'G': - if (Next_Letter == 'H') { + case 'G': { + char next_letter = Read_Next_Letter; + if (next_letter == 'H') { if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) { Phonize('F'); @@ -332,38 +346,40 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem } else { /* silent */ } - } else if (Next_Letter == 'N') { - if (Isbreak(After_Next_Letter) || - (After_Next_Letter == 'E' && + } else if (next_letter == 'N') { + char after_next_letter = Read_After_Next_Letter; + if (Isbreak(after_next_letter) || + (after_next_letter == 'E' && Look_Ahead_Letter(3) == 'D')) { /* dropped */ } else Phonize('K'); - } else if (MAKESOFT(Next_Letter) && - Prev_Letter != 'G') { + } else if (MAKESOFT(next_letter) && + prev_letter != 'G') { Phonize('J'); } else { Phonize('K'); } break; + } /* H if before a vowel and not after C,G,P,S,T */ case 'H': - if (isvowel(Next_Letter) && - !AFFECTH(Prev_Letter)) + if (isvowel(Read_Next_Letter) && + !AFFECTH(prev_letter)) Phonize('H'); break; /* dropped if after C * else K */ case 'K': - if (Prev_Letter != 'C') + if (prev_letter != 'C') Phonize('K'); break; /* F if before H * else P */ case 'P': - if (Next_Letter == 'H') { + if (Read_Next_Letter == 'H') { Phonize('F'); } else { Phonize('P'); @@ -377,44 +393,50 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- * else S */ - case 'S': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || - After_Next_Letter == 'A')) { + case 'S': { + char next_letter = Read_Next_Letter; + char after_next_letter; + if (next_letter == 'I' && + ((after_next_letter = Read_After_Next_Letter) == 'O' || + after_next_letter == 'A')) { Phonize(SH); - } else if (Next_Letter == 'H') { + } else if (next_letter == 'H') { Phonize(SH); skip_letter++; - } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { + } else if ((!traditional) && (next_letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { Phonize(SH); skip_letter += 2; } else { Phonize('S'); } break; + } /* 'sh' in -TIA- or -TIO- * else 'th' before H * else T */ - case 'T': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || - After_Next_Letter == 'A')) { + case 'T': { + char next_letter = Read_Next_Letter; + char after_next_letter; + if (next_letter == 'I' && + ((after_next_letter = Read_After_Next_Letter) == 'O' || + after_next_letter == 'A')) { Phonize(SH); - } else if (Next_Letter == 'H') { + } else if (next_letter == 'H') { Phonize(TH); skip_letter++; - } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) { + } else if (!(next_letter == 'C' && Read_After_Next_Letter == 'H')) { Phonize('T'); } break; + } /* F */ case 'V': Phonize('F'); break; /* W before a vowel, else dropped */ case 'W': - if (isvowel(Next_Letter)) + if (isvowel(Read_Next_Letter)) Phonize('W'); break; /* KS */ @@ -424,7 +446,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem break; /* Y if followed by a vowel */ case 'Y': - if (isvowel(Next_Letter)) + if (isvowel(Read_Next_Letter)) Phonize('Y'); break; /* S */ @@ -438,7 +460,7 @@ static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonem case 'M': case 'N': case 'R': - Phonize(Curr_Letter); + Phonize(curr_letter); break; default: /* nothing */