@@ -917,4 +917,185 @@ PHP_FUNCTION(grapheme_str_split)
917
917
ubrk_close (bi );
918
918
}
919
919
920
+ PHP_FUNCTION (grapheme_levenshtein )
921
+ {
922
+ zend_string * string1 , * string2 ;
923
+ zend_long cost_ins = 1 ;
924
+ zend_long cost_rep = 1 ;
925
+ zend_long cost_del = 1 ;
926
+
927
+ ZEND_PARSE_PARAMETERS_START (2 , 5 )
928
+ Z_PARAM_STR (string1 )
929
+ Z_PARAM_STR (string2 )
930
+ Z_PARAM_OPTIONAL
931
+ Z_PARAM_LONG (cost_ins )
932
+ Z_PARAM_LONG (cost_rep )
933
+ Z_PARAM_LONG (cost_del )
934
+ ZEND_PARSE_PARAMETERS_END ();
935
+
936
+ if (cost_ins <= 0 || cost_ins > UINT_MAX / 4 ) {
937
+ zend_argument_value_error (3 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
938
+ RETURN_THROWS ();
939
+ }
940
+
941
+ if (cost_rep <= 0 || cost_rep > UINT_MAX / 4 ) {
942
+ zend_argument_value_error (4 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
943
+ RETURN_THROWS ();
944
+ }
945
+
946
+ if (cost_del <= 0 || cost_del > UINT_MAX / 4 ) {
947
+ zend_argument_value_error (5 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
948
+ RETURN_THROWS ();
949
+ }
950
+
951
+ zend_long * p1 , * p2 , * tmp ;
952
+ zend_long c0 , c1 , c2 ;
953
+ zend_long retval ;
954
+ size_t i2 ;
955
+ char * pstr1 , * pstr2 ;
956
+
957
+ UChar * ustring1 = NULL ;
958
+ UChar * ustring2 = NULL ;
959
+
960
+ int32_t ustring1_len = 0 ;
961
+ int32_t ustring2_len = 0 ;
962
+
963
+ UErrorCode ustatus1 = U_ZERO_ERROR ;
964
+ UErrorCode ustatus2 = U_ZERO_ERROR ;
965
+
966
+ /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
967
+ * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
968
+ * by having shorter rows (p1 & p2). */
969
+ if (ZSTR_LEN (string1 ) < ZSTR_LEN (string2 ) && cost_ins == cost_rep && cost_rep == cost_del ) {
970
+ zend_string * tmp = string1 ;
971
+ string1 = string2 ;
972
+ string2 = tmp ;
973
+ }
974
+
975
+ pstr1 = ZSTR_VAL (string1 );
976
+ pstr2 = ZSTR_VAL (string2 );
977
+
978
+ intl_convert_utf8_to_utf16 (& ustring1 , & ustring1_len , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
979
+
980
+ if ( U_FAILURE ( ustatus1 ) ) {
981
+ /* Set global error code. */
982
+ intl_error_set_code ( NULL , ustatus1 );
983
+
984
+ /* Set error messages. */
985
+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
986
+ if (ustring1 ) {
987
+ efree ( ustring1 );
988
+ }
989
+ RETURN_FALSE ;
990
+ }
991
+
992
+ intl_convert_utf8_to_utf16 (& ustring2 , & ustring2_len , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
993
+
994
+ if ( U_FAILURE ( ustatus2 ) ) {
995
+ /* Set global error code. */
996
+ intl_error_set_code ( NULL , ustatus2 );
997
+
998
+ /* Set error messages. */
999
+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
1000
+ if (ustring2 ) {
1001
+ efree ( ustring2 );
1002
+ }
1003
+ if (ustring1 ) {
1004
+ efree ( ustring1 );
1005
+ }
1006
+ RETURN_FALSE ;
1007
+ }
1008
+
1009
+ UText * ut1 = NULL ;
1010
+ UText * ut2 = NULL ;
1011
+ UBreakIterator * bi1 , * bi2 ;
1012
+
1013
+ int32_t strlen_1 , strlen_2 ;
1014
+ strlen_1 = grapheme_split_string (ustring1 , ustring1_len , NULL , 0 );
1015
+ strlen_2 = grapheme_split_string (ustring2 , ustring2_len , NULL , 0 );
1016
+
1017
+ if (strlen_1 == 0 ) {
1018
+ efree (ustring1 );
1019
+ efree (ustring2 );
1020
+ RETURN_LONG (strlen_2 * cost_ins );
1021
+ }
1022
+ if (strlen_2 == 0 ) {
1023
+ efree (ustring1 );
1024
+ efree (ustring2 );
1025
+ RETURN_LONG (strlen_1 * cost_del );
1026
+ }
1027
+
1028
+ unsigned char u_break_iterator_buffer1 [U_BRK_SAFECLONE_BUFFERSIZE ];
1029
+ unsigned char u_break_iterator_buffer2 [U_BRK_SAFECLONE_BUFFERSIZE ];
1030
+ bi1 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer1 , & ustatus1 );
1031
+ bi2 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer2 , & ustatus2 );
1032
+
1033
+ ut1 = utext_openUTF8 (ut1 , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
1034
+ ubrk_setUText (bi1 , ut1 , & ustatus1 );
1035
+ ut2 = utext_openUTF8 (ut2 , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
1036
+ ubrk_setUText (bi2 , ut2 , & ustatus2 );
1037
+
1038
+ p1 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1039
+ p2 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1040
+
1041
+ for (i2 = 0 ; i2 <= strlen_2 ; i2 ++ ) {
1042
+ p1 [i2 ] = i2 * cost_ins ;
1043
+ }
1044
+
1045
+ int32_t current1 = 0 ;
1046
+ int32_t current2 = 0 ;
1047
+ int32_t pos1 = 0 ;
1048
+ int32_t pos2 = 0 ;
1049
+ int32_t usrch_pos = 0 ;
1050
+ for ( ; pos1 != UBRK_DONE ; ) {
1051
+ current1 = ubrk_current (bi1 );
1052
+ pos1 = ubrk_next (bi1 );
1053
+ if (pos1 == UBRK_DONE ) {
1054
+ break ;
1055
+ }
1056
+ p2 [0 ] = p1 [0 ] + cost_del ;
1057
+ for (i2 = 0 , pos2 = 0 ; pos2 != UBRK_DONE ; i2 ++ ) {
1058
+ current2 = ubrk_current (bi2 );
1059
+ pos2 = ubrk_next (bi2 );
1060
+ if (pos2 == UBRK_DONE ) {
1061
+ break ;
1062
+ }
1063
+ usrch_pos = grapheme_strpos_utf16 (pstr1 + current1 , pos1 - current1 , pstr2 + current2 , pos2 - current2 , 0 , NULL , 0 , 0 );
1064
+ if (usrch_pos == 0 ) {
1065
+ c0 = p1 [i2 ];
1066
+ } else {
1067
+ c0 = p1 [i2 ] + cost_rep ;
1068
+ }
1069
+ c1 = p1 [i2 + 1 ] + cost_del ;
1070
+ if (c1 < c0 ) {
1071
+ c0 = c1 ;
1072
+ }
1073
+ c2 = p2 [i2 ] + cost_ins ;
1074
+ if (c2 < c0 ) {
1075
+ c0 = c2 ;
1076
+ }
1077
+ p2 [i2 + 1 ] = c0 ;
1078
+ }
1079
+ ubrk_first (bi2 );
1080
+ tmp = p1 ;
1081
+ p1 = p2 ;
1082
+ p2 = tmp ;
1083
+ }
1084
+
1085
+ utext_close (ut1 );
1086
+ utext_close (ut2 );
1087
+
1088
+ ubrk_close (bi1 );
1089
+ ubrk_close (bi2 );
1090
+
1091
+ efree (ustring1 );
1092
+ efree (ustring2 );
1093
+
1094
+ retval = p1 [strlen_2 ];
1095
+
1096
+ efree (p1 );
1097
+ efree (p2 );
1098
+ RETURN_LONG (retval );
1099
+ }
1100
+
920
1101
/* }}} */
0 commit comments