@@ -918,4 +918,185 @@ PHP_FUNCTION(grapheme_str_split)
918
918
ubrk_close (bi );
919
919
}
920
920
921
+ PHP_FUNCTION (grapheme_levenshtein )
922
+ {
923
+ zend_string * string1 , * string2 ;
924
+ zend_long cost_ins = 1 ;
925
+ zend_long cost_rep = 1 ;
926
+ zend_long cost_del = 1 ;
927
+
928
+ ZEND_PARSE_PARAMETERS_START (2 , 5 )
929
+ Z_PARAM_STR (string1 )
930
+ Z_PARAM_STR (string2 )
931
+ Z_PARAM_OPTIONAL
932
+ Z_PARAM_LONG (cost_ins )
933
+ Z_PARAM_LONG (cost_rep )
934
+ Z_PARAM_LONG (cost_del )
935
+ ZEND_PARSE_PARAMETERS_END ();
936
+
937
+ if (cost_ins <= 0 || cost_ins > UINT_MAX / 4 ) {
938
+ zend_argument_value_error (3 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
939
+ RETURN_THROWS ();
940
+ }
941
+
942
+ if (cost_rep <= 0 || cost_rep > UINT_MAX / 4 ) {
943
+ zend_argument_value_error (4 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
944
+ RETURN_THROWS ();
945
+ }
946
+
947
+ if (cost_del <= 0 || cost_del > UINT_MAX / 4 ) {
948
+ zend_argument_value_error (5 , "must be greater than 0 and less than or equal to %d" , UINT_MAX / 4 );
949
+ RETURN_THROWS ();
950
+ }
951
+
952
+ zend_long * p1 , * p2 , * tmp ;
953
+ zend_long c0 , c1 , c2 ;
954
+ zend_long retval ;
955
+ size_t i2 ;
956
+ char * pstr1 , * pstr2 ;
957
+
958
+ UChar * ustring1 = NULL ;
959
+ UChar * ustring2 = NULL ;
960
+
961
+ int32_t ustring1_len = 0 ;
962
+ int32_t ustring2_len = 0 ;
963
+
964
+ UErrorCode ustatus1 = U_ZERO_ERROR ;
965
+ UErrorCode ustatus2 = U_ZERO_ERROR ;
966
+
967
+ /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
968
+ * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
969
+ * by having shorter rows (p1 & p2). */
970
+ if (ZSTR_LEN (string1 ) < ZSTR_LEN (string2 ) && cost_ins == cost_rep && cost_rep == cost_del ) {
971
+ zend_string * tmp = string1 ;
972
+ string1 = string2 ;
973
+ string2 = tmp ;
974
+ }
975
+
976
+ pstr1 = ZSTR_VAL (string1 );
977
+ pstr2 = ZSTR_VAL (string2 );
978
+
979
+ intl_convert_utf8_to_utf16 (& ustring1 , & ustring1_len , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
980
+
981
+ if ( U_FAILURE ( ustatus1 ) ) {
982
+ /* Set global error code. */
983
+ intl_error_set_code ( NULL , ustatus1 );
984
+
985
+ /* Set error messages. */
986
+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
987
+ if (ustring1 ) {
988
+ efree ( ustring1 );
989
+ }
990
+ RETURN_FALSE ;
991
+ }
992
+
993
+ intl_convert_utf8_to_utf16 (& ustring2 , & ustring2_len , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
994
+
995
+ if ( U_FAILURE ( ustatus2 ) ) {
996
+ /* Set global error code. */
997
+ intl_error_set_code ( NULL , ustatus2 );
998
+
999
+ /* Set error messages. */
1000
+ intl_error_set_custom_msg ( NULL , "Error converting input string to UTF-16" , 0 );
1001
+ if (ustring2 ) {
1002
+ efree ( ustring2 );
1003
+ }
1004
+ if (ustring1 ) {
1005
+ efree ( ustring1 );
1006
+ }
1007
+ RETURN_FALSE ;
1008
+ }
1009
+
1010
+ UText * ut1 = NULL ;
1011
+ UText * ut2 = NULL ;
1012
+ UBreakIterator * bi1 , * bi2 ;
1013
+
1014
+ int32_t strlen_1 , strlen_2 ;
1015
+ strlen_1 = grapheme_split_string (ustring1 , ustring1_len , NULL , 0 );
1016
+ strlen_2 = grapheme_split_string (ustring2 , ustring2_len , NULL , 0 );
1017
+
1018
+ if (strlen_1 == 0 ) {
1019
+ efree (ustring1 );
1020
+ efree (ustring2 );
1021
+ RETURN_LONG (strlen_2 * cost_ins );
1022
+ }
1023
+ if (strlen_2 == 0 ) {
1024
+ efree (ustring1 );
1025
+ efree (ustring2 );
1026
+ RETURN_LONG (strlen_1 * cost_del );
1027
+ }
1028
+
1029
+ unsigned char u_break_iterator_buffer1 [U_BRK_SAFECLONE_BUFFERSIZE ];
1030
+ unsigned char u_break_iterator_buffer2 [U_BRK_SAFECLONE_BUFFERSIZE ];
1031
+ bi1 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer1 , & ustatus1 );
1032
+ bi2 = grapheme_get_break_iterator ((void * )u_break_iterator_buffer2 , & ustatus2 );
1033
+
1034
+ ut1 = utext_openUTF8 (ut1 , pstr1 , ZSTR_LEN (string1 ), & ustatus1 );
1035
+ ubrk_setUText (bi1 , ut1 , & ustatus1 );
1036
+ ut2 = utext_openUTF8 (ut2 , pstr2 , ZSTR_LEN (string2 ), & ustatus2 );
1037
+ ubrk_setUText (bi2 , ut2 , & ustatus2 );
1038
+
1039
+ p1 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1040
+ p2 = safe_emalloc (strlen_2 + 1 , sizeof (zend_long ), 0 );
1041
+
1042
+ for (i2 = 0 ; i2 <= strlen_2 ; i2 ++ ) {
1043
+ p1 [i2 ] = i2 * cost_ins ;
1044
+ }
1045
+
1046
+ int32_t current1 = 0 ;
1047
+ int32_t current2 = 0 ;
1048
+ int32_t pos1 = 0 ;
1049
+ int32_t pos2 = 0 ;
1050
+ int32_t usrch_pos = 0 ;
1051
+ for ( ; pos1 != UBRK_DONE ; ) {
1052
+ current1 = ubrk_current (bi1 );
1053
+ pos1 = ubrk_next (bi1 );
1054
+ if (pos1 == UBRK_DONE ) {
1055
+ break ;
1056
+ }
1057
+ p2 [0 ] = p1 [0 ] + cost_del ;
1058
+ for (i2 = 0 , pos2 = 0 ; pos2 != UBRK_DONE ; i2 ++ ) {
1059
+ current2 = ubrk_current (bi2 );
1060
+ pos2 = ubrk_next (bi2 );
1061
+ if (pos2 == UBRK_DONE ) {
1062
+ break ;
1063
+ }
1064
+ usrch_pos = grapheme_strpos_utf16 (pstr1 + current1 , pos1 - current1 , pstr2 + current2 , pos2 - current2 , 0 , NULL , 0 , 0 );
1065
+ if (usrch_pos == 0 ) {
1066
+ c0 = p1 [i2 ];
1067
+ } else {
1068
+ c0 = p1 [i2 ] + cost_rep ;
1069
+ }
1070
+ c1 = p1 [i2 + 1 ] + cost_del ;
1071
+ if (c1 < c0 ) {
1072
+ c0 = c1 ;
1073
+ }
1074
+ c2 = p2 [i2 ] + cost_ins ;
1075
+ if (c2 < c0 ) {
1076
+ c0 = c2 ;
1077
+ }
1078
+ p2 [i2 + 1 ] = c0 ;
1079
+ }
1080
+ ubrk_first (bi2 );
1081
+ tmp = p1 ;
1082
+ p1 = p2 ;
1083
+ p2 = tmp ;
1084
+ }
1085
+
1086
+ utext_close (ut1 );
1087
+ utext_close (ut2 );
1088
+
1089
+ ubrk_close (bi1 );
1090
+ ubrk_close (bi2 );
1091
+
1092
+ efree (ustring1 );
1093
+ efree (ustring2 );
1094
+
1095
+ retval = p1 [strlen_2 ];
1096
+
1097
+ efree (p1 );
1098
+ efree (p2 );
1099
+ RETURN_LONG (retval );
1100
+ }
1101
+
921
1102
/* }}} */
0 commit comments