Perform Utf8ToUnicode transformation in-place

eduar-hte · eduar-hte · commit de3b2a0c4175 · 2024-08-19T10:52:56.000-07:00
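- Removed the `inplace` static helper function from the `Utf8ToUnicode`
  class, as it was only referenced by the implementation; its logic now
  lives in a file-local `encode` function in utf8_to_unicode.cc.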
diff --git a/src/actions/transformations/utf8_to_unicode.cc b/src/actions/transformations/utf8_to_unicode.cc
@@ -20,67 +20,33 @@
 #include "src/utils/string.h"
 
 
-namespace modsecurity::actions::transformations {
-
-
-bool Utf8ToUnicode::transform(std::string &value, const Transaction *trans) const {
-    std::string ret;
-    unsigned char *input;
-    int _changed = 0;
-    char *out;
-
-    input = reinterpret_cast<unsigned char *>
-        (malloc(sizeof(char) * value.length()+1));
+constexpr int UNICODE_ERROR_CHARACTERS_MISSING   = -1;
+constexpr int UNICODE_ERROR_INVALID_ENCODING     = -2;
 
-    if (input == NULL) {
-        return "";
-    }
-
-    memcpy(input, value.c_str(), value.length()+1);
 
-    out = inplace(input, value.size() + 1, &_changed);
-    free(input);
-    if (out != NULL) {
-        ret.assign(reinterpret_cast<char *>(out),
-            strlen(reinterpret_cast<char *>(out)));
-        free(out);
-    }
+namespace modsecurity::actions::transformations {
 
-    const auto changed = ret != value;
-    value = ret;
-    return changed;
-}
 
+static inline bool encode(std::string &value) {
+    auto input = reinterpret_cast<unsigned char*>(value.data());
+    const auto input_len = value.length();
 
-char *Utf8ToUnicode::inplace(unsigned char *input,
-    uint64_t input_len, int *changed) {
-    unsigned int count = 0;
-    char *data;
-    char *data_orig;
-    unsigned int i, len, j;
-    unsigned int bytes_left = input_len;
+    bool changed = false;
+    std::string::size_type count = 0;
+    auto bytes_left = input_len;
     unsigned char unicode[8];
-    *changed = 0;
 
     /* RFC3629 states that UTF-8 are encoded using sequences of 1 to 4 octets. */
     /* Max size per character should fit in 4 bytes */
-    len = input_len * 4 + 1;
-    data = reinterpret_cast<char *>(malloc(sizeof(char) * len));
-    if (data == NULL) {
-        return NULL;
-    }
-    data_orig = data;
+    const auto len = input_len * 4 + 1;
+    std::string ret(len, {});
+    auto data = ret.data();
 
-    if (input == NULL) {
-        free(data);
-        return NULL;
-    }
-
-    for (i = 0; i < bytes_left;)  {
+    for (std::string::size_type i = 0; i < bytes_left;)  {
         int unicode_len = 0;
         unsigned int d = 0;
         unsigned char c;
-        unsigned char *utf = (unsigned char *)&input[i];
+        auto utf = &input[i];
 
         c = *utf;
 
@@ -108,7 +74,7 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                 unicode_len = UNICODE_ERROR_INVALID_ENCODING;
             } else {
                 unicode_len = 2;
-                count+=6;
+                count += 6;
                 if (count <= len) {
                     int length = 0;
                     /* compute character number */
@@ -138,11 +104,11 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                             break;
                     }
 
-                    for (j = 0; j < length; j++) {
+                    for (std::string::size_type j = 0; j < length; j++) {
                         *data++ = unicode[j];
                     }
 
-                    *changed = 1;
+                    changed = true;
                 }
             }
         } else if ((c & 0xF0) == 0xE0) {
@@ -190,11 +156,11 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                             break;
                     }
 
-                    for (j = 0; j < length; j++) {
+                    for (std::string::size_type j = 0; j < length; j++) {
                         *data++ = unicode[j];
                     }
 
-                    *changed = 1;
+                    changed = true;
                 }
             }
         } else if ((c & 0xF8) == 0xF0) {
@@ -252,11 +218,11 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
                             break;
                     }
 
-                    for (j = 0; j < length; j++) {
+                    for (std::string::size_type j = 0; j < length; j++) {
                         *data++ = unicode[j];
                     }
 
-                    *changed = 1;
+                    changed = true;
                 }
             }
         } else {
@@ -300,7 +266,14 @@ char *Utf8ToUnicode::inplace(unsigned char *input,
 
     *data ='\0';
 
-    return data_orig;
+    ret.resize(data - ret.c_str());
+    std::swap(value, ret);
+    return changed;
+}
+
+
+bool Utf8ToUnicode::transform(std::string &value, const Transaction *trans) const {
+    return encode(value);
 }
 
 
diff --git a/src/actions/transformations/utf8_to_unicode.h b/src/actions/transformations/utf8_to_unicode.h
@@ -18,12 +18,6 @@
 
 #include "transformation.h"
 
-#define UNICODE_ERROR_CHARACTERS_MISSING    -1
-#define UNICODE_ERROR_INVALID_ENCODING      -2
-#define UNICODE_ERROR_OVERLONG_CHARACTER    -3
-#define UNICODE_ERROR_RESTRICTED_CHARACTER  -4
-#define UNICODE_ERROR_DECODING_ERROR        -5
-
 namespace modsecurity::actions::transformations {
 
 class Utf8ToUnicode : public Transformation {
@@ -32,9 +26,6 @@ class Utf8ToUnicode : public Transformation {
         : Transformation(action) { }
 
     bool transform(std::string &value, const Transaction *trans) const override;
-
-    static char *inplace(unsigned char *input, uint64_t input_len,
-        int *changed);
 };
 
 }  // namespace modsecurity::actions::transformations