Skip to content

Create an EncodingConverter class with both iconv and icu support. #138893

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,10 @@ else()
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
endif()

# Optional third-party integrations. Each entry is a string-typed cache
# variable (not a boolean option) because, per its help text, it accepts the
# three values ON, OFF and FORCE_ON. The text-encoding backends (ICU, iconv)
# default to OFF; the compression backends (zlib, zstd) default to ON.
set(LLVM_ENABLE_ICU OFF
  CACHE STRING
  "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")

set(LLVM_ENABLE_ICONV OFF
  CACHE STRING
  "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")

set(LLVM_ENABLE_ZLIB ON
  CACHE STRING
  "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")

set(LLVM_ENABLE_ZSTD ON
  CACHE STRING
  "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
Expand Down
35 changes: 35 additions & 0 deletions llvm/cmake/config-ix.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
endif()

# ICU and iconv are alternative providers of text encoding conversion, and the
# checks that follow select at most one of them. Forcing both on is therefore
# contradictory and is rejected up front.
# Both operands are quoted so the comparison is always against the literal
# string FORCE_ON and never against the value of a variable with that name.
if("${LLVM_ENABLE_ICU}" STREQUAL "FORCE_ON" AND
   "${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON")
  message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
endif()

# Check for ICU. Only allow an optional, dynamic link for ICU so we don't
# impact LLVM's licensing.
#
# The lookup is skipped when LLVM_ENABLE_ICU is a false value or when iconv has
# been forced on. On exit HAVE_ICU mirrors ICU_FOUND.
if(LLVM_ENABLE_ICU AND NOT ("${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON"))
  # Temporarily restrict library lookup to the shared-library suffix so that
  # only a dynamically linkable ICU can be picked up.
  # NOTE(review): on platforms where a shared library is linked through a file
  # with a different suffix (e.g. an import library) this filter may reject a
  # usable ICU -- confirm on such targets.
  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
  find_package(ICU COMPONENTS uc i18n)
  # Restore the caller's suffix list before anything else can observe it.
  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})

  # The lookup above is deliberately not REQUIRED: a REQUIRED lookup aborts by
  # itself, which would make this project-specific diagnostic unreachable.
  if("${LLVM_ENABLE_ICU}" STREQUAL "FORCE_ON" AND NOT ICU_FOUND)
    message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
  endif()

  set(HAVE_ICU ${ICU_FOUND})
endif()

# Check only for builtin iconv to avoid licensing issues.
#
# The lookup is skipped when LLVM_ENABLE_ICONV is a false value or when ICU has
# already been selected (HAVE_ICU). HAVE_ICONV is set to 1 only when iconv is
# found and FindIconv reports it as built in (Iconv_IS_BUILT_IN).
if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
  # A single, non-REQUIRED lookup serves both the ON and the FORCE_ON case; the
  # FORCE_ON failure is reported below so that "not found" and "found but not
  # built in" produce the same LLVM-specific diagnostic.
  find_package(Iconv)
  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
    set(HAVE_ICONV 1)
  elseif("${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON")
    message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
  endif()
endif()

# function checks
check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
find_package(Backtrace)
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/Config/config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,12 @@
/* Have host's ___chkstk_ms */
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}

/* Define to 1 if the ICU library is available, and to 0 otherwise. */
#cmakedefine01 HAVE_ICU

/* Define to 1 if the iconv library is available, and to 0 otherwise. */
#cmakedefine01 HAVE_ICONV

/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"

Expand Down
140 changes: 140 additions & 0 deletions llvm/include/llvm/Support/TextEncoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
//===-- TextEncoding.h - Text encoding conversion class -----------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file provides a utility class to convert between different character
/// set encodings.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_SUPPORT_TEXT_ENCODING_H
#define LLVM_SUPPORT_TEXT_ENCODING_H

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ErrorOr.h"

#include <memory>
#include <string>
#include <system_error>
#include <utility>

namespace llvm {

template <typename T> class SmallVectorImpl;

namespace details {
/// Abstract base class for converter implementations. Derived classes supply
/// the two private hooks; callers go through the public convert() wrapper.
class TextEncodingConverterImplBase {
private:
  /// Performs the actual conversion of \p Source.
  /// \param[in] Source source string
  /// \param[out] Result container for converted string
  /// \return error code in case something went wrong
  ///
  /// Among others, these error codes may be reported:
  /// - std::errc::argument_list_too_long: The result requires more than
  ///   std::numeric_limits<size_t>::max() bytes.
  /// - std::errc::illegal_byte_sequence: The input contains an invalid
  ///   multibyte sequence.
  /// - std::errc::invalid_argument: The input contains an incomplete
  ///   multibyte sequence.
  ///
  /// For a stateful destination encoding, the shift state is set to the
  /// initial state.
  ///
  /// When an error is reported, the result string holds the part of the input
  /// that was converted successfully.
  virtual std::error_code convertString(StringRef Source,
                                        SmallVectorImpl<char> &Result) = 0;

  /// Puts the converter back into its initial state.
  virtual void reset() = 0;

public:
  virtual ~TextEncodingConverterImplBase() = default;

  /// Converts \p Source into \p Result, then unconditionally resets the
  /// converter to its initial state, and returns the conversion's error code.
  std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
    const std::error_code Status = convertString(Source, Result);
    reset();
    return Status;
  }
};
} // namespace details

/// Character encodings that can be selected by enumerator, rather than by
/// name, when creating a TextEncodingConverter.
// Names inspired by https://wg21.link/p1885.
enum class TextEncoding {
/// UTF-8 character set encoding.
UTF8,

/// IBM EBCDIC 1047 character set encoding.
IBM1047
};

/// Utility class to convert between different character encodings.
///
/// Instances are move-only and can only be obtained from one of the create()
/// factory functions. A moved-from instance no longer owns a converter and
/// must not be used for conversion; it may only be destroyed or assigned to.
class TextEncodingConverter {
  /// The implementation that performs the conversion. Null only in a
  /// moved-from object.
  std::unique_ptr<details::TextEncodingConverterImplBase> Converter;

  /// Takes ownership of \p Converter. Private: use create() instead.
  TextEncodingConverter(
      std::unique_ptr<details::TextEncodingConverterImplBase> Converter)
      : Converter(std::move(Converter)) {}

public:
  /// Creates a TextEncodingConverter instance.
  /// Returns std::errc::invalid_argument in case the requested conversion is
  /// not supported.
  /// \param[in] From the source character encoding
  /// \param[in] To the target character encoding
  /// \return a TextEncodingConverter instance or an error code
  static ErrorOr<TextEncodingConverter> create(TextEncoding From,
                                               TextEncoding To);

  /// Creates a TextEncodingConverter instance.
  /// Returns std::errc::invalid_argument in case the requested conversion is
  /// not supported.
  /// \param[in] From name of the source character encoding
  /// \param[in] To name of the target character encoding
  /// \return a TextEncodingConverter instance or an error code
  static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To);

  TextEncodingConverter(const TextEncodingConverter &) = delete;
  TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;

  // Member-wise moves of the owning pointer are exactly what is needed, so the
  // compiler-generated operations are used; they are also non-throwing.
  TextEncodingConverter(TextEncodingConverter &&Other) noexcept = default;
  TextEncodingConverter &
  operator=(TextEncodingConverter &&Other) noexcept = default;

  ~TextEncodingConverter() = default;

  /// Converts a string.
  /// \param[in] Source source string
  /// \param[out] Result container for converted string
  /// \return error code in case something went wrong
  /// \pre This object has not been moved from.
  std::error_code convert(StringRef Source,
                          SmallVectorImpl<char> &Result) const {
    return Converter->convert(Source, Result);
  }

  /// Converts a string and returns the result by value.
  /// \param[in] Source source string
  /// \return the converted string, or an error code in case something went
  /// wrong (any partially converted output is discarded)
  /// \pre This object has not been moved from.
  ErrorOr<std::string> convert(StringRef Source) const {
    SmallString<100> Result;
    if (std::error_code EC = convert(Source, Result))
      return EC;
    return std::string(Result);
  }
};

} // namespace llvm

#endif
9 changes: 9 additions & 0 deletions llvm/lib/Support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ add_llvm_component_library(LLVMSupport
SuffixTree.cpp
SystemUtils.cpp
TarWriter.cpp
TextEncoding.cpp
ThreadPool.cpp
TimeProfiler.cpp
Timer.cpp
Expand Down Expand Up @@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
Demangle
)

# Link ICU library if it is an external library.
if(ICU_FOUND)
  # The ICU that was located may live outside the compiler's default search
  # paths, so its header directories are added alongside its libraries. They
  # are marked SYSTEM so third-party headers do not produce warnings in this
  # target, and PRIVATE because only LLVMSupport's own sources need them.
  target_include_directories(LLVMSupport
    SYSTEM PRIVATE
    ${ICU_INCLUDE_DIRS}
    )
  target_link_libraries(LLVMSupport
    PRIVATE
    ${ICU_LIBRARIES}
    )
endif()

set(llvm_system_libs ${system_libs})

# This block is only needed for llvm-config. When we deprecate llvm-config and
Expand Down
Loading