Skip to content

Commit 3192c7b

Browse files
committed
Create a CharSetConverter class with both iconv and icu support.
1 parent 9602216 commit 3192c7b

File tree

9 files changed

+774
-2
lines changed

9 files changed

+774
-2
lines changed

llvm/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,10 @@ else()
592592
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
593593
endif()
594594

595+
# Tri-state switch (ON, OFF, or FORCE_ON) selecting ICU as the backend for
# character conversion support. It defaults to OFF, so ICU is only looked for
# when the user asks for it.
set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")
596+
597+
# Tri-state switch (ON, OFF, or FORCE_ON) selecting iconv as the backend for
# character conversion support. It defaults to OFF, so iconv is only looked for
# when the user asks for it.
set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
598+
595599
set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
596600

597601
set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")

llvm/cmake/config-ix.cmake

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
294294
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
295295
endif()
296296

297+
# Forcing both conversion backends at once is contradictory, so reject that
# configuration before probing for either library. Both operands are quoted so
# the comparison is always against the literal text FORCE_ON, even if a
# variable with that name happens to exist.
if("${LLVM_ENABLE_ICU}" STREQUAL "FORCE_ON" AND
   "${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON")
  message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
endif()
300+
301+
# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
# The probe is skipped entirely when iconv has been forced.
if(LLVM_ENABLE_ICU AND NOT ("${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON"))
  # Narrow the library search to shared libraries for the duration of the
  # lookup; the previous suffix list is restored at the end of this block.
  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
  if("${LLVM_ENABLE_ICU}" STREQUAL "FORCE_ON")
    find_package(ICU REQUIRED COMPONENTS uc i18n)
    # Defensive check: REQUIRED is expected to abort on its own when ICU is
    # missing, but keep the explicit, LLVM-specific diagnostic.
    if(NOT ICU_FOUND)
      message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
    endif()
  else()
    find_package(ICU COMPONENTS uc i18n)
  endif()
  # HAVE_ICU is substituted into config.h through
  # "#cmakedefine HAVE_ICU ${HAVE_ICU}". Normalize it to 1/0 rather than
  # forwarding the find_package() result verbatim, so the generated macro has
  # a numeric value that also works with "#if HAVE_ICU".
  if(ICU_FOUND)
    set(HAVE_ICU 1)
  else()
    set(HAVE_ICU 0)
  endif()
  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
endif()
316+
317+
# Check for builtin iconv to avoid licensing issues.
# iconv is only considered when ICU was not selected above.
if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
  if("${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON")
    find_package(Iconv REQUIRED)
    # Finding some iconv is not enough when forced: it must also be reported
    # as built in, otherwise the configuration is rejected.
    if(NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
    endif()
  else()
    find_package(Iconv)
  endif()
  # Only a built-in iconv enables the feature; otherwise HAVE_ICONV is left
  # unset.
  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
    set(HAVE_ICONV 1)
  endif()
endif()
331+
297332
# function checks
298333
check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
299334
find_package(Backtrace)

llvm/include/llvm/Config/config.h.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,12 @@
236236
/* Have host's ___chkstk_ms */
237237
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
238238

239+
/* Define if the ICU library is available for character set conversion */
240+
#cmakedefine HAVE_ICU ${HAVE_ICU}
241+
242+
/* Define if the iconv library is available for character set conversion */
243+
#cmakedefine HAVE_ICONV ${HAVE_ICONV}
244+
239245
/* Linker version detected at compile time. */
240246
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
241247

llvm/include/llvm/Support/CharSet.h

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
///
9+
/// \file
10+
/// This file provides a utility class to convert between different character
11+
/// set encodings.
12+
///
13+
//===----------------------------------------------------------------------===//
14+
15+
#ifndef LLVM_SUPPORT_CHARSET_H
16+
#define LLVM_SUPPORT_CHARSET_H
17+
18+
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ErrorOr.h"

#include <memory>
#include <string>
#include <system_error>
#include <utility>
25+
26+
namespace llvm {
27+
28+
template <typename T> class SmallVectorImpl;
29+
30+
namespace details {
31+
class CharSetConverterImplBase {
32+
33+
private:
34+
/// Converts a string.
35+
/// \param[in] Source source string
36+
/// \param[out] Result container for converted string
37+
/// \return error code in case something went wrong
38+
///
39+
/// The following error codes can occur, among others:
40+
/// - std::errc::argument_list_too_long: The result requires more than
41+
/// std::numeric_limits<size_t>::max() bytes.
42+
/// - std::errc::illegal_byte_sequence: The input contains an invalid
43+
/// multibyte sequence.
44+
/// - std::errc::invalid_argument: The input contains an incomplete
45+
/// multibyte sequence.
46+
///
47+
/// If the destination charset is a stateful character set, the shift state
48+
/// will be set to the initial state.
49+
///
50+
/// In case of an error, the result string contains the successfully converted
51+
/// part of the input string.
52+
///
53+
virtual std::error_code convertString(StringRef Source,
54+
SmallVectorImpl<char> &Result) = 0;
55+
56+
/// Resets the converter to the initial state.
57+
virtual void reset() = 0;
58+
59+
public:
60+
virtual ~CharSetConverterImplBase() = default;
61+
62+
/// Converts a string and resets the converter to the initial state.
63+
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
64+
auto EC = convertString(Source, Result);
65+
reset();
66+
return EC;
67+
}
68+
};
69+
} // namespace details
70+
71+
// Names inspired by https://wg21.link/p1885.
72+
namespace text_encoding {
/// Identifiers of the character set encodings that can be requested by id
/// rather than by name.
enum class id {
  UTF8,    ///< UTF-8 character set encoding.
  IBM1047, ///< IBM EBCDIC 1047 character set encoding.
};
} // end namespace text_encoding
81+
82+
/// Utility class to convert between different character set encodings.
83+
class CharSetConverter {
84+
std::unique_ptr<details::CharSetConverterImplBase> Converter;
85+
86+
CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
87+
: Converter(std::move(Converter)) {}
88+
89+
public:
90+
/// Creates a CharSetConverter instance.
91+
/// Returns std::errc::invalid_argument in case the requested conversion is
92+
/// not supported.
93+
/// \param[in] CSFrom the source character encoding
94+
/// \param[in] CSTo the target character encoding
95+
/// \return a CharSetConverter instance or an error code
96+
static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
97+
text_encoding::id CSTo);
98+
99+
/// Creates a CharSetConverter instance.
100+
/// Returns std::errc::invalid_argument in case the requested conversion is
101+
/// not supported.
102+
/// \param[in] CPFrom name of the source character encoding
103+
/// \param[in] CPTo name of the target character encoding
104+
/// \return a CharSetConverter instance or an error code
105+
static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
106+
107+
CharSetConverter(const CharSetConverter &) = delete;
108+
CharSetConverter &operator=(const CharSetConverter &) = delete;
109+
110+
CharSetConverter(CharSetConverter &&Other)
111+
: Converter(std::move(Other.Converter)) {}
112+
113+
CharSetConverter &operator=(CharSetConverter &&Other) {
114+
if (this != &Other)
115+
Converter = std::move(Other.Converter);
116+
return *this;
117+
}
118+
119+
~CharSetConverter() = default;
120+
121+
/// Converts a string.
122+
/// \param[in] Source source string
123+
/// \param[out] Result container for converted string
124+
/// \return error code in case something went wrong
125+
std::error_code convert(StringRef Source,
126+
SmallVectorImpl<char> &Result) const {
127+
return Converter->convert(Source, Result);
128+
}
129+
130+
ErrorOr<std::string> convert(StringRef Source) const {
131+
SmallString<100> Result;
132+
auto EC = Converter->convert(Source, Result);
133+
if (!EC)
134+
return std::string(Result);
135+
return EC;
136+
}
137+
};
138+
139+
} // namespace llvm
140+
141+
#endif

llvm/lib/Support/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ add_llvm_component_library(LLVMSupport
162162
CachePruning.cpp
163163
Caching.cpp
164164
circular_raw_ostream.cpp
165+
CharSet.cpp
165166
Chrono.cpp
166167
COM.cpp
167168
CodeGenCoverage.cpp
@@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
316317
Demangle
317318
)
318319

320+
# Link ICU library if it is an external library.
if(ICU_FOUND)
  # Expose the ICU header location reported by find_package(ICU) as well;
  # linking alone is not enough when ICU lives outside the default include
  # path.
  # NOTE(review): if LLVMSupport's sources are compiled through a separate
  # object-library target in some configurations, that target needs the same
  # include directories - confirm against add_llvm_component_library.
  target_include_directories(LLVMSupport
    SYSTEM PRIVATE
    ${ICU_INCLUDE_DIRS}
  )
  target_link_libraries(LLVMSupport
    PRIVATE
    ${ICU_LIBRARIES}
  )
endif()
327+
319328
set(llvm_system_libs ${system_libs})
320329

321330
# This block is only needed for llvm-config. When we deprecate llvm-config and

0 commit comments

Comments
 (0)