Skip to content

Commit 4b8b4fe

Browse files
committed
address latest comments
1 parent b32b472 commit 4b8b4fe

File tree

3 files changed

+108
-38
lines changed

3 files changed

+108
-38
lines changed

llvm/include/llvm/Support/TextEncoding.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- TextEncodingConverter.h - Encoding conversion class -------*- C++ -*-=//
1+
//===-- TextEncoding.h - Text encoding conversion class -----------*- C++ -*-=//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,8 +12,8 @@
1212
///
1313
//===----------------------------------------------------------------------===//
1414

15-
#ifndef LLVM_SUPPORT_ENCODING_CONVERTER_H
16-
#define LLVM_SUPPORT_ENCODING_CONVERTER_H
15+
#ifndef LLVM_SUPPORT_TEXT_ENCODING_H
16+
#define LLVM_SUPPORT_TEXT_ENCODING_H
1717

1818
#include "llvm/ADT/SmallString.h"
1919
#include "llvm/ADT/StringRef.h"

llvm/lib/Support/TextEncoding.cpp

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- TextEncoding.cpp - Encoding conversion class --------------*- C++ -*-=//
1+
//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -82,7 +82,7 @@ enum ConversionType {
8282
// aforementioned encodings. The use of tables for conversion is only
8383
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
8484
// encodings are not supported.
85-
class TextEncodingConverterTable
85+
class TextEncodingConverterTable final
8686
: public details::TextEncodingConverterImplBase {
8787
const ConversionType ConvType;
8888

@@ -118,7 +118,8 @@ struct UConverterDeleter {
118118
};
119119
using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
120120

121-
class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
121+
class TextEncodingConverterICU final
122+
: public details::TextEncodingConverterImplBase {
122123
UConverterUniquePtr FromConvDesc;
123124
UConverterUniquePtr ToConvDesc;
124125

@@ -137,7 +138,8 @@ class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
137138
// TODO: The current implementation discards the partial result and restarts the
138139
// conversion from the beginning if there is a conversion error due to
139140
// insufficient buffer size. In the future, it would better to save the partial
140-
// result and redo the conversion for the remaining string.
141+
// result and resume the conversion for the remaining string.
142+
// TODO: Improve translation of ICU errors to error_code
141143
std::error_code
142144
TextEncodingConverterICU::convertString(StringRef Source,
143145
SmallVectorImpl<char> &Result) {
@@ -169,9 +171,12 @@ TextEncodingConverterICU::convertString(StringRef Source,
169171
/*pivotLimit=*/NULL, /*reset=*/true,
170172
/*flush=*/true, &EC);
171173
if (U_FAILURE(EC)) {
172-
if (EC == U_BUFFER_OVERFLOW_ERROR && Capacity < Result.max_size()) {
173-
HandleOverflow(Capacity, Output, OutputLength, Result);
174-
continue;
174+
if (EC == U_BUFFER_OVERFLOW_ERROR) {
175+
if (Capacity < Result.max_size()) {
176+
HandleOverflow(Capacity, Output, OutputLength, Result);
177+
continue;
178+
} else
179+
return std::error_code(E2BIG, std::generic_category());
175180
}
176181
// Some other error occured.
177182
Result.resize(Output - Result.data());
@@ -190,7 +195,7 @@ void TextEncodingConverterICU::reset() {
190195
}
191196

192197
#elif HAVE_ICONV
193-
class TextEncodingConverterIconv
198+
class TextEncodingConverterIconv final
194199
: public details::TextEncodingConverterImplBase {
195200
class UniqueIconvT {
196201
iconv_t ConvDesc;
@@ -230,7 +235,7 @@ class TextEncodingConverterIconv
230235
// TODO: The current implementation discards the partial result and restarts the
231236
// conversion from the beginning if there is a conversion error due to
232237
// insufficient buffer size. In the future, it would better to save the partial
233-
// result and redo the conversion for the remaining string.
238+
// result and resume the conversion for the remaining string.
234239
std::error_code
235240
TextEncodingConverterIconv::convertString(StringRef Source,
236241
SmallVectorImpl<char> &Result) {
@@ -249,7 +254,7 @@ TextEncodingConverterIconv::convertString(StringRef Source,
249254
if (errno == E2BIG && Capacity < Result.max_size()) {
250255
HandleOverflow(Capacity, Output, OutputLength, Result);
251256
// Reset converter
252-
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
257+
reset();
253258
return std::error_code();
254259
} else {
255260
// Some other error occured.
@@ -269,7 +274,7 @@ TextEncodingConverterIconv::convertString(StringRef Source,
269274
// Setup the input. Use nullptr to reset iconv state if input length is
270275
// zero.
271276
size_t InputLength = Source.size();
272-
char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
277+
char *Input = InputLength ? const_cast<char *>(Source.data()) : "";
273278
Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
274279
if (Ret != 0) {
275280
if (auto EC = HandleError(Ret))
@@ -291,7 +296,7 @@ TextEncodingConverterIconv::convertString(StringRef Source,
291296
return std::error_code();
292297
}
293298

294-
void TextEncodingConverterIconv::reset() {
299+
inline void TextEncodingConverterIconv::reset() {
295300
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
296301
}
297302

@@ -311,7 +316,7 @@ TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
311316
else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
312317
Conversion = IBM1047ToUTF8;
313318
else
314-
return std::error_code(errno, std::generic_category());
319+
return std::make_error_code(std::errc::invalid_argument);
315320

316321
return TextEncodingConverter(
317322
std::make_unique<TextEncodingConverterTable>(Conversion));
@@ -330,21 +335,20 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
330335
#if HAVE_ICU
331336
UErrorCode EC = U_ZERO_ERROR;
332337
UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
333-
if (U_FAILURE(EC)) {
334-
return std::error_code(errno, std::generic_category());
335-
}
338+
if (U_FAILURE(EC))
339+
return std::make_error_code(std::errc::invalid_argument);
340+
336341
UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
337-
if (U_FAILURE(EC)) {
338-
return std::error_code(errno, std::generic_category());
339-
}
340-
std::unique_ptr<details::TextEncodingConverterImplBase> Converter =
341-
std::make_unique<TextEncodingConverterICU>(std::move(FromConvDesc),
342-
std::move(ToConvDesc));
342+
if (U_FAILURE(EC))
343+
return std::make_error_code(std::errc::invalid_argument);
344+
345+
auto Converter = std::make_unique<TextEncodingConverterICU>(
346+
std::move(FromConvDesc), std::move(ToConvDesc));
343347
return TextEncodingConverter(std::move(Converter));
344348
#elif HAVE_ICONV
345349
iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
346350
if (ConvDesc == (iconv_t)-1)
347-
return std::error_code(errno, std::generic_category());
351+
return std::make_error_code(std::errc::invalid_argument);
348352
return TextEncodingConverter(
349353
std::make_unique<TextEncodingConverterIconv>(ConvDesc));
350354
#else

llvm/unittests/Support/TextEncodingTest.cpp

Lines changed: 78 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "llvm/Support/TextEncoding.h"
1010
#include "llvm/ADT/SmallString.h"
11+
#include "llvm/Config/config.h"
1112
#include "gtest/gtest.h"
1213
using namespace llvm;
1314

@@ -61,12 +62,8 @@ TEST(Encoding, FromUTF8) {
6162
ErrorOr<TextEncodingConverter> Conv =
6263
TextEncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
6364

64-
// Stop test if conversion is not supported.
65-
if (!Conv) {
66-
ASSERT_EQ(Conv.getError(),
67-
std::make_error_code(std::errc::invalid_argument));
68-
return;
69-
}
65+
// Converter should always exist between UTF-8 and IBM-1047
66+
EXPECT_TRUE(Conv);
7067

7168
std::error_code EC = Conv->convert(Src, Dst);
7269
EXPECT_TRUE(!EC);
@@ -101,12 +98,8 @@ TEST(Encoding, ToUTF8) {
10198
ErrorOr<TextEncodingConverter> Conv =
10299
TextEncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
103100

104-
// Stop test if conversion is not supported.
105-
if (!Conv) {
106-
ASSERT_EQ(Conv.getError(),
107-
std::make_error_code(std::errc::invalid_argument));
108-
return;
109-
}
101+
// Converter should always exist between UTF-8 and IBM-1047
102+
EXPECT_TRUE(Conv);
110103

111104
std::error_code EC = Conv->convert(Src, Dst);
112105

@@ -131,28 +124,45 @@ TEST(Encoding, ToUTF8) {
131124
TEST(Encoding, RoundTrip) {
132125
ErrorOr<TextEncodingConverter> ConvToUTF16 =
133126
TextEncodingConverter::create("IBM-1047", "UTF-16");
127+
128+
#ifndef HAVE_ICU
134129
// Stop test if conversion is not supported (no underlying iconv support).
135130
if (!ConvToUTF16) {
136131
ASSERT_EQ(ConvToUTF16.getError(),
137132
std::make_error_code(std::errc::invalid_argument));
138133
return;
139134
}
135+
#else
136+
EXPECT_TRUE(ConvToUTF16);
137+
#endif
138+
140139
ErrorOr<TextEncodingConverter> ConvToUTF32 =
141140
TextEncodingConverter::create("UTF-16", "UTF-32");
141+
142+
#ifndef HAVE_ICU
142143
// Stop test if conversion is not supported (no underlying iconv support).
143144
if (!ConvToUTF32) {
144145
ASSERT_EQ(ConvToUTF32.getError(),
145146
std::make_error_code(std::errc::invalid_argument));
146147
return;
147148
}
149+
#else
150+
EXPECT_TRUE(ConvToUTF32);
151+
#endif
152+
148153
ErrorOr<TextEncodingConverter> ConvToEBCDIC =
149154
TextEncodingConverter::create("UTF-32", "IBM-1047");
155+
156+
#ifndef HAVE_ICU
150157
// Stop test if conversion is not supported (no underlying iconv support).
151158
if (!ConvToEBCDIC) {
152159
ASSERT_EQ(ConvToEBCDIC.getError(),
153160
std::make_error_code(std::errc::invalid_argument));
154161
return;
155162
}
163+
#else
164+
EXPECT_TRUE(ConvToEBCDIC);
165+
#endif
156166

157167
// Setup source string.
158168
char SrcStr[256];
@@ -177,12 +187,17 @@ TEST(Encoding, ShiftState2022) {
177187

178188
ErrorOr<TextEncodingConverter> ConvTo2022 =
179189
TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
190+
191+
#ifndef HAVE_ICU
180192
// Stop test if conversion is not supported (no underlying iconv support).
181193
if (!ConvTo2022) {
182194
ASSERT_EQ(ConvTo2022.getError(),
183195
std::make_error_code(std::errc::invalid_argument));
184196
return;
185197
}
198+
#else
199+
EXPECT_TRUE(ConvTo2022);
200+
#endif
186201

187202
// Check that the string is properly converted.
188203
std::error_code EC = ConvTo2022->convert(Src, Dst);
@@ -197,31 +212,82 @@ TEST(Encoding, InvalidInput) {
197212

198213
ErrorOr<TextEncodingConverter> ConvTo2022 =
199214
TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
215+
216+
#ifndef HAVE_ICU
200217
// Stop test if conversion is not supported (no underlying iconv support).
201218
if (!ConvTo2022) {
202219
ASSERT_EQ(ConvTo2022.getError(),
203220
std::make_error_code(std::errc::invalid_argument));
204221
return;
205222
}
223+
#else
224+
EXPECT_TRUE(ConvTo2022);
225+
#endif
206226

207227
// Check that the string failed to convert.
208228
std::error_code EC = ConvTo2022->convert(Src, Dst);
209229
EXPECT_TRUE(EC);
210230
}
211231

232+
TEST(Encoding, InvalidOutput) {
233+
// Cyrillic in UTF-16
234+
ErrorOr<TextEncodingConverter> ConvToUTF16 =
235+
TextEncodingConverter::create("UTF-8", "UTF-16");
236+
237+
#ifndef HAVE_ICU
238+
// Stop test if conversion is not supported (no underlying iconv support).
239+
if (!ConvToUTF16) {
240+
ASSERT_EQ(ConvToUTF16.getError(),
241+
std::make_error_code(std::errc::invalid_argument));
242+
return;
243+
}
244+
#else
245+
EXPECT_TRUE(ConvToUTF16);
246+
#endif
247+
248+
ErrorOr<TextEncodingConverter> ConvToEBCDIC =
249+
TextEncodingConverter::create("UTF-16", "IBM-1047");
250+
251+
#ifndef HAVE_ICU
252+
// Stop test if conversion is not supported (no underlying iconv support).
253+
if (!ConvToEBCDIC) {
254+
ASSERT_EQ(ConvToEBCDIC.getError(),
255+
std::make_error_code(std::errc::invalid_argument));
256+
return;
257+
}
258+
#else
259+
EXPECT_TRUE(ConvToEBCDIC);
260+
#endif
261+
262+
// Cyrillic string. Convert to UTF-16 and check if properly converted
263+
StringRef Src(CyrillicUTF);
264+
SmallString<8> Dst, Dst1;
265+
std::error_code EC = ConvToUTF16->convert(Src, Dst);
266+
EXPECT_TRUE(!EC);
267+
268+
// Cyrillic string. Results in error because not representable in 1047.
269+
EC = ConvToEBCDIC->convert(Dst, Dst1);
270+
EXPECT_TRUE(EC);
271+
}
272+
212273
TEST(Encoding, ShiftStateIBM939) {
213274
// Earth string.
214275
StringRef Src(EarthUTF);
215276
SmallString<64> Dst;
216277

217278
ErrorOr<TextEncodingConverter> ConvToIBM939 =
218279
TextEncodingConverter::create("UTF-8", "IBM-939");
280+
281+
#ifndef HAVE_ICU
219282
// Stop test if conversion is not supported (no underlying iconv support).
220283
if (!ConvToIBM939) {
221284
ASSERT_EQ(ConvToIBM939.getError(),
222285
std::make_error_code(std::errc::invalid_argument));
223286
return;
224287
}
288+
#else
289+
EXPECT_TRUE(ConvToIBM939);
290+
#endif
225291

226292
// Check that the string is properly converted.
227293
std::error_code EC = ConvToIBM939->convert(Src, Dst);

0 commit comments

Comments
 (0)