Skip to content

Commit bd3dd4d

Browse files
committed
This patch enables the -fexec-charset option to control the execution charset of string literals. It also establishes defaults for the internal charset, system charset, and execution charset: z/OS-specific defaults on z/OS, and UTF-8 on all other platforms.
(cherry picked from commit 0295d0d) (cherry picked from commit e379f6cb9d063cb78c6b48b0e0a8d9f241958f89)
1 parent f99e76b commit bd3dd4d

File tree

20 files changed

+375
-48
lines changed

20 files changed

+375
-48
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,7 @@ Builtin Macros
416416
``__clang_literal_encoding__``
417417
Defined to a narrow string literal that represents the current encoding of
418418
narrow string literals, e.g., ``"hello"``. This macro typically expands to
419-
"UTF-8" (but may change in the future if the
420-
``-fexec-charset="Encoding-Name"`` option is implemented.)
419+
the text encoding given by ``-fexec-charset``, or to the system charset if that option is not specified.
421420

422421
``__clang_wide_literal_encoding__``
423422
Defined to a narrow string literal that represents the current encoding of

clang/include/clang/Basic/LangOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,9 @@ class LangOptions : public LangOptionsBase {
633633
bool AtomicFineGrainedMemory = false;
634634
bool AtomicIgnoreDenormalMode = false;
635635

636+
/// Name of the execution charset that literals are converted to from the
/// internal charset (set by -fexec-charset).
637+
std::string ExecCharset;
638+
636639
LangOptions();
637640

638641
/// Set language defaults for the given input language and

clang/include/clang/Basic/TokenKinds.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,13 @@ inline bool isLiteral(TokenKind K) {
101101
isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
102102
}
103103

104+
/// Return true if this is a utf literal kind.
105+
inline bool isUTFLiteral(TokenKind K) {
106+
return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
107+
K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
108+
K == tok::utf32_char_constant || K == tok::utf32_string_literal;
109+
}
110+
104111
/// Return true if this is any of tok::annot_* kinds.
105112
bool isAnnotation(TokenKind K);
106113

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7245,6 +7245,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
72457245
def tune_cpu : Separate<["-"], "tune-cpu">,
72467246
HelpText<"Tune for a specific cpu type">,
72477247
MarshallingInfoString<TargetOpts<"TuneCPU">>;
7248+
// Separate-argument option (`-fexec-charset <charset>`); its value is
// marshalled into the LangOptions field named "ExecCharset".
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
7249+
HelpText<"Set the execution <charset> for string and character literals. "
7250+
"Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
7251+
"and those supported by the host icu or iconv library.">,
7252+
MarshallingInfoString<LangOpts<"ExecCharset">>;
72487253
def target_cpu : Separate<["-"], "target-cpu">,
72497254
HelpText<"Target a specific cpu type">,
72507255
MarshallingInfoString<TargetOpts<"CPU">>;
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
10+
#define LLVM_CLANG_LEX_LITERALCONVERTER_H
11+
12+
#include "clang/Basic/Diagnostic.h"
13+
#include "clang/Basic/LangOptions.h"
14+
#include "clang/Basic/TargetInfo.h"
15+
#include "llvm/ADT/StringMap.h"
16+
#include "llvm/ADT/StringRef.h"
17+
#include "llvm/Support/TextEncoding.h"
18+
19+
/// Selects which charset LiteralConverter::getConverter(ConversionAction)
/// looks up a converter for: ToSystemCharset selects the system charset,
/// ToExecCharset selects the execution charset, and NoConversion selects the
/// internal charset.
enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
20+
21+
/// Holds the names of the internal, system, and execution charsets together
/// with a cache of text-encoding converters keyed by target charset name.
class LiteralConverter {
22+
/// Charset that converters are created from; setConvertersFromOptions sets
/// it to "UTF-8".
llvm::StringRef InternalCharset;
23+
/// Charset reported by the target triple's getSystemCharset().
llvm::StringRef SystemCharset;
24+
/// LangOptions::ExecCharset, or the internal charset when that is empty.
llvm::StringRef ExecCharset;
25+
/// Converters created so far, keyed by the name of the charset they
/// convert to.
llvm::StringMap<llvm::TextEncodingConverter> TextEncodingConverters;
26+
27+
public:
28+
/// Return the cached converter stored under \p Codepage, or nullptr if
/// none has been stored.
llvm::TextEncodingConverter *getConverter(const char *Codepage);
29+
/// Return the cached converter for the charset selected by \p Action, or
/// nullptr if none has been stored.
llvm::TextEncodingConverter *getConverter(ConversionAction Action);
30+
/// Return the cached converter for \p To, creating and caching one from the
/// internal charset if needed. Returns nullptr if creation fails.
llvm::TextEncodingConverter *createAndInsertCharConverter(const char *To);
31+
/// Set the three charset names from \p Opts and \p TInfo and create the
/// converters for the system and execution charsets when they differ from
/// the internal charset. Reports err_drv_invalid_value through \p Diags if
/// the execution-charset converter cannot be created.
void setConvertersFromOptions(const clang::LangOptions &Opts,
32+
const clang::TargetInfo &TInfo,
33+
clang::DiagnosticsEngine &Diags);
34+
};
35+
36+
#endif

clang/include/clang/Lex/LiteralSupport.h

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717
#include "clang/Basic/CharInfo.h"
1818
#include "clang/Basic/LLVM.h"
1919
#include "clang/Basic/TokenKinds.h"
20+
#include "clang/Lex/LiteralConverter.h"
2021
#include "llvm/ADT/APFloat.h"
2122
#include "llvm/ADT/ArrayRef.h"
2223
#include "llvm/ADT/SmallString.h"
2324
#include "llvm/ADT/StringRef.h"
2425
#include "llvm/Support/DataTypes.h"
25-
26+
#include "llvm/Support/TextEncoding.h"
2627
namespace clang {
2728

2829
class DiagnosticsEngine;
@@ -233,6 +234,7 @@ class StringLiteralParser {
233234
const LangOptions &Features;
234235
const TargetInfo &Target;
235236
DiagnosticsEngine *Diags;
237+
LiteralConverter *LiteralConv;
236238

237239
unsigned MaxTokenLength;
238240
unsigned SizeBound;
@@ -246,18 +248,19 @@ class StringLiteralParser {
246248
StringLiteralEvalMethod EvalMethod;
247249

248250
public:
249-
StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
250-
StringLiteralEvalMethod StringMethod =
251-
StringLiteralEvalMethod::Evaluated);
251+
StringLiteralParser(
252+
ArrayRef<Token> StringToks, Preprocessor &PP,
253+
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
254+
ConversionAction Action = ToExecCharset);
252255
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
253256
const LangOptions &features, const TargetInfo &target,
254257
DiagnosticsEngine *diags = nullptr)
255258
: SM(sm), Features(features), Target(target), Diags(diags),
256-
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
257-
ResultPtr(ResultBuf.data()),
259+
LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
260+
Kind(tok::unknown), ResultPtr(ResultBuf.data()),
258261
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
259262
Pascal(false) {
260-
init(StringToks);
263+
init(StringToks, NoConversion);
261264
}
262265

263266
bool hadError;
@@ -305,7 +308,7 @@ class StringLiteralParser {
305308
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
306309

307310
private:
308-
void init(ArrayRef<Token> StringToks);
311+
void init(ArrayRef<Token> StringToks, ConversionAction Action);
309312
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
310313
StringRef Fragment);
311314
void DiagnoseLexingError(SourceLocation Loc);

clang/include/clang/Lex/Preprocessor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "clang/Basic/TokenKinds.h"
2626
#include "clang/Lex/HeaderSearch.h"
2727
#include "clang/Lex/Lexer.h"
28+
#include "clang/Lex/LiteralConverter.h"
2829
#include "clang/Lex/MacroInfo.h"
2930
#include "clang/Lex/ModuleLoader.h"
3031
#include "clang/Lex/ModuleMap.h"
@@ -162,6 +163,7 @@ class Preprocessor {
162163
std::unique_ptr<ScratchBuffer> ScratchBuf;
163164
HeaderSearch &HeaderInfo;
164165
ModuleLoader &TheModuleLoader;
166+
LiteralConverter LiteralConv;
165167

166168
/// External source of macros.
167169
ExternalPreprocessorSource *ExternalSource;
@@ -1224,6 +1226,7 @@ class Preprocessor {
12241226
SelectorTable &getSelectorTable() { return Selectors; }
12251227
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
12261228
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
1229+
LiteralConverter &getLiteralConverter() { return LiteralConv; }
12271230

12281231
void setExternalSource(ExternalPreprocessorSource *Source) {
12291232
ExternalSource = Source;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include "llvm/Support/FileSystem.h"
4848
#include "llvm/Support/Path.h"
4949
#include "llvm/Support/Process.h"
50+
#include "llvm/Support/TextEncoding.h"
5051
#include "llvm/Support/YAMLParser.h"
5152
#include "llvm/TargetParser/AArch64TargetParser.h"
5253
#include "llvm/TargetParser/ARMTargetParserCommon.h"
@@ -7589,12 +7590,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
75897590
<< value;
75907591
}
75917592

7592-
// -fexec_charset=UTF-8 is default. Reject others
7593+
// Set the default fexec-charset as the system charset.
7594+
CmdArgs.push_back("-fexec-charset");
7595+
CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
75937596
if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
75947597
StringRef value = execCharset->getValue();
7595-
if (!value.equals_insensitive("utf-8"))
7596-
D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
7597-
<< value;
7598+
llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
7599+
llvm::TextEncodingConverter::create("UTF-8", value.data());
7600+
if (ErrorOrConverter) {
7601+
CmdArgs.push_back("-fexec-charset");
7602+
CmdArgs.push_back(Args.MakeArgString(value));
7603+
} else {
7604+
D.Diag(diag::err_drv_invalid_value)
7605+
<< execCharset->getAsString(Args) << value;
7606+
}
75987607
}
75997608

76007609
RenderDiagnosticsOptions(D, Args, CmdArgs);

clang/lib/Frontend/CompilerInstance.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "clang/Frontend/Utils.h"
3333
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
3434
#include "clang/Lex/HeaderSearch.h"
35+
#include "clang/Lex/LiteralConverter.h"
3536
#include "clang/Lex/Preprocessor.h"
3637
#include "clang/Lex/PreprocessorOptions.h"
3738
#include "clang/Sema/CodeCompleteConsumer.h"
@@ -535,6 +536,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
535536

536537
if (GetDependencyDirectives)
537538
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
539+
540+
PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
541+
getDiagnostics());
538542
}
539543

540544
std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,10 +1057,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
10571057
}
10581058
}
10591059

1060-
// Macros to help identify the narrow and wide character sets
1061-
// FIXME: clang currently ignores -fexec-charset=. If this changes,
1062-
// then this may need to be updated.
1063-
Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
1060+
// Macros to help identify the narrow and wide character sets. This is set
1061+
// to fexec-charset. If fexec-charset is not specified, the default is the
1062+
// system charset.
1063+
if (!LangOpts.ExecCharset.empty())
1064+
Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecCharset);
1065+
else
1066+
Builder.defineMacro("__clang_literal_encoding__",
1067+
TI.getTriple().getSystemCharset());
10641068
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
10651069
// FIXME: 32-bit wchar_t signals UTF-32. This may change
10661070
// if -fwide-exec-charset= is ever supported.

clang/lib/Lex/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ add_clang_library(clangLex
1212
InitHeaderSearch.cpp
1313
Lexer.cpp
1414
LexHLSLRootSignature.cpp
15+
LiteralConverter.cpp
1516
LiteralSupport.cpp
1617
MacroArgs.cpp
1718
MacroInfo.cpp

clang/lib/Lex/LiteralConverter.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "clang/Lex/LiteralConverter.h"
10+
#include "clang/Basic/DiagnosticDriver.h"
11+
12+
using namespace llvm;
13+
14+
/// Look up the cached converter stored under \p Codepage.
/// Returns nullptr when no converter has been stored for that name.
llvm::TextEncodingConverter *
LiteralConverter::getConverter(const char *Codepage) {
  auto Found = TextEncodingConverters.find(Codepage);
  return Found == TextEncodingConverters.end() ? nullptr : &Found->second;
}
21+
22+
/// Return the cached converter for the charset that \p Action selects:
/// the system charset, the execution charset, or (for any other action)
/// the internal charset. Returns nullptr when none has been stored.
llvm::TextEncodingConverter *
LiteralConverter::getConverter(ConversionAction Action) {
  switch (Action) {
  case ToSystemCharset:
    return getConverter(SystemCharset.data());
  case ToExecCharset:
    return getConverter(ExecCharset.data());
  default:
    return getConverter(InternalCharset.data());
  }
}
33+
34+
/// Return a converter from the internal charset to \p To, creating and
/// caching it on first use. Returns nullptr if the converter cannot be
/// created.
llvm::TextEncodingConverter *
LiteralConverter::createAndInsertCharConverter(const char *To) {
  // Reuse a converter that was already built for this target charset.
  if (llvm::TextEncodingConverter *Cached = getConverter(To))
    return Cached;

  ErrorOr<TextEncodingConverter> Created =
      llvm::TextEncodingConverter::create(InternalCharset.data(), To);
  if (!Created)
    return nullptr;

  TextEncodingConverters.insert_or_assign(StringRef(To), std::move(*Created));
  return getConverter(To);
}
49+
50+
void LiteralConverter::setConvertersFromOptions(
51+
const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
52+
clang::DiagnosticsEngine &Diags) {
53+
using namespace llvm;
54+
SystemCharset = TInfo.getTriple().getSystemCharset();
55+
InternalCharset = "UTF-8";
56+
ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset;
57+
// Create converter between internal and system charset
58+
if (InternalCharset != SystemCharset)
59+
createAndInsertCharConverter(SystemCharset.data());
60+
61+
// Create converter between internal and exec charset specified
62+
// in fexec-charset option.
63+
if (InternalCharset == ExecCharset)
64+
return;
65+
if (!createAndInsertCharConverter(ExecCharset.data())) {
66+
Diags.Report(clang::diag::err_drv_invalid_value)
67+
<< "-fexec-charset" << ExecCharset;
68+
}
69+
}

0 commit comments

Comments
 (0)