-
Notifications
You must be signed in to change notification settings - Fork 13.7k
Enable fexec-charset option #138895
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Enable fexec-charset option #138895
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -633,6 +633,9 @@ class LangOptions : public LangOptionsBase { | |
bool AtomicFineGrainedMemory = false; | ||
bool AtomicIgnoreDenormalMode = false; | ||
|
||
/// Name of the exec charset to convert the internal charset to. | ||
std::string ExecCharset; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lets call that a TextEncoding consistently (replacing all instances of Codepage and Charset) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure if we should replace all these uses of Charset when the option name fexec-charset already has charset in it? Or do we only want the option to have that name and use encoding internally? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The option should stay |
||
|
||
LangOptions(); | ||
|
||
/// Set language defaults for the given input language and | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7247,6 +7247,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in { | |
def tune_cpu : Separate<["-"], "tune-cpu">, | ||
HelpText<"Tune for a specific cpu type">, | ||
MarshallingInfoString<TargetOpts<"TuneCPU">>; | ||
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">, | ||
HelpText<"Set the execution <charset> for string and character literals. " | ||
"Supported character encodings include ISO8859-1, UTF-8, IBM-1047 " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is too long for Maybe something like "Use for string and character literals" could work here as help text? You can put longer information with examples of character set names and references to icu / iconv into a separate |
||
"and those supported by the host icu or iconv library.">, | ||
MarshallingInfoString<LangOpts<"ExecCharset">>; | ||
def target_cpu : Separate<["-"], "target-cpu">, | ||
HelpText<"Target a specific cpu type">, | ||
MarshallingInfoString<TargetOpts<"CPU">>; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H | ||
#define LLVM_CLANG_LEX_LITERALCONVERTER_H | ||
|
||
#include "clang/Basic/Diagnostic.h" | ||
#include "clang/Basic/LangOptions.h" | ||
#include "clang/Basic/TargetInfo.h" | ||
#include "llvm/ADT/StringMap.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Support/TextEncoding.h" | ||
|
||
enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The names here were chosen to represent the different encodings that need to be used based on the context of the string literal which I mentioned in my RFC and have also copied the table to the description of this PR. I'm not sure the proposed names make that context clear |
||
|
||
class LiteralConverter { | ||
llvm::StringRef InternalCharset; | ||
llvm::StringRef SystemCharset; | ||
llvm::StringRef ExecCharset; | ||
llvm::StringMap<llvm::TextEncodingConverter> TextEncodingConverters; | ||
|
||
public: | ||
llvm::TextEncodingConverter *getConverter(const char *Codepage); | ||
llvm::TextEncodingConverter *getConverter(ConversionAction Action); | ||
llvm::TextEncodingConverter *createAndInsertCharConverter(const char *To); | ||
void setConvertersFromOptions(const clang::LangOptions &Opts, | ||
const clang::TargetInfo &TInfo, | ||
clang::DiagnosticsEngine &Diags); | ||
Comment on lines
+31
to
+33
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer, for example a static fuction that returns an optional or a null pointer failure, and let the caller call deal with error |
||
}; | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,12 +17,13 @@ | |
#include "clang/Basic/CharInfo.h" | ||
#include "clang/Basic/LLVM.h" | ||
#include "clang/Basic/TokenKinds.h" | ||
#include "clang/Lex/LiteralConverter.h" | ||
#include "llvm/ADT/APFloat.h" | ||
#include "llvm/ADT/ArrayRef.h" | ||
#include "llvm/ADT/SmallString.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Support/DataTypes.h" | ||
|
||
#include "llvm/Support/TextEncoding.h" | ||
namespace clang { | ||
|
||
class DiagnosticsEngine; | ||
|
@@ -233,6 +234,7 @@ class StringLiteralParser { | |
const LangOptions &Features; | ||
const TargetInfo &Target; | ||
DiagnosticsEngine *Diags; | ||
LiteralConverter *LiteralConv; | ||
|
||
unsigned MaxTokenLength; | ||
unsigned SizeBound; | ||
|
@@ -246,18 +248,19 @@ class StringLiteralParser { | |
StringLiteralEvalMethod EvalMethod; | ||
|
||
public: | ||
StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, | ||
StringLiteralEvalMethod StringMethod = | ||
StringLiteralEvalMethod::Evaluated); | ||
StringLiteralParser( | ||
ArrayRef<Token> StringToks, Preprocessor &PP, | ||
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated, | ||
ConversionAction Action = ToExecCharset); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need Conversion at all. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So I put up a table on my old RFC https://discourse.llvm.org/t/rfc-enabling-fexec-charset-support-to-llvm-and-clang-reposting/71512 which shows the different encoding needed depending on the context. So not all strings will be translated to the execution encoding. A separate PR after this one will be needed to ensure we use the correct encoding depending on the context. |
||
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm, | ||
const LangOptions &features, const TargetInfo &target, | ||
DiagnosticsEngine *diags = nullptr) | ||
: SM(sm), Features(features), Target(target), Diags(diags), | ||
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), | ||
ResultPtr(ResultBuf.data()), | ||
LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0), | ||
Kind(tok::unknown), ResultPtr(ResultBuf.data()), | ||
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), | ||
Pascal(false) { | ||
init(StringToks); | ||
init(StringToks, NoConversion); | ||
} | ||
|
||
bool hadError; | ||
|
@@ -305,7 +308,7 @@ class StringLiteralParser { | |
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); | ||
|
||
private: | ||
void init(ArrayRef<Token> StringToks); | ||
void init(ArrayRef<Token> StringToks, ConversionAction Action); | ||
bool CopyStringFragment(const Token &Tok, const char *TokBegin, | ||
StringRef Fragment); | ||
void DiagnoseLexingError(SourceLocation Loc); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
//===--- LiteralConverter.cpp - Translator for String Literals -----------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "clang/Lex/LiteralConverter.h" | ||
#include "clang/Basic/DiagnosticDriver.h" | ||
|
||
using namespace llvm; | ||
|
||
llvm::TextEncodingConverter * | ||
LiteralConverter::getConverter(const char *Codepage) { | ||
auto Iter = TextEncodingConverters.find(Codepage); | ||
if (Iter != TextEncodingConverters.end()) | ||
return &Iter->second; | ||
return nullptr; | ||
} | ||
|
||
llvm::TextEncodingConverter * | ||
LiteralConverter::getConverter(ConversionAction Action) { | ||
StringRef CodePage; | ||
if (Action == ToSystemCharset) | ||
CodePage = SystemCharset; | ||
else if (Action == ToExecCharset) | ||
CodePage = ExecCharset; | ||
else | ||
CodePage = InternalCharset; | ||
return getConverter(CodePage.data()); | ||
} | ||
|
||
llvm::TextEncodingConverter * | ||
LiteralConverter::createAndInsertCharConverter(const char *To) { | ||
const char *From = InternalCharset.data(); | ||
llvm::TextEncodingConverter *Converter = getConverter(To); | ||
if (Converter) | ||
return Converter; | ||
|
||
ErrorOr<TextEncodingConverter> ErrorOrConverter = | ||
llvm::TextEncodingConverter::create(From, To); | ||
if (!ErrorOrConverter) | ||
return nullptr; | ||
TextEncodingConverters.insert_or_assign(StringRef(To), | ||
std::move(*ErrorOrConverter)); | ||
return getConverter(To); | ||
} | ||
|
||
void LiteralConverter::setConvertersFromOptions( | ||
const clang::LangOptions &Opts, const clang::TargetInfo &TInfo, | ||
clang::DiagnosticsEngine &Diags) { | ||
using namespace llvm; | ||
SystemCharset = TInfo.getTriple().getSystemCharset(); | ||
InternalCharset = "UTF-8"; | ||
ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset; | ||
// Create converter between internal and system charset | ||
if (InternalCharset != SystemCharset) | ||
createAndInsertCharConverter(SystemCharset.data()); | ||
|
||
// Create converter between internal and exec charset specified | ||
// in fexec-charset option. | ||
if (InternalCharset == ExecCharset) | ||
return; | ||
if (!createAndInsertCharConverter(ExecCharset.data())) { | ||
Diags.Report(clang::diag::err_drv_invalid_value) | ||
<< "-fexec-charset" << ExecCharset; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The "typically" here seems wrong -- specifying
-fexec-charset
is atypical, and if it's specified then the macro always (not only typically) expands to that. Also referring to "the system charset" doesn't really seem right, given that for non-z/OS we use UTF-8 regardless of what the operating system would consider to be its character set. How about: