diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index ebcad44197ce4..44e20623d4d0b 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -416,8 +416,7 @@ Builtin Macros ``__clang_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of narrow string literals, e.g., ``"hello"``. This macro typically expands to - "UTF-8" (but may change in the future if the - ``-fexec-charset="Encoding-Name"`` option is implemented.) + the charset specified by -fexec-charset if specified, or the system charset. ``__clang_wide_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 491e8bee9fd5c..559a4be70b74c 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -633,6 +633,9 @@ class LangOptions : public LangOptionsBase { bool AtomicFineGrainedMemory = false; bool AtomicIgnoreDenormalMode = false; + /// Name of the exec charset to convert the internal charset to. + std::string ExecCharset; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index 1b133dde89587..34f6133973e71 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -101,6 +101,13 @@ inline bool isLiteral(TokenKind K) { isStringLiteral(K) || K == tok::header_name || K == tok::binary_data; } +/// Return true if this is a utf literal kind. +inline bool isUTFLiteral(TokenKind K) { + return K == tok::utf8_char_constant || K == tok::utf8_string_literal || + K == tok::utf16_char_constant || K == tok::utf16_string_literal || + K == tok::utf32_char_constant || K == tok::utf32_string_literal; +} + /// Return true if this is any of tok::annot_* kinds. 
bool isAnnotation(TokenKind K); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 30ea75bb108d5..9d352eb1270fe 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7197,6 +7197,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in { def tune_cpu : Separate<["-"], "tune-cpu">, HelpText<"Tune for a specific cpu type">, MarshallingInfoString<TargetOpts<"TuneCPU">>; +def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<codepage>">, + HelpText<"Set the execution <codepage> for string and character literals. " + "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 " + "and those supported by the host icu or iconv library.">, + MarshallingInfoString<LangOpts<"ExecCharset">>; def target_cpu : Separate<["-"], "target-cpu">, HelpText<"Target a specific cpu type">, MarshallingInfoString<TargetOpts<"CPU">>; diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h new file mode 100644 index 0000000000000..203111255b791 --- /dev/null +++ b/clang/include/clang/Lex/LiteralConverter.h @@ -0,0 +1,36 @@ +//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H +#define LLVM_CLANG_LEX_LITERALCONVERTER_H + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CharSet.h" + +enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset }; + +class LiteralConverter { + llvm::StringRef InternalCharset; + llvm::StringRef SystemCharset; + llvm::StringRef ExecCharset; + llvm::StringMap CharsetConverters; + +public: + llvm::CharSetConverter *getConverter(const char *Codepage); + llvm::CharSetConverter *getConverter(ConversionAction Action); + llvm::CharSetConverter *createAndInsertCharConverter(const char *To); + void setConvertersFromOptions(const clang::LangOptions &Opts, + const clang::TargetInfo &TInfo, + clang::DiagnosticsEngine &Diags); +}; + +#endif diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h index ea5f63bc20399..05c4761e21b52 100644 --- a/clang/include/clang/Lex/LiteralSupport.h +++ b/clang/include/clang/Lex/LiteralSupport.h @@ -17,10 +17,12 @@ #include "clang/Basic/CharInfo.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/TokenKinds.h" +#include "clang/Lex/LiteralConverter.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/CharSet.h" #include "llvm/Support/DataTypes.h" namespace clang { @@ -233,6 +235,7 @@ class StringLiteralParser { const LangOptions &Features; const TargetInfo &Target; DiagnosticsEngine *Diags; + LiteralConverter *LiteralConv; unsigned MaxTokenLength; unsigned SizeBound; @@ -246,18 +249,19 @@ class StringLiteralParser { StringLiteralEvalMethod EvalMethod; public: - StringLiteralParser(ArrayRef StringToks, 
Preprocessor &PP, - StringLiteralEvalMethod StringMethod = - StringLiteralEvalMethod::Evaluated); + StringLiteralParser( + ArrayRef StringToks, Preprocessor &PP, + StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated, + ConversionAction Action = ToExecCharset); StringLiteralParser(ArrayRef StringToks, const SourceManager &sm, const LangOptions &features, const TargetInfo &target, DiagnosticsEngine *diags = nullptr) : SM(sm), Features(features), Target(target), Diags(diags), - MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), - ResultPtr(ResultBuf.data()), + LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0), + Kind(tok::unknown), ResultPtr(ResultBuf.data()), EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), Pascal(false) { - init(StringToks); + init(StringToks, NoConversion); } bool hadError; @@ -305,7 +309,7 @@ class StringLiteralParser { static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); private: - void init(ArrayRef StringToks); + void init(ArrayRef StringToks, ConversionAction Action); bool CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment); void DiagnoseLexingError(SourceLocation Loc); diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index f2dfd3a349b8b..350cd2d436eb3 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -25,6 +25,7 @@ #include "clang/Basic/TokenKinds.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Lexer.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/ModuleLoader.h" #include "clang/Lex/ModuleMap.h" @@ -156,6 +157,7 @@ class Preprocessor { std::unique_ptr ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; + LiteralConverter LiteralConv; /// External source of macros. 
ExternalPreprocessorSource *ExternalSource; @@ -1218,6 +1220,7 @@ class Preprocessor { SelectorTable &getSelectorTable() { return Selectors; } Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + LiteralConverter &getLiteralConverter() { return LiteralConv; } void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f87549baff5e1..7b364634f8ff7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -50,6 +50,7 @@ #include "llvm/Frontend/Debug/Options.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/CharSet.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Compression.h" @@ -7597,12 +7598,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << value; } - // -fexec_charset=UTF-8 is default. Reject others + // Set the default fexec-charset as the system charset. 
+ CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset())); if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) { StringRef value = execCharset->getValue(); - if (!value.equals_insensitive("utf-8")) - D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args) - << value; + llvm::ErrorOr ErrorOrConverter = + llvm::CharSetConverter::create("UTF-8", value.data()); + if (ErrorOrConverter) { + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back(Args.MakeArgString(value)); + } else { + D.Diag(diag::err_drv_invalid_value) + << execCharset->getAsString(Args) << value; + } } RenderDiagnosticsOptions(D, Args, CmdArgs); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index b59496babb62c..879b81bbadabe 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -32,6 +32,7 @@ #include "clang/Frontend/Utils.h" #include "clang/Frontend/VerifyDiagnosticConsumer.h" #include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Sema/CodeCompleteConsumer.h" @@ -537,6 +538,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { if (GetDependencyDirectives) PP->setDependencyDirectivesGetter(*GetDependencyDirectives); + + PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(), + getDiagnostics()); } std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) { diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 96d6fb64a6319..47b71d0fb98c5 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1058,10 +1058,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI, } } - // Macros to help identify the narrow and wide character sets - // FIXME: 
clang currently ignores -fexec-charset=. If this changes, - // then this may need to be updated. - Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\""); + // Macros to help identify the narrow and wide character sets. This is set + // to fexec-charset. If fexec-charset is not specified, the default is the + // system charset. + if (!LangOpts.ExecCharset.empty()) + Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecCharset); + else + Builder.defineMacro("__clang_literal_encoding__", + TI.getTriple().getSystemCharset()); if (TI.getTypeWidth(TI.getWCharType()) >= 32) { // FIXME: 32-bit wchar_t signals UTF-32. This may change // if -fwide-exec-charset= is ever supported. diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..9e38a1b8fbb44 100644 --- a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_library(clangLex InitHeaderSearch.cpp Lexer.cpp LexHLSLRootSignature.cpp + LiteralConverter.cpp LiteralSupport.cpp MacroArgs.cpp MacroInfo.cpp diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp new file mode 100644 index 0000000000000..a89781e182e83 --- /dev/null +++ b/clang/lib/Lex/LiteralConverter.cpp @@ -0,0 +1,68 @@ +//===--- LiteralConverter.cpp - Translator for String Literals -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/LiteralConverter.h" +#include "clang/Basic/DiagnosticDriver.h" + +using namespace llvm; + +llvm::CharSetConverter *LiteralConverter::getConverter(const char *Codepage) { + auto Iter = CharsetConverters.find(Codepage); + if (Iter != CharsetConverters.end()) + return &Iter->second; + return nullptr; +} + +llvm::CharSetConverter * +LiteralConverter::getConverter(ConversionAction Action) { + StringRef CodePage; + if (Action == ToSystemCharset) + CodePage = SystemCharset; + else if (Action == ToExecCharset) + CodePage = ExecCharset; + else + CodePage = InternalCharset; + return getConverter(CodePage.data()); +} + +llvm::CharSetConverter * +LiteralConverter::createAndInsertCharConverter(const char *To) { + const char *From = InternalCharset.data(); + llvm::CharSetConverter *Converter = getConverter(To); + if (Converter) + return Converter; + + ErrorOr ErrorOrConverter = + llvm::CharSetConverter::create(From, To); + if (!ErrorOrConverter) + return nullptr; + CharsetConverters.insert_or_assign(StringRef(To), + std::move(*ErrorOrConverter)); + return getConverter(To); +} + +void LiteralConverter::setConvertersFromOptions( + const clang::LangOptions &Opts, const clang::TargetInfo &TInfo, + clang::DiagnosticsEngine &Diags) { + using namespace llvm; + SystemCharset = TInfo.getTriple().getSystemCharset(); + InternalCharset = "UTF-8"; + ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset; + // Create converter between internal and system charset + if (!InternalCharset.equals(SystemCharset)) + createAndInsertCharConverter(SystemCharset.data()); + + // Create converter between internal and exec charset specified + // in fexec-charset option. 
+ if (InternalCharset.equals(ExecCharset)) + return; + if (!createAndInsertCharConverter(ExecCharset.data())) { + Diags.Report(clang::diag::err_drv_invalid_value) + << "-fexec-charset" << ExecCharset; + } +} diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 75ad977d64b24..b7cee5a872c35 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -128,13 +128,11 @@ static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) { /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. -static unsigned ProcessCharEscape(const char *ThisTokBegin, - const char *&ThisTokBuf, - const char *ThisTokEnd, bool &HadError, - FullSourceLoc Loc, unsigned CharWidth, - DiagnosticsEngine *Diags, - const LangOptions &Features, - StringLiteralEvalMethod EvalMethod) { +static unsigned ProcessCharEscape( + const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, + bool &HadError, FullSourceLoc Loc, unsigned CharWidth, + DiagnosticsEngine *Diags, const LangOptions &Features, + StringLiteralEvalMethod EvalMethod, llvm::CharSetConverter *Converter) { const char *EscapeBegin = ThisTokBuf; bool Delimited = false; bool EndDelimiterFound = false; @@ -146,6 +144,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, // that would have been \", which would not have been the end of string. unsigned ResultChar = *ThisTokBuf++; char Escape = ResultChar; + bool Translate = true; + bool Invalid = false; switch (ResultChar) { // These map to themselves. case '\\': case '\'': case '"': case '?': break; @@ -186,6 +186,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, ResultChar = 11; break; case 'x': { // Hex escape. 
+ Translate = false; ResultChar = 0; if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { Delimited = true; @@ -249,6 +250,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, case '4': case '5': case '6': case '7': { // Octal escapes. --ThisTokBuf; + Translate = false; ResultChar = 0; // Octal escapes are a series of octal digits with maximum length 3. @@ -334,6 +336,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, << std::string(1, ResultChar); break; default: + Invalid = true; if (!Diags) break; @@ -367,6 +370,15 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, HadError = true; } + if (Translate && Converter) { + // Invalid escapes are written as '?' and then translated. + char ByteChar = Invalid ? '?' : ResultChar; + SmallString<8> ResultCharConv; + Converter->convert(StringRef(&ByteChar, 1), ResultCharConv); + assert(ResultCharConv.size() == 1 && + "Char size increased after translation"); + ResultChar = ResultCharConv[0]; + } return ResultChar; } @@ -1739,6 +1751,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = false; Kind = kind; + LiteralConverter *LiteralConv = &PP.getLiteralConverter(); const char *TokBegin = begin; @@ -1805,6 +1818,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, largest_character_for_kind = 0x7Fu; } + llvm::CharSetConverter *Converter = nullptr; + if (!isUTFLiteral(Kind) && LiteralConv) + Converter = LiteralConv->getConverter(ToExecCharset); + while (begin != end) { // Is this a span of non-escape characters? 
if (begin[0] != '\\') { @@ -1842,6 +1859,16 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = true; PP.Diag(Loc, diag::err_character_too_large); } + if (!HadError && Converter) { + assert(Kind != tok::wide_char_constant && + "Wide character translation not supported"); + char ByteChar = *tmp_out_start; + SmallString<1> ConvertedChar; + Converter->convert(StringRef(&ByteChar, 1), ConvertedChar); + assert(ConvertedChar.size() == 1 && + "Char size increased after translation"); + *tmp_out_start = ConvertedChar[0]; + } } } @@ -1849,16 +1876,35 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } // Is this a Universal Character Name escape? if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') { - unsigned short UcnLen = 0; - if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, - FullSourceLoc(Loc, PP.getSourceManager()), - &PP.getDiagnostics(), PP.getLangOpts(), true)) { - HadError = true; - } else if (*buffer_begin > largest_character_for_kind) { - HadError = true; - PP.Diag(Loc, diag::err_character_too_large); + if (Converter == nullptr) { + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, + FullSourceLoc(Loc, PP.getSourceManager()), + &PP.getDiagnostics(), PP.getLangOpts(), true)) { + HadError = true; + } else if (*buffer_begin > largest_character_for_kind) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } + } else { + char Cp[8]; + char *ResultPtr = Cp; + unsigned CharByteWidth = 1; + EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError, + FullSourceLoc(Loc, PP.getSourceManager()), + CharByteWidth, &PP.getDiagnostics(), PP.getLangOpts()); + if (!HadError) { + SmallString<8> CpConv; // Cp has no NUL terminator; pass its length. + Converter->convert(StringRef(Cp, ResultPtr - Cp), CpConv); + if (CpConv.size() > 1) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } else { + memcpy(Cp, CpConv.data(), CpConv.size()); + *buffer_begin = *Cp; + } + } } -
++buffer_begin; continue; } @@ -1867,7 +1913,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, ProcessCharEscape(TokBegin, begin, end, HadError, FullSourceLoc(Loc, PP.getSourceManager()), CharWidth, &PP.getDiagnostics(), PP.getLangOpts(), - StringLiteralEvalMethod::Evaluated); + StringLiteralEvalMethod::Evaluated, nullptr); *buffer_begin++ = result; } @@ -1977,16 +2023,18 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, /// StringLiteralParser::StringLiteralParser(ArrayRef StringToks, Preprocessor &PP, - StringLiteralEvalMethod EvalMethod) + StringLiteralEvalMethod EvalMethod, + ConversionAction Action) : SM(PP.getSourceManager()), Features(PP.getLangOpts()), Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()), - MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), - ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false), - Pascal(false) { - init(StringToks); + LiteralConv(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0), + CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()), + EvalMethod(EvalMethod), hadError(false), Pascal(false) { + init(StringToks, Action); } -void StringLiteralParser::init(ArrayRef StringToks){ +void StringLiteralParser::init(ArrayRef StringToks, + ConversionAction Action) { // The literal token may have come from an invalid source location (e.g. due // to a PCH error), in which case the token length will be 0. if (StringToks.empty() || StringToks[0].getLength() < 2) @@ -2078,6 +2126,10 @@ void StringLiteralParser::init(ArrayRef StringToks){ SourceLocation UDSuffixTokLoc; + llvm::CharSetConverter *Converter = nullptr; + if (!isUTFLiteral(Kind) && LiteralConv) + Converter = LiteralConv->getConverter(Action); + for (unsigned i = 0, e = StringToks.size(); i != e; ++i) { const char *ThisTokBuf = &TokenBuf[0]; // Get the spelling of the token, which eliminates trigraphs, etc. 
We know @@ -2191,6 +2243,16 @@ void StringLiteralParser::init(ArrayRef StringToks){ if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF)) hadError = true; + if (!hadError && Converter) { + assert(Kind != tok::wide_string_literal && + "Wide character translation not supported"); + SmallString<256> CpConv; + int ResultLength = BeforeCRLF.size() * CharByteWidth; + char *Cp = ResultPtr - ResultLength; + Converter->convert(StringRef(Cp, ResultLength), CpConv); + memcpy(Cp, CpConv.data(), CpConv.size()); // length may differ + ResultPtr = Cp + CpConv.size(); + } // Point into the \n inside the \r\n sequence and operate on the // remaining portion of the literal. RemainingTokenSpan = AfterCRLF.substr(1); @@ -2225,26 +2287,45 @@ void StringLiteralParser::init(ArrayRef StringToks){ ++ThisTokBuf; } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); + int Length = ThisTokBuf - InStart; // Copy the character span over. if (CopyStringFragment(StringToks[i], ThisTokBegin, StringRef(InStart, ThisTokBuf - InStart))) hadError = true; + + if (!hadError && Converter) { + assert(Kind != tok::wide_string_literal && + "Wide character translation not supported"); + SmallString<256> CpConv; + int ResultLength = Length * CharByteWidth; + char *Cp = ResultPtr - ResultLength; + Converter->convert(StringRef(Cp, ResultLength), CpConv); + memcpy(Cp, CpConv.data(), CpConv.size()); // length may differ + ResultPtr = Cp + CpConv.size(); + } continue; } // Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' || ThisTokBuf[1] == 'N') { - EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, - ResultPtr, hadError, + char *Cp = ResultPtr; + EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr, + hadError, FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth, Diags, Features); + if (!hadError && Converter) { + SmallString<8> CpConv; // Do not rely on a NUL after the UCN bytes. + Converter->convert(StringRef(Cp, ResultPtr - Cp), CpConv); + memcpy(Cp, CpConv.data(), CpConv.size()); + ResultPtr = Cp + CpConv.size(); + } continue; } // Otherwise, this is a non-UCN escape character. Process it. - unsigned ResultChar = - ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, - FullSourceLoc(StringToks[i].getLocation(), SM), - CharByteWidth * 8, Diags, Features, EvalMethod); + unsigned ResultChar = ProcessCharEscape( + ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, + FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth * 8, + Diags, Features, EvalMethod, Converter); if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of @@ -2442,7 +2523,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, } else { ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8, - Diags, Features, StringLiteralEvalMethod::Evaluated); + Diags, Features, StringLiteralEvalMethod::Evaluated, + nullptr); --ByteNo; } assert(!HadError && "This method isn't valid on erroneous strings"); diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c new file mode 100644 index 0000000000000..aab43157b1be4 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s +// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s + +const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +// CHECK:
c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00" + +const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz"; +//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00" + +const char *Digits = "0123456789"; +// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00" + +const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@="; +// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00" + +const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00" + +const char *InvalidEscape = "\y\z"; +//CHECK: c"oo\00" + +const char *HexCharacters = "\x12\x13\x14"; +//CHECK: c"\12\13\14\00" + +const char *OctalCharacters = "\141\142\143"; +//CHECK: c"abc\00" + +const char singleChar = 'a'; +//CHECK: i8 -127 + +const char *UcnCharacters = "\u00E2\u00AC\U000000DF"; +//CHECK: c"B\B0Y\00" + +const char *Unicode = "ÿ"; +//CHECK: c"\DF\00" diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp new file mode 100644 index 0000000000000..7e66407fd2ff1 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.cpp @@ -0,0 +1,46 @@ +// RUN: %clang %s -std=c++17 -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s + +const char *RawString = R"(Hello\n)"; +//CHECK: c"\C8\85\93\93\96\E0\95\00" + +const char *MultiLineRawString = R"( +Hello +There)"; +//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00" + +char UnicodeChar8 = u8'1'; +//CHECK: i8 49 +char16_t UnicodeChar16 = u'1'; +//CHECK: i16 49 +char32_t UnicodeChar32 = U'1'; +//CHECK: i32 49 + +const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" + +const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0] + +const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 
92, i32 39, i32 34, i32 63, i32 0] + +const char *UnicodeString8 = u8"Hello"; +//CHECK: c"Hello\00" +const char16_t *UnicodeString16 = u"Hello"; +//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0] +const char32_t *UnicodeString32 = U"Hello"; +//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0] + +const char *UnicodeRawString8 = u8R"("Hello\")"; +//CHECK: c"\22Hello\\\22\00" +const char16_t *UnicodeRawString16 = uR"("Hello\")"; +//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0] +const char32_t *UnicodeRawString32 = UR"("Hello\")"; +//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0] + +const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF"; +//CHECK: c"\C3\A2\C2\AC\C3\9F\00" +const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0] +const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0] diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index beaef09c8a8d9..026172ed2c167 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -243,10 +243,11 @@ // RUN: not %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s // source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16' -// /execution-charset: should warn on everything except UTF-8. -// RUN: not %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s -// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16' +// /execution-charset: should warn on invalid charsets. 
+// RUN: not %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s +// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset' // + // RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // U: "-U" "mymacro" diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index ee7ded265769b..d0efa5cab933b 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -217,8 +217,14 @@ // RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-CHARSET %s // CHECK-INVALID-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1' -// RUN: not %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s -// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1' +// RUN: not %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s +// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset' + +// Test that we support the following exec charsets. +// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s +// RUN: %clang -### -S -fexec-charset=ISO8859-1 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s +// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s +// INVALID-NOT: error: invalid value // Test that we don't error on these.
// RUN: not %clang -### -S -Werror \ @@ -232,7 +238,7 @@ // RUN: -fident -fno-ident \ // RUN: -fimplicit-templates -fno-implicit-templates \ // RUN: -finput-charset=UTF-8 \ -// RUN: -fexec-charset=UTF-8 \ +// RUN: -fexec-charset=UTF-8 \ // RUN: -fivopts -fno-ivopts \ // RUN: -fnon-call-exceptions -fno-non-call-exceptions \ // RUN: -fpermissive -fno-permissive \ diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c index a8fbde46cbb75..9ff122def913f 100644 --- a/clang/test/Preprocessor/init-s390x.c +++ b/clang/test/Preprocessor/init-s390x.c @@ -206,4 +206,5 @@ // S390X-ZOS: #define __TOS_390__ 1 // S390X-ZOS: #define __TOS_MVS__ 1 // S390X-ZOS: #define __XPLINK__ 1 +// S390X-ZOS: #define __clang_literal_encoding__ IBM-1047 // S390X-ZOS-GNUXX: #define __wchar_t 1 diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index e8d9ec0d6153a..894c0e1d2e5ae 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -592,6 +592,10 @@ else() option(LLVM_ENABLE_THREADS "Use threads if available." ON) endif() +set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON") + +set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON") + set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON") set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available.
Can be ON, OFF, or FORCE_ON") diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 43311dad457ec..f7e826b34d26f 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128) set(LLVM_HAS_LOGF128 "${HAS_LOGF128}") endif() +if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON) + message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON") +endif() + +# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing. The library search suffixes are limited to the shared-library suffix for this lookup and restored afterwards. +if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON)) + set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}") + if (LLVM_ENABLE_ICU STREQUAL FORCE_ON) + find_package(ICU REQUIRED COMPONENTS uc i18n) + if (NOT ICU_FOUND) + message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON") + endif() + else() + find_package(ICU COMPONENTS uc i18n) + endif() + set(HAVE_ICU ${ICU_FOUND}) + set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES}) +endif() + +# Check for builtin iconv to avoid licensing issues. Skipped when ICU was found; HAVE_ICONV is only set when Iconv_IS_BUILT_IN is true.
+if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU) + if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON) + find_package(Iconv REQUIRED) + if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN) + message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON") + endif() + else() + find_package(Iconv) + endif() + if(Iconv_FOUND AND Iconv_IS_BUILT_IN) + set(HAVE_ICONV 1) + endif() +endif() + # function checks check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM) find_package(Backtrace) diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 7efac55ab0352..3f70a0150da4f 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -236,6 +236,12 @@ /* Have host's ___chkstk_ms */ #cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS} +/* Define if ICU library is available */ +#cmakedefine HAVE_ICU ${HAVE_ICU} + +/* Define if iconv library is available */ +#cmakedefine HAVE_ICONV ${HAVE_ICONV} + /* Linker version detected at compile time. */ #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}" diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h new file mode 100644 index 0000000000000..6a28cd19f4143 --- /dev/null +++ b/llvm/include/llvm/Support/CharSet.h @@ -0,0 +1,141 @@ +//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides a utility class to convert between different character +/// set encodings.
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_CHARSET_H +#define LLVM_SUPPORT_CHARSET_H + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Config/config.h" +#include "llvm/Support/ErrorOr.h" + +#include +#include + +namespace llvm { + +template class SmallVectorImpl; + +namespace details { +class CharSetConverterImplBase { + +private: + /// Converts a string. + /// \param[in] Source source string + /// \param[out] Result container for converted string + /// \return error code in case something went wrong + /// + /// The following error codes can occur, among others: + /// - std::errc::argument_list_too_long: The result requires more than + /// std::numeric_limits::max() bytes. + /// - std::errc::illegal_byte_sequence: The input contains an invalid + /// multibyte sequence. + /// - std::errc::invalid_argument: The input contains an incomplete + /// multibyte sequence. + /// + /// If the destination charset is a stateful character set, the shift state + /// will be set to the initial state. + /// + /// In case of an error, the result string contains the successfully converted + /// part of the input string. + /// + virtual std::error_code convertString(StringRef Source, + SmallVectorImpl &Result) = 0; + + /// Resets the converter to the initial state. + virtual void reset() = 0; + +public: + virtual ~CharSetConverterImplBase() = default; + + /// Converts a string and resets the converter to the initial state. + std::error_code convert(StringRef Source, SmallVectorImpl &Result) { + auto EC = convertString(Source, Result); + reset(); + return EC; + } +}; +} // namespace details + +// Names inspired by https://wg21.link/p1885. +namespace text_encoding { +enum class id { + /// UTF-8 character set encoding. + UTF8, + + /// IBM EBCDIC 1047 character set encoding.
+ IBM1047 +}; +} // end namespace text_encoding + +/// Utility class to convert between different character set encodings. Copying is deleted; instances can only be moved. +class CharSetConverter { + std::unique_ptr Converter; + + CharSetConverter(std::unique_ptr Converter) + : Converter(std::move(Converter)) {} + +public: + /// Creates a CharSetConverter instance. + /// Returns std::errc::invalid_argument in case the requested conversion is + /// not supported. + /// \param[in] CSFrom the source character encoding + /// \param[in] CSTo the target character encoding + /// \return a CharSetConverter instance or an error code + static ErrorOr create(text_encoding::id CSFrom, + text_encoding::id CSTo); + + /// Creates a CharSetConverter instance. + /// Returns std::errc::invalid_argument in case the requested conversion is + /// not supported. + /// \param[in] CPFrom name of the source character encoding + /// \param[in] CPTo name of the target character encoding + /// \return a CharSetConverter instance or an error code + static ErrorOr create(StringRef CPFrom, StringRef CPTo); + + CharSetConverter(const CharSetConverter &) = delete; + CharSetConverter &operator=(const CharSetConverter &) = delete; + + CharSetConverter(CharSetConverter &&Other) + : Converter(std::move(Other.Converter)) {} + + CharSetConverter &operator=(CharSetConverter &&Other) { + if (this != &Other) + Converter = std::move(Other.Converter); + return *this; + } + + ~CharSetConverter() = default; + + /// Converts a string into the caller-provided Result container.
+ /// \param[in] Source source string + /// \param[out] Result container for converted string + /// \return error code in case something went wrong + std::error_code convert(StringRef Source, + SmallVectorImpl &Result) const { + return Converter->convert(Source, Result); + } + + ErrorOr convert(StringRef Source) const { + SmallString<100> Result; + auto EC = Converter->convert(Source, Result); + if (!EC) + return std::string(Result); + return EC; + } +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 7fd5278f1ed53..059c176eaa56d 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -491,6 +491,9 @@ class Triple { /// For example, "fooos1.2.3" would return "1.2.3". StringRef getEnvironmentVersionString() const; + /// getSystemCharset - Get the system charset of the triple: "IBM-1047" when the OS is z/OS, "UTF-8" otherwise. + StringRef getSystemCharset() const; + /// @} /// @name Convenience Predicates /// @{ diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index df1e65f3a588c..9a7d26a35bf1a 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -162,6 +162,7 @@ add_llvm_component_library(LLVMSupport CachePruning.cpp Caching.cpp circular_raw_ostream.cpp + CharSet.cpp Chrono.cpp COM.cpp CodeGenCoverage.cpp @@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport Demangle ) +# Link ICU library if it is an external library. +if(ICU_FOUND) + target_link_libraries(LLVMSupport + PRIVATE + ${ICU_LIBRARIES} + ) +endif() + set(llvm_system_libs ${system_libs}) # This block is only needed for llvm-config.
When we deprecate llvm-config and diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp new file mode 100644 index 0000000000000..6810cf9c6e376 --- /dev/null +++ b/llvm/lib/Support/CharSet.cpp @@ -0,0 +1,344 @@ +//===-- CharSet.cpp - Characters sets conversion class ------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides utility classes to convert between different character +/// set encodings. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CharSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ConvertEBCDIC.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +#ifdef HAVE_ICU +#include +#elif defined(HAVE_ICONV) +#include +#endif + +using namespace llvm; + +// Normalize the charset name with the charset alias matching algorithm proposed +// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching. +static void normalizeCharSetName(StringRef CSName, + SmallVectorImpl &Normalized) { + bool PrevDigit = false; + for (auto Ch : CSName) { + if (isAlnum(Ch)) { + Ch = toLower(Ch); + if (Ch != '0' || PrevDigit) { + PrevDigit = isDigit(Ch); + Normalized.push_back(Ch); + } + } + } +} + +// Maps the charset name to enum constant if possible.
+static std::optional getKnownCharSet(StringRef CSName) { + SmallString<16> Normalized; + normalizeCharSetName(CSName, Normalized); + if (Normalized.equals("utf8")) + return text_encoding::id::UTF8; + if (Normalized.equals("ibm1047")) + return text_encoding::id::IBM1047; + return std::nullopt; +} + +LLVM_ATTRIBUTE_UNUSED static void +HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, + SmallVectorImpl &Result) { + // No space left in output buffer. Double the size of the underlying + // memory in the SmallVectorImpl (an empty buffer grows to one byte so the + // caller's retry loop always makes progress), adjust pointer and length. + Capacity = (Capacity < std::numeric_limits::max() / 2) + ? (Capacity ? 2 * Capacity : 1) + : std::numeric_limits::max(); + Result.resize(0); + Result.resize_for_overwrite(Capacity); + Output = static_cast(Result.data()); + OutputLength = Capacity; +} + +namespace { +enum ConversionType { + UTF8ToIBM1047, + IBM1047ToUTF8, +}; + +// Support conversion between EBCDIC 1047 and UTF-8. This class uses +// built-in translation tables that allow for translation between the +// aforementioned character sets. The use of tables for conversion is only +// possible because EBCDIC 1047 is a single-byte, stateless encoding; other +// character sets are not supported.
+class CharSetConverterTable : public details::CharSetConverterImplBase { + const ConversionType ConvType; + +public: + CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {} + + std::error_code convertString(StringRef Source, + SmallVectorImpl &Result) override; + + void reset() override {} +}; + +std::error_code +CharSetConverterTable::convertString(StringRef Source, + SmallVectorImpl &Result) { + if (ConvType == IBM1047ToUTF8) { + ConverterEBCDIC::convertToUTF8(Source, Result); + return std::error_code(); + } else if (ConvType == UTF8ToIBM1047) { + return ConverterEBCDIC::convertToEBCDIC(Source, Result); + } + llvm_unreachable("Invalid ConvType!"); + return std::error_code(); +} + +#ifdef HAVE_ICU +struct UConverterDeleter { + void operator()(UConverter *Converter) const { + if (Converter) + ucnv_close(Converter); + } +}; +using UConverterUniquePtr = std::unique_ptr; + +class CharSetConverterICU : public details::CharSetConverterImplBase { + UConverterUniquePtr FromConvDesc; + UConverterUniquePtr ToConvDesc; + +public: + CharSetConverterICU(UConverterUniquePtr FromConverter, + UConverterUniquePtr ToConverter) + : FromConvDesc(std::move(FromConverter)), + ToConvDesc(std::move(ToConverter)) {} + + std::error_code convertString(StringRef Source, + SmallVectorImpl &Result) override; + + void reset() override; +}; + +std::error_code +CharSetConverterICU::convertString(StringRef Source, + SmallVectorImpl &Result) { + // Setup the input in case it has no backing data. + size_t InputLength = Source.size(); + const char *In = InputLength ? const_cast(Source.data()) : ""; + + // Setup the output. We directly write into the SmallVector.
+ size_t Capacity = Result.capacity(); + size_t OutputLength = Capacity; + Result.resize_for_overwrite(Capacity); + char *Output = static_cast(Result.data()); + UErrorCode EC = U_ZERO_ERROR; + + ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, + &EC); + ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, + NULL, &EC); + assert(U_SUCCESS(EC)); + + do { + EC = U_ZERO_ERROR; + const char *Input = In; + // Always target the buffer; a null target would corrupt the size math below. + Output = static_cast(Result.data()); + ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input, + In + InputLength, /*pivotStart=*/NULL, + /*pivotSource=*/NULL, /*pivotTarget=*/NULL, + /*pivotLimit=*/NULL, /*reset=*/true, + /*flush=*/true, &EC); + if (U_FAILURE(EC)) { + if (EC == U_BUFFER_OVERFLOW_ERROR && + Capacity < std::numeric_limits::max()) { + HandleOverflow(Capacity, Output, OutputLength, Result); + continue; + } + // Some other error occurred. + Result.resize(Output - Result.data()); + return std::error_code(EILSEQ, std::generic_category()); + } + break; + } while (true); + + Result.resize(Output - Result.data()); + return std::error_code(); +} + +void CharSetConverterICU::reset() { + ucnv_reset(&*FromConvDesc); + ucnv_reset(&*ToConvDesc); +} + +#elif defined(HAVE_ICONV) +class CharSetConverterIconv : public details::CharSetConverterImplBase { + class UniqueIconvT { + iconv_t ConvDesc; + + public: + operator iconv_t() const { return ConvDesc; } + UniqueIconvT(iconv_t CD) : ConvDesc(CD) {} + ~UniqueIconvT() { + if (ConvDesc != (iconv_t)-1) { + iconv_close(ConvDesc); + ConvDesc = (iconv_t)-1; + } + } + UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) { + Other.ConvDesc = (iconv_t)-1; + } + UniqueIconvT &operator=(UniqueIconvT &&Other) { + if (&Other != this) { + // Swap so Other's destructor closes the descriptor we held before. + std::swap(ConvDesc, Other.ConvDesc); + } + return *this; + } + }; + UniqueIconvT ConvDesc; + +public: + CharSetConverterIconv(UniqueIconvT ConvDesc) + :
ConvDesc(std::move(ConvDesc)) {} + + std::error_code convertString(StringRef Source, + SmallVectorImpl &Result) override; + + void reset() override; +}; + +std::error_code +CharSetConverterIconv::convertString(StringRef Source, + SmallVectorImpl &Result) { + // Setup the output. We directly write into the SmallVector. + size_t Capacity = Result.capacity(); + char *Output = static_cast(Result.data()); + size_t OutputLength = Capacity; + Result.resize_for_overwrite(Capacity); + + size_t Ret; + // Handle errors returned from iconv(). + auto HandleError = [&Capacity, &Output, &OutputLength, &Result, + this](size_t Ret) { + if (Ret == static_cast(-1)) { + // An error occurred. Check if we can gracefully handle it. + if (errno == E2BIG && Capacity < std::numeric_limits::max()) { + HandleOverflow(Capacity, Output, OutputLength, Result); + // Reset converter + iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr); + return std::error_code(); + } else { + // Some other error occurred. + Result.resize(Output - Result.data()); + return std::error_code(errno, std::generic_category()); + } + } else { + // A positive return value indicates that some characters were converted + // in a nonreversible way, that is, replaced with a SUB symbol. Returning + // an error in this case makes sure that both conversion routines behave + // in the same way. + return std::make_error_code(std::errc::illegal_byte_sequence); + } + }; + + do { + // Setup the input. Use nullptr to reset iconv state if input length is + // zero. + size_t InputLength = Source.size(); + char *Input = InputLength ?
const_cast(Source.data()) : nullptr; + Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength); + if (Ret != 0) { + if (auto EC = HandleError(Ret)) + return EC; + continue; + } + // Flush the converter + Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength); + if (Ret != 0) { + if (auto EC = HandleError(Ret)) + return EC; + continue; + } + break; + } while (true); + + // Re-adjust size to actual size. + Result.resize(Output - Result.data()); + return std::error_code(); +} + +void CharSetConverterIconv::reset() { + iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr); +} + +#endif // HAVE_ICONV +} // namespace + +ErrorOr CharSetConverter::create(text_encoding::id CPFrom, + text_encoding::id CPTo) { + + assert(CPFrom != CPTo && "Text encodings should be distinct"); + + ConversionType Conversion; + if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047) + Conversion = UTF8ToIBM1047; + else if (CPFrom == text_encoding::id::IBM1047 && + CPTo == text_encoding::id::UTF8) + Conversion = IBM1047ToUTF8; + else + return std::make_error_code(std::errc::invalid_argument); + + std::unique_ptr Converter = + std::make_unique(Conversion); + return CharSetConverter(std::move(Converter)); +} + +ErrorOr CharSetConverter::create(StringRef CSFrom, + StringRef CSTo) { + std::optional From = getKnownCharSet(CSFrom); + std::optional To = getKnownCharSet(CSTo); + if (From && To && *From != *To) { // equal ids would trip the assert in create() + ErrorOr Converter = create(*From, *To); + if (Converter) + return Converter; + } +#ifdef HAVE_ICU + UErrorCode EC = U_ZERO_ERROR; + UConverterUniquePtr FromConvDesc(ucnv_open(CSFrom.str().c_str(), &EC)); + if (U_FAILURE(EC)) { + return std::make_error_code(std::errc::invalid_argument); + } + UConverterUniquePtr ToConvDesc(ucnv_open(CSTo.str().c_str(), &EC)); + if (U_FAILURE(EC)) { + return std::make_error_code(std::errc::invalid_argument); + } + std::unique_ptr Converter = + std::make_unique(std::move(FromConvDesc), + std::move(ToConvDesc)); + return
CharSetConverter(std::move(Converter)); +#elif defined(HAVE_ICONV) + iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str()); + if (ConvDesc == (iconv_t)-1) + return std::error_code(errno, std::generic_category()); + std::unique_ptr Converter = + std::make_unique(ConvDesc); + return CharSetConverter(std::move(Converter)); +#else + return std::make_error_code(std::errc::invalid_argument); +#endif +} diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 6a559ff023caa..4f55d05528839 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1384,6 +1384,13 @@ StringRef Triple::getOSAndEnvironmentName() const { return Tmp.split('-').second; // Strip second component } +// System charset on z/OS is IBM-1047 and UTF-8 otherwise +StringRef Triple::getSystemCharset() const { + if (getOS() == llvm::Triple::ZOS) + return "IBM-1047"; + return "UTF-8"; +} + static VersionTuple parseVersionFromName(StringRef Name) { VersionTuple Version; Version.tryParse(Name); diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index b6b9398df5e2e..09e55f116f780 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_unittest(SupportTests CrashRecoveryTest.cpp Caching.cpp Casting.cpp + CharSetTest.cpp CheckedArithmeticTest.cpp Chrono.cpp CommandLineTest.cpp diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp new file mode 100644 index 0000000000000..772d46ec73497 --- /dev/null +++ b/llvm/unittests/Support/CharSetTest.cpp @@ -0,0 +1,232 @@ +//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CharSet.h" +#include "llvm/ADT/SmallString.h" +#include "gtest/gtest.h" +using namespace llvm; + +namespace { + +// String "Hello World!" followed by a newline (0x0a in ASCII, 0x15 in EBCDIC). +static const char HelloA[] = + "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a"; +static const char HelloE[] = + "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15"; + +// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +static const char ABCStrA[] = + "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52" + "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A" + "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A"; +static const char ABCStrE[] = + "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9" + "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91" + "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9"; + +// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë" +static const char AccentUTF[] = + "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89" + "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9" + "\xc3\xaa\xc3\xab"; +static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72" + "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53"; + +// String with Cyrillic character ya. +static const char CyrillicUTF[] = "\xd0\xaf"; + +// String "Earth地球". +// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and +// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII. +// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts +// back.
+static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83"; +static const char EarthISO2022[] = + "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42"; +static const char EarthIBM939[] = + "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f"; +static const char EarthUTFExtraPartial[] = + "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83\xe5"; + +TEST(CharSet, FromUTF8) { + // Hello string. + StringRef Src(HelloA); + SmallString<64> Dst; + + ErrorOr Conv = CharSetConverter::create( + text_encoding::id::UTF8, text_encoding::id::IBM1047); + + // Stop test if conversion is not supported. + if (!Conv) { + ASSERT_EQ(Conv.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + std::error_code EC = Conv->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloE, static_cast(Dst).c_str()); + Dst.clear(); + + // ABC string. + Src = ABCStrA; + EC = Conv->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrE, static_cast(Dst).c_str()); + Dst.clear(); + + // Accent string. + Src = AccentUTF; + EC = Conv->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(AccentE, static_cast(Dst).c_str()); + Dst.clear(); + + // Cyrillic string. Results in error because not representable in 1047. + Src = CyrillicUTF; + EC = Conv->convert(Src, Dst); + EXPECT_EQ(EC, std::errc::illegal_byte_sequence); +} + +TEST(CharSet, ToUTF8) { + // Hello string. + StringRef Src(HelloE); + SmallString<64> Dst; + + ErrorOr Conv = CharSetConverter::create( + text_encoding::id::IBM1047, text_encoding::id::UTF8); + + // Stop test if conversion is not supported. + if (!Conv) { + ASSERT_EQ(Conv.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + std::error_code EC = Conv->convert(Src, Dst); + + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloA, static_cast(Dst).c_str()); + Dst.clear(); + + // ABC string.
+ Src = ABCStrE; + EC = Conv->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrA, static_cast(Dst).c_str()); + Dst.clear(); + + // Accent string. + Src = AccentE; + EC = Conv->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(AccentUTF, static_cast(Dst).c_str()); +} + +TEST(CharSet, RoundTrip) { + ErrorOr ConvToUTF16 = + CharSetConverter::create("IBM-1047", "UTF-16"); + // Stop test if conversion is not supported (no underlying ICU or iconv support). + if (!ConvToUTF16) { + ASSERT_EQ(ConvToUTF16.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + ErrorOr ConvToUTF32 = + CharSetConverter::create("UTF-16", "UTF-32"); + // Stop test if conversion is not supported (no underlying ICU or iconv support). + if (!ConvToUTF32) { + ASSERT_EQ(ConvToUTF32.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + ErrorOr ConvToEBCDIC = + CharSetConverter::create("UTF-32", "IBM-1047"); + // Stop test if conversion is not supported (no underlying ICU or iconv support). + if (!ConvToEBCDIC) { + ASSERT_EQ(ConvToEBCDIC.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Setup source string. + char SrcStr[256]; + for (size_t I = 0; I < 256; ++I) + SrcStr[I] = (I + 1) % 256; + + SmallString<99> Dst1Str, Dst2Str, Dst3Str; + + std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str); + EXPECT_TRUE(!EC); + EC = ConvToUTF32->convert(Dst1Str, Dst2Str); + EXPECT_TRUE(!EC); + EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str); + EXPECT_TRUE(!EC); + EXPECT_STREQ(SrcStr, static_cast(Dst3Str).c_str()); +} + +TEST(CharSet, ShiftState2022) { + // Earth string. + StringRef Src(EarthUTF); + SmallString<8> Dst; + + ErrorOr ConvTo2022 = + CharSetConverter::create("UTF-8", "ISO-2022-JP"); + // Stop test if conversion is not supported (no underlying ICU or iconv support).
+ if (!ConvTo2022) { + ASSERT_EQ(ConvTo2022.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Check that the string is properly converted. + std::error_code EC = ConvTo2022->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(EarthISO2022, static_cast(Dst).c_str()); +} + +TEST(CharSet, ShiftState2022Partial) { + // Earth string followed by a truncated multibyte sequence. + StringRef Src(EarthUTFExtraPartial); + SmallString<8> Dst; + + ErrorOr ConvTo2022 = + CharSetConverter::create("UTF-8", "ISO-2022-JP"); + // Stop test if conversion is not supported (no underlying ICU or iconv support). + if (!ConvTo2022) { + ASSERT_EQ(ConvTo2022.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Check that the trailing incomplete sequence makes the conversion fail. + std::error_code EC = ConvTo2022->convert(Src, Dst); + EXPECT_TRUE(EC); +} + +TEST(CharSet, ShiftStateIBM939) { + // Earth string. + StringRef Src(EarthUTF); + SmallString<64> Dst; + + ErrorOr ConvToIBM939 = + CharSetConverter::create("UTF-8", "IBM-939"); + // Stop test if conversion is not supported (no underlying ICU or iconv support). + if (!ConvToIBM939) { + ASSERT_EQ(ConvToIBM939.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Check that the string is properly converted. + std::error_code EC = ConvToIBM939->convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(EarthIBM939, static_cast(Dst).c_str()); +} + +} // namespace diff --git a/llvm/unittests/Support/ConvertEBCDICTest.cpp b/llvm/unittests/Support/ConvertEBCDICTest.cpp index eec76879ac92c..557f29c391f9c 100644 --- a/llvm/unittests/Support/ConvertEBCDICTest.cpp +++ b/llvm/unittests/Support/ConvertEBCDICTest.cpp @@ -41,7 +41,7 @@ static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72" // String with Cyrillic character ya. static const char CyrillicUTF[] = "\xd0\xaf"; -TEST(CharSet, FromUTF8) { +TEST(ConverterEBCDIC, convertToEBCDIC) { // Hello string.
StringRef Src(HelloA); SmallString<64> Dst; @@ -72,7 +72,7 @@ TEST(CharSet, FromUTF8) { Dst.clear(); } -TEST(CharSet, ToUTF8) { +TEST(ConverterEBCDIC, convertFromEBCDIC) { // Hello string. StringRef Src(HelloE); SmallString<64> Dst;